1 //===- InstCombineCalls.cpp -----------------------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements the visitCall and visitInvoke functions. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "InstCombineInternal.h" 15 #include "llvm/ADT/APFloat.h" 16 #include "llvm/ADT/APInt.h" 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/None.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/Statistic.h" 22 #include "llvm/ADT/Twine.h" 23 #include "llvm/Analysis/InstructionSimplify.h" 24 #include "llvm/Analysis/MemoryBuiltins.h" 25 #include "llvm/Analysis/ValueTracking.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/CallSite.h" 28 #include "llvm/IR/Constant.h" 29 #include "llvm/IR/DataLayout.h" 30 #include "llvm/IR/DerivedTypes.h" 31 #include "llvm/IR/Function.h" 32 #include "llvm/IR/GlobalVariable.h" 33 #include "llvm/IR/InstrTypes.h" 34 #include "llvm/IR/Instruction.h" 35 #include "llvm/IR/Instructions.h" 36 #include "llvm/IR/IntrinsicInst.h" 37 #include "llvm/IR/Intrinsics.h" 38 #include "llvm/IR/LLVMContext.h" 39 #include "llvm/IR/Metadata.h" 40 #include "llvm/IR/PatternMatch.h" 41 #include "llvm/IR/Statepoint.h" 42 #include "llvm/IR/Type.h" 43 #include "llvm/IR/Value.h" 44 #include "llvm/IR/ValueHandle.h" 45 #include "llvm/Support/Casting.h" 46 #include "llvm/Support/Debug.h" 47 #include "llvm/Support/KnownBits.h" 48 #include "llvm/Support/MathExtras.h" 49 #include "llvm/Transforms/Utils/Local.h" 50 #include "llvm/Transforms/Utils/SimplifyLibCalls.h" 51 #include <algorithm> 52 #include <cassert> 53 #include <cstdint> 54 #include <cstring> 55 #include <vector> 56 57 using namespace llvm; 58 
using namespace PatternMatch; 59 60 #define DEBUG_TYPE "instcombine" 61 62 STATISTIC(NumSimplified, "Number of library calls simplified"); 63 64 static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements( 65 "unfold-element-atomic-memcpy-max-elements", 66 cl::init(16), 67 cl::desc("Maximum number of elements in atomic memcpy the optimizer is " 68 "allowed to unfold")); 69 70 /// Return the specified type promoted as it would be to pass though a va_arg 71 /// area. 72 static Type *getPromotedType(Type *Ty) { 73 if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) { 74 if (ITy->getBitWidth() < 32) 75 return Type::getInt32Ty(Ty->getContext()); 76 } 77 return Ty; 78 } 79 80 /// Return a constant boolean vector that has true elements in all positions 81 /// where the input constant data vector has an element with the sign bit set. 82 static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) { 83 SmallVector<Constant *, 32> BoolVec; 84 IntegerType *BoolTy = Type::getInt1Ty(V->getContext()); 85 for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) { 86 Constant *Elt = V->getElementAsConstant(I); 87 assert((isa<ConstantInt>(Elt) || isa<ConstantFP>(Elt)) && 88 "Unexpected constant data vector element type"); 89 bool Sign = V->getElementType()->isIntegerTy() 90 ? cast<ConstantInt>(Elt)->isNegative() 91 : cast<ConstantFP>(Elt)->isNegative(); 92 BoolVec.push_back(ConstantInt::get(BoolTy, Sign)); 93 } 94 return ConstantVector::get(BoolVec); 95 } 96 97 Instruction *InstCombiner::SimplifyElementUnorderedAtomicMemCpy( 98 ElementUnorderedAtomicMemCpyInst *AMI) { 99 // Try to unfold this intrinsic into sequence of explicit atomic loads and 100 // stores. 101 // First check that number of elements is compile time constant. 102 auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength()); 103 if (!LengthCI) 104 return nullptr; 105 106 // Check that there are not too many elements. 
107 uint64_t LengthInBytes = LengthCI->getZExtValue(); 108 uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes(); 109 uint64_t NumElements = LengthInBytes / ElementSizeInBytes; 110 if (NumElements >= UnfoldElementAtomicMemcpyMaxElements) 111 return nullptr; 112 113 // Only expand if there are elements to copy. 114 if (NumElements > 0) { 115 // Don't unfold into illegal integers 116 uint64_t ElementSizeInBits = ElementSizeInBytes * 8; 117 if (!getDataLayout().isLegalInteger(ElementSizeInBits)) 118 return nullptr; 119 120 // Cast source and destination to the correct type. Intrinsic input 121 // arguments are usually represented as i8*. Often operands will be 122 // explicitly casted to i8* and we can just strip those casts instead of 123 // inserting new ones. However it's easier to rely on other InstCombine 124 // rules which will cover trivial cases anyway. 125 Value *Src = AMI->getRawSource(); 126 Value *Dst = AMI->getRawDest(); 127 Type *ElementPointerType = 128 Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits, 129 Src->getType()->getPointerAddressSpace()); 130 131 Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType, 132 "memcpy_unfold.src_casted"); 133 Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType, 134 "memcpy_unfold.dst_casted"); 135 136 for (uint64_t i = 0; i < NumElements; ++i) { 137 // Get current element addresses 138 ConstantInt *ElementIdxCI = 139 ConstantInt::get(AMI->getContext(), APInt(64, i)); 140 Value *SrcElementAddr = 141 Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr"); 142 Value *DstElementAddr = 143 Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr"); 144 145 // Load from the source. Transfer alignment information and mark load as 146 // unordered atomic. 147 LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val"); 148 Load->setOrdering(AtomicOrdering::Unordered); 149 // We know alignment of the first element. 
It is also guaranteed by the 150 // verifier that element size is less or equal than first element 151 // alignment and both of this values are powers of two. This means that 152 // all subsequent accesses are at least element size aligned. 153 // TODO: We can infer better alignment but there is no evidence that this 154 // will matter. 155 Load->setAlignment(i == 0 ? AMI->getParamAlignment(1) 156 : ElementSizeInBytes); 157 Load->setDebugLoc(AMI->getDebugLoc()); 158 159 // Store loaded value via unordered atomic store. 160 StoreInst *Store = Builder->CreateStore(Load, DstElementAddr); 161 Store->setOrdering(AtomicOrdering::Unordered); 162 Store->setAlignment(i == 0 ? AMI->getParamAlignment(0) 163 : ElementSizeInBytes); 164 Store->setDebugLoc(AMI->getDebugLoc()); 165 } 166 } 167 168 // Set the number of elements of the copy to 0, it will be deleted on the 169 // next iteration. 170 AMI->setLength(Constant::getNullValue(LengthCI->getType())); 171 return AMI; 172 } 173 174 Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { 175 unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT); 176 unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT); 177 unsigned MinAlign = std::min(DstAlign, SrcAlign); 178 unsigned CopyAlign = MI->getAlignment(); 179 180 if (CopyAlign < MinAlign) { 181 MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), MinAlign, false)); 182 return MI; 183 } 184 185 // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with 186 // load/store. 187 ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2)); 188 if (!MemOpLength) return nullptr; 189 190 // Source and destination pointer types are always "i8*" for intrinsic. See 191 // if the size is something we can handle with a single primitive load/store. 192 // A single load+store correctly handles overlapping memory in the memmove 193 // case. 
194 uint64_t Size = MemOpLength->getLimitedValue(); 195 assert(Size && "0-sized memory transferring should be removed already."); 196 197 if (Size > 8 || (Size&(Size-1))) 198 return nullptr; // If not 1/2/4/8 bytes, exit. 199 200 // Use an integer load+store unless we can find something better. 201 unsigned SrcAddrSp = 202 cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace(); 203 unsigned DstAddrSp = 204 cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace(); 205 206 IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3); 207 Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp); 208 Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp); 209 210 // If the memcpy has metadata describing the members, see if we can get the 211 // TBAA tag describing our copy. 212 MDNode *CopyMD = nullptr; 213 if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) { 214 if (M->getNumOperands() == 3 && M->getOperand(0) && 215 mdconst::hasa<ConstantInt>(M->getOperand(0)) && 216 mdconst::extract<ConstantInt>(M->getOperand(0))->isNullValue() && 217 M->getOperand(1) && 218 mdconst::hasa<ConstantInt>(M->getOperand(1)) && 219 mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() == 220 Size && 221 M->getOperand(2) && isa<MDNode>(M->getOperand(2))) 222 CopyMD = cast<MDNode>(M->getOperand(2)); 223 } 224 225 // If the memcpy/memmove provides better alignment info than we can 226 // infer, use it. 
227 SrcAlign = std::max(SrcAlign, CopyAlign); 228 DstAlign = std::max(DstAlign, CopyAlign); 229 230 Value *Src = Builder->CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy); 231 Value *Dest = Builder->CreateBitCast(MI->getArgOperand(0), NewDstPtrTy); 232 LoadInst *L = Builder->CreateLoad(Src, MI->isVolatile()); 233 L->setAlignment(SrcAlign); 234 if (CopyMD) 235 L->setMetadata(LLVMContext::MD_tbaa, CopyMD); 236 MDNode *LoopMemParallelMD = 237 MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access); 238 if (LoopMemParallelMD) 239 L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); 240 241 StoreInst *S = Builder->CreateStore(L, Dest, MI->isVolatile()); 242 S->setAlignment(DstAlign); 243 if (CopyMD) 244 S->setMetadata(LLVMContext::MD_tbaa, CopyMD); 245 if (LoopMemParallelMD) 246 S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); 247 248 // Set the size of the copy to 0, it will be deleted on the next iteration. 249 MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType())); 250 return MI; 251 } 252 253 Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { 254 unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); 255 if (MI->getAlignment() < Alignment) { 256 MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), 257 Alignment, false)); 258 return MI; 259 } 260 261 // Extract the length and alignment and fill if they are constant. 262 ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength()); 263 ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue()); 264 if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8)) 265 return nullptr; 266 uint64_t Len = LenC->getLimitedValue(); 267 Alignment = MI->getAlignment(); 268 assert(Len && "0-sized memory setting should be removed already."); 269 270 // memset(s,c,n) -> store s, c (for n=1,2,4,8) 271 if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) { 272 Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8. 
273 274 Value *Dest = MI->getDest(); 275 unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace(); 276 Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp); 277 Dest = Builder->CreateBitCast(Dest, NewDstPtrTy); 278 279 // Alignment 0 is identity for alignment 1 for memset, but not store. 280 if (Alignment == 0) Alignment = 1; 281 282 // Extract the fill value and store. 283 uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL; 284 StoreInst *S = Builder->CreateStore(ConstantInt::get(ITy, Fill), Dest, 285 MI->isVolatile()); 286 S->setAlignment(Alignment); 287 288 // Set the size of the copy to 0, it will be deleted on the next iteration. 289 MI->setLength(Constant::getNullValue(LenC->getType())); 290 return MI; 291 } 292 293 return nullptr; 294 } 295 296 static Value *simplifyX86immShift(const IntrinsicInst &II, 297 InstCombiner::BuilderTy &Builder) { 298 bool LogicalShift = false; 299 bool ShiftLeft = false; 300 301 switch (II.getIntrinsicID()) { 302 default: llvm_unreachable("Unexpected intrinsic!"); 303 case Intrinsic::x86_sse2_psra_d: 304 case Intrinsic::x86_sse2_psra_w: 305 case Intrinsic::x86_sse2_psrai_d: 306 case Intrinsic::x86_sse2_psrai_w: 307 case Intrinsic::x86_avx2_psra_d: 308 case Intrinsic::x86_avx2_psra_w: 309 case Intrinsic::x86_avx2_psrai_d: 310 case Intrinsic::x86_avx2_psrai_w: 311 case Intrinsic::x86_avx512_psra_q_128: 312 case Intrinsic::x86_avx512_psrai_q_128: 313 case Intrinsic::x86_avx512_psra_q_256: 314 case Intrinsic::x86_avx512_psrai_q_256: 315 case Intrinsic::x86_avx512_psra_d_512: 316 case Intrinsic::x86_avx512_psra_q_512: 317 case Intrinsic::x86_avx512_psra_w_512: 318 case Intrinsic::x86_avx512_psrai_d_512: 319 case Intrinsic::x86_avx512_psrai_q_512: 320 case Intrinsic::x86_avx512_psrai_w_512: 321 LogicalShift = false; ShiftLeft = false; 322 break; 323 case Intrinsic::x86_sse2_psrl_d: 324 case Intrinsic::x86_sse2_psrl_q: 325 case Intrinsic::x86_sse2_psrl_w: 326 case Intrinsic::x86_sse2_psrli_d: 327 case 
Intrinsic::x86_sse2_psrli_q: 328 case Intrinsic::x86_sse2_psrli_w: 329 case Intrinsic::x86_avx2_psrl_d: 330 case Intrinsic::x86_avx2_psrl_q: 331 case Intrinsic::x86_avx2_psrl_w: 332 case Intrinsic::x86_avx2_psrli_d: 333 case Intrinsic::x86_avx2_psrli_q: 334 case Intrinsic::x86_avx2_psrli_w: 335 case Intrinsic::x86_avx512_psrl_d_512: 336 case Intrinsic::x86_avx512_psrl_q_512: 337 case Intrinsic::x86_avx512_psrl_w_512: 338 case Intrinsic::x86_avx512_psrli_d_512: 339 case Intrinsic::x86_avx512_psrli_q_512: 340 case Intrinsic::x86_avx512_psrli_w_512: 341 LogicalShift = true; ShiftLeft = false; 342 break; 343 case Intrinsic::x86_sse2_psll_d: 344 case Intrinsic::x86_sse2_psll_q: 345 case Intrinsic::x86_sse2_psll_w: 346 case Intrinsic::x86_sse2_pslli_d: 347 case Intrinsic::x86_sse2_pslli_q: 348 case Intrinsic::x86_sse2_pslli_w: 349 case Intrinsic::x86_avx2_psll_d: 350 case Intrinsic::x86_avx2_psll_q: 351 case Intrinsic::x86_avx2_psll_w: 352 case Intrinsic::x86_avx2_pslli_d: 353 case Intrinsic::x86_avx2_pslli_q: 354 case Intrinsic::x86_avx2_pslli_w: 355 case Intrinsic::x86_avx512_psll_d_512: 356 case Intrinsic::x86_avx512_psll_q_512: 357 case Intrinsic::x86_avx512_psll_w_512: 358 case Intrinsic::x86_avx512_pslli_d_512: 359 case Intrinsic::x86_avx512_pslli_q_512: 360 case Intrinsic::x86_avx512_pslli_w_512: 361 LogicalShift = true; ShiftLeft = true; 362 break; 363 } 364 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 365 366 // Simplify if count is constant. 367 auto Arg1 = II.getArgOperand(1); 368 auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1); 369 auto CDV = dyn_cast<ConstantDataVector>(Arg1); 370 auto CInt = dyn_cast<ConstantInt>(Arg1); 371 if (!CAZ && !CDV && !CInt) 372 return nullptr; 373 374 APInt Count(64, 0); 375 if (CDV) { 376 // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector 377 // operand to compute the shift amount. 
378 auto VT = cast<VectorType>(CDV->getType()); 379 unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits(); 380 assert((64 % BitWidth) == 0 && "Unexpected packed shift size"); 381 unsigned NumSubElts = 64 / BitWidth; 382 383 // Concatenate the sub-elements to create the 64-bit value. 384 for (unsigned i = 0; i != NumSubElts; ++i) { 385 unsigned SubEltIdx = (NumSubElts - 1) - i; 386 auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); 387 Count <<= BitWidth; 388 Count |= SubElt->getValue().zextOrTrunc(64); 389 } 390 } 391 else if (CInt) 392 Count = CInt->getValue(); 393 394 auto Vec = II.getArgOperand(0); 395 auto VT = cast<VectorType>(Vec->getType()); 396 auto SVT = VT->getElementType(); 397 unsigned VWidth = VT->getNumElements(); 398 unsigned BitWidth = SVT->getPrimitiveSizeInBits(); 399 400 // If shift-by-zero then just return the original value. 401 if (Count.isNullValue()) 402 return Vec; 403 404 // Handle cases when Shift >= BitWidth. 405 if (Count.uge(BitWidth)) { 406 // If LogicalShift - just return zero. 407 if (LogicalShift) 408 return ConstantAggregateZero::get(VT); 409 410 // If ArithmeticShift - clamp Shift to (BitWidth - 1). 411 Count = APInt(64, BitWidth - 1); 412 } 413 414 // Get a constant vector of the same type as the first operand. 415 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); 416 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); 417 418 if (ShiftLeft) 419 return Builder.CreateShl(Vec, ShiftVec); 420 421 if (LogicalShift) 422 return Builder.CreateLShr(Vec, ShiftVec); 423 424 return Builder.CreateAShr(Vec, ShiftVec); 425 } 426 427 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. 428 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out 429 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit). 
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  // Classify the intrinsic: arithmetic right, logical right or logical left.
  switch (II.getIntrinsicID()) {
  default: llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
  if (!CShift)
    return nullptr;

  auto Vec = II.getArgOperand(0);
  auto VT = cast<VectorType>(II.getType());
  auto SVT = VT->getVectorElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (CElt && isa<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    // Anything that is neither undef nor a ConstantInt defeats the fold.
    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      // Only logical shifts record the out-of-range condition; arithmetic
      // shifts have already been clamped to an in-range amount below.
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

/// Attempt to simplify the PMULDQ/PMULUDQ intrinsics.
///
/// Returns a zero vector if either operand is undef, nullptr unless both
/// operands are constants, and otherwise a generic multiply of the
/// sign/zero-extended even-indexed elements of both operands.
static Value *simplifyX86muldq(const IntrinsicInst &II,
                               InstCombiner::BuilderTy &Builder) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();
  assert(Arg0->getType()->getScalarSizeInBits() == 32 &&
         Arg1->getType()->getScalarSizeInBits() == 32 &&
         ResTy->getScalarSizeInBits() == 64 && "Unexpected muldq/muludq types");

  // muldq/muludq(undef, undef) -> zero (matches generic mul behavior)
  // Note that the check fires when *either* operand is undef.
  if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
    return ConstantAggregateZero::get(ResTy);

  // Constant folding.
  // PMULDQ  = (mul(vXi64 sext(shuffle<0,2,..>(Arg0)),
  //                vXi64 sext(shuffle<0,2,..>(Arg1))))
  // PMULUDQ = (mul(vXi64 zext(shuffle<0,2,..>(Arg0)),
  //                vXi64 zext(shuffle<0,2,..>(Arg1))))
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  unsigned NumElts = ResTy->getVectorNumElements();
  assert(Arg0->getType()->getVectorNumElements() == (2 * NumElts) &&
         Arg1->getType()->getVectorNumElements() == (2 * NumElts) &&
         "Unexpected muldq/muludq types");

  // The three signed variants are listed explicitly; every other intrinsic
  // routed here is treated as unsigned.
  unsigned IntrinsicID = II.getIntrinsicID();
  bool IsSigned = (Intrinsic::x86_sse41_pmuldq == IntrinsicID ||
                   Intrinsic::x86_avx2_pmul_dq == IntrinsicID ||
                   Intrinsic::x86_avx512_pmul_dq_512 == IntrinsicID);

  // Select elements 0, 2, 4, ... of each source vector.
  SmallVector<unsigned, 16> ShuffleMask;
  for (unsigned i = 0; i != NumElts; ++i)
    ShuffleMask.push_back(i * 2);

  auto *LHS = Builder.CreateShuffleVector(Arg0, Arg0, ShuffleMask);
  auto *RHS = Builder.CreateShuffleVector(Arg1, Arg1, ShuffleMask);

  if (IsSigned) {
    LHS = Builder.CreateSExt(LHS, ResTy);
    RHS = Builder.CreateSExt(RHS, ResTy);
  } else {
    LHS = Builder.CreateZExt(LHS, ResTy);
    RHS = Builder.CreateZExt(RHS, ResTy);
  }

  return Builder.CreateMul(LHS, RHS);
}

/// Attempt to constant fold the PACKSS/PACKUS intrinsics.
///
/// \p IsSigned selects signed (PACKSS) or unsigned (PACKUS) saturation.
/// Returns undef if both operands are undef, nullptr unless both operands are
/// constants whose elements are all ConstantInt or undef, and otherwise the
/// folded constant vector. \p IC and \p Builder are not used by this body.
static Value *simplifyX86pack(IntrinsicInst &II, InstCombiner &IC,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  Type *ArgTy = Arg0->getType();
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumDstElts = ResTy->getVectorNumElements();
  unsigned NumSrcElts = ArgTy->getVectorNumElements();
  assert(NumDstElts == (2 * NumSrcElts) && "Unexpected packing types");

  unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  assert(ArgTy->getScalarSizeInBits() == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  auto *Cst0 = dyn_cast<Constant>(Arg0);
  auto *Cst1 = dyn_cast<Constant>(Arg1);
  if (!Cst0 || !Cst1)
    return nullptr;

  SmallVector<Constant *, 32> Vals;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
      // Within each 128-bit lane the first half of the result elements comes
      // from Arg0 and the second half from Arg1.
      unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
      auto *Cst = (Elt >= NumSrcEltsPerLane) ? Cst1 : Cst0;
      auto *COp = Cst->getAggregateElement(SrcIdx);
      if (COp && isa<UndefValue>(COp)) {
        Vals.push_back(UndefValue::get(ResTy->getScalarType()));
        continue;
      }

      auto *CInt = dyn_cast_or_null<ConstantInt>(COp);
      if (!CInt)
        return nullptr;

      APInt Val = CInt->getValue();
      assert(Val.getBitWidth() == ArgTy->getScalarSizeInBits() &&
             "Unexpected constant bitwidth");

      if (IsSigned) {
        // PACKSS: Truncate signed value with signed saturation.
        // Source values less than dst minint are saturated to minint.
        // Source values greater than dst maxint are saturated to maxint.
        if (Val.isSignedIntN(DstScalarSizeInBits))
          Val = Val.trunc(DstScalarSizeInBits);
        else if (Val.isNegative())
          Val = APInt::getSignedMinValue(DstScalarSizeInBits);
        else
          Val = APInt::getSignedMaxValue(DstScalarSizeInBits);
      } else {
        // PACKUS: Truncate signed value with unsigned saturation.
        // Source values less than zero are saturated to zero.
        // Source values greater than dst maxuint are saturated to maxuint.
        if (Val.isIntN(DstScalarSizeInBits))
          Val = Val.trunc(DstScalarSizeInBits);
        else if (Val.isNegative())
          Val = APInt::getNullValue(DstScalarSizeInBits);
        else
          Val = APInt::getAllOnesValue(DstScalarSizeInBits);
      }

      Vals.push_back(ConstantInt::get(ResTy->getScalarType(), Val));
    }
  }

  return ConstantVector::get(Vals);
}

/// Attempt to constant fold the MOVMSK intrinsics.
///
/// Returns zero for an undef operand, nullptr for a non-vector or
/// non-constant operand (or one with an element that is neither undef,
/// ConstantInt nor ConstantFP), and otherwise an integer whose bit I is set
/// when element I is negative. \p Builder is not used by this body.
static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();
  Type *ArgTy = Arg->getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  // We can't easily peek through x86_mmx types.
  if (!ArgTy->isVectorTy())
    return nullptr;

  auto *C = dyn_cast<Constant>(Arg);
  if (!C)
    return nullptr;

  // Extract signbits of the vector input and pack into integer result.
  APInt Result(ResTy->getPrimitiveSizeInBits(), 0);
  for (unsigned I = 0, E = ArgTy->getVectorNumElements(); I != E; ++I) {
    auto *COp = C->getAggregateElement(I);
    if (!COp)
      return nullptr;
    // An undef element leaves its result bit cleared.
    if (isa<UndefValue>(COp))
      continue;

    auto *CInt = dyn_cast<ConstantInt>(COp);
    auto *CFp = dyn_cast<ConstantFP>(COp);
    if (!CInt && !CFp)
      return nullptr;

    if ((CInt && CInt->isNegative()) || (CFp && CFp->isNegative()))
      Result.setBit(I);
  }

  return Constant::getIntegerValue(ResTy, Result);
}

/// Attempt to simplify the INSERTPS intrinsic with a constant control byte to
/// a zero vector or a single shufflevector.
///
/// Returns nullptr if the control byte is not constant, or if a non-zero zero
/// mask is combined with two distinct inputs without overriding the
/// destination lane.
static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  VectorType *VecTy = cast<VectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //    [3:0] - zero mask for each 32-bit lane
  //    [5:4] - select one 32-bit destination lane
  //    [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  uint32_t ShuffleMask[4] = { 0, 1, 2, 3 };

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      // Indices 4..7 select from V1, which is the zero vector here.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
///
/// \p CILength and \p CIIndex may be null when the length/index are not
/// constant. Returns the replacement value or nullptr if nothing applies.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  // Build the <2 x i64> constant { Val, undef }.
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  ConstantInt *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Index comes from a 6-bit field (at most 63) and Length is at most 64,
    // so their sum in an 'unsigned' can never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are extracting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
      VectorType *ShufTy = VectorType::get(IntTy8, 16);

      // Bytes [0, Length) come from Op0 starting at byte Index, bytes
      // [Length, 8) come from the zero vector (indices >= 16), and the upper
      // eight bytes are undef.
      SmallVector<Constant *, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(
            Constant::getIntegerValue(IntTy32, APInt(32, i + Index)));
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(
            Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(UndefValue::get(IntTy32));

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask));
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->equalsInt(0))
    return LowConstantHighUndef(0);

  return nullptr;
}

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Index comes from a 6-bit field (at most 63) and Length is at most 64,
  // so their sum in an 'unsigned' can never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Type *IntTy32 = Type::getInt32Ty(II.getContext());
    VectorType *ShufTy = VectorType::get(IntTy8, 16);

    SmallVector<Constant *, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(
          Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(UndefValue::get(IntTy32));

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ConstantVector::get(ShuffleMask));
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  Constant *C1 = dyn_cast<Constant>(Op1);
  ConstantInt *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  ConstantInt *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
951 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { 952 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 953 Constant *CILength = ConstantInt::get(IntTy8, Length, false); 954 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); 955 956 Value *Args[] = {Op0, Op1, CILength, CIIndex}; 957 Module *M = II.getModule(); 958 Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); 959 return Builder.CreateCall(F, Args); 960 } 961 962 return nullptr; 963 } 964 965 /// Attempt to convert pshufb* to shufflevector if the mask is constant. 966 static Value *simplifyX86pshufb(const IntrinsicInst &II, 967 InstCombiner::BuilderTy &Builder) { 968 Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); 969 if (!V) 970 return nullptr; 971 972 auto *VecTy = cast<VectorType>(II.getType()); 973 auto *MaskEltTy = Type::getInt32Ty(II.getContext()); 974 unsigned NumElts = VecTy->getNumElements(); 975 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && 976 "Unexpected number of elements in shuffle mask!"); 977 978 // Construct a shuffle mask from constant integers or UNDEFs. 979 Constant *Indexes[64] = {nullptr}; 980 981 // Each byte in the shuffle control mask forms an index to permute the 982 // corresponding byte in the destination operand. 983 for (unsigned I = 0; I < NumElts; ++I) { 984 Constant *COp = V->getAggregateElement(I); 985 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 986 return nullptr; 987 988 if (isa<UndefValue>(COp)) { 989 Indexes[I] = UndefValue::get(MaskEltTy); 990 continue; 991 } 992 993 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue(); 994 995 // If the most significant bit (bit[7]) of each byte of the shuffle 996 // control mask is set, then zero is written in the result byte. 997 // The zero vector is in the right-hand side of the resulting 998 // shufflevector. 
999 1000 // The value of each index for the high 128-bit lane is the least 1001 // significant 4 bits of the respective shuffle control byte. 1002 Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0); 1003 Indexes[I] = ConstantInt::get(MaskEltTy, Index); 1004 } 1005 1006 auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts)); 1007 auto V1 = II.getArgOperand(0); 1008 auto V2 = Constant::getNullValue(VecTy); 1009 return Builder.CreateShuffleVector(V1, V2, ShuffleMask); 1010 } 1011 1012 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant. 1013 static Value *simplifyX86vpermilvar(const IntrinsicInst &II, 1014 InstCombiner::BuilderTy &Builder) { 1015 Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); 1016 if (!V) 1017 return nullptr; 1018 1019 auto *VecTy = cast<VectorType>(II.getType()); 1020 auto *MaskEltTy = Type::getInt32Ty(II.getContext()); 1021 unsigned NumElts = VecTy->getVectorNumElements(); 1022 bool IsPD = VecTy->getScalarType()->isDoubleTy(); 1023 unsigned NumLaneElts = IsPD ? 2 : 4; 1024 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); 1025 1026 // Construct a shuffle mask from constant integers or UNDEFs. 1027 Constant *Indexes[16] = {nullptr}; 1028 1029 // The intrinsics only read one or two bits, clear the rest. 1030 for (unsigned I = 0; I < NumElts; ++I) { 1031 Constant *COp = V->getAggregateElement(I); 1032 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 1033 return nullptr; 1034 1035 if (isa<UndefValue>(COp)) { 1036 Indexes[I] = UndefValue::get(MaskEltTy); 1037 continue; 1038 } 1039 1040 APInt Index = cast<ConstantInt>(COp)->getValue(); 1041 Index = Index.zextOrTrunc(32).getLoBits(2); 1042 1043 // The PD variants uses bit 1 to select per-lane element index, so 1044 // shift down to convert to generic shuffle mask index. 
1045 if (IsPD) 1046 Index.lshrInPlace(1); 1047 1048 // The _256 variants are a bit trickier since the mask bits always index 1049 // into the corresponding 128 half. In order to convert to a generic 1050 // shuffle, we have to make that explicit. 1051 Index += APInt(32, (I / NumLaneElts) * NumLaneElts); 1052 1053 Indexes[I] = ConstantInt::get(MaskEltTy, Index); 1054 } 1055 1056 auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts)); 1057 auto V1 = II.getArgOperand(0); 1058 auto V2 = UndefValue::get(V1->getType()); 1059 return Builder.CreateShuffleVector(V1, V2, ShuffleMask); 1060 } 1061 1062 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. 1063 static Value *simplifyX86vpermv(const IntrinsicInst &II, 1064 InstCombiner::BuilderTy &Builder) { 1065 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 1066 if (!V) 1067 return nullptr; 1068 1069 auto *VecTy = cast<VectorType>(II.getType()); 1070 auto *MaskEltTy = Type::getInt32Ty(II.getContext()); 1071 unsigned Size = VecTy->getNumElements(); 1072 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && 1073 "Unexpected shuffle mask size"); 1074 1075 // Construct a shuffle mask from constant integers or UNDEFs. 
1076 Constant *Indexes[64] = {nullptr}; 1077 1078 for (unsigned I = 0; I < Size; ++I) { 1079 Constant *COp = V->getAggregateElement(I); 1080 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 1081 return nullptr; 1082 1083 if (isa<UndefValue>(COp)) { 1084 Indexes[I] = UndefValue::get(MaskEltTy); 1085 continue; 1086 } 1087 1088 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); 1089 Index &= Size - 1; 1090 Indexes[I] = ConstantInt::get(MaskEltTy, Index); 1091 } 1092 1093 auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, Size)); 1094 auto V1 = II.getArgOperand(0); 1095 auto V2 = UndefValue::get(VecTy); 1096 return Builder.CreateShuffleVector(V1, V2, ShuffleMask); 1097 } 1098 1099 /// The shuffle mask for a perm2*128 selects any two halves of two 256-bit 1100 /// source vectors, unless a zero bit is set. If a zero bit is set, 1101 /// then ignore that half of the mask and clear that half of the vector. 1102 static Value *simplifyX86vperm2(const IntrinsicInst &II, 1103 InstCombiner::BuilderTy &Builder) { 1104 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); 1105 if (!CInt) 1106 return nullptr; 1107 1108 VectorType *VecTy = cast<VectorType>(II.getType()); 1109 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); 1110 1111 // The immediate permute control byte looks like this: 1112 // [1:0] - select 128 bits from sources for low half of destination 1113 // [2] - ignore 1114 // [3] - zero low half of destination 1115 // [5:4] - select 128 bits from sources for high half of destination 1116 // [6] - ignore 1117 // [7] - zero high half of destination 1118 1119 uint8_t Imm = CInt->getZExtValue(); 1120 1121 bool LowHalfZero = Imm & 0x08; 1122 bool HighHalfZero = Imm & 0x80; 1123 1124 // If both zero mask bits are set, this was just a weird way to 1125 // generate a zero vector. 1126 if (LowHalfZero && HighHalfZero) 1127 return ZeroVector; 1128 1129 // If 0 or 1 zero mask bits are set, this is a simple shuffle. 
1130 unsigned NumElts = VecTy->getNumElements(); 1131 unsigned HalfSize = NumElts / 2; 1132 SmallVector<uint32_t, 8> ShuffleMask(NumElts); 1133 1134 // The high bit of the selection field chooses the 1st or 2nd operand. 1135 bool LowInputSelect = Imm & 0x02; 1136 bool HighInputSelect = Imm & 0x20; 1137 1138 // The low bit of the selection field chooses the low or high half 1139 // of the selected operand. 1140 bool LowHalfSelect = Imm & 0x01; 1141 bool HighHalfSelect = Imm & 0x10; 1142 1143 // Determine which operand(s) are actually in use for this instruction. 1144 Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); 1145 Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); 1146 1147 // If needed, replace operands based on zero mask. 1148 V0 = LowHalfZero ? ZeroVector : V0; 1149 V1 = HighHalfZero ? ZeroVector : V1; 1150 1151 // Permute low half of result. 1152 unsigned StartIndex = LowHalfSelect ? HalfSize : 0; 1153 for (unsigned i = 0; i < HalfSize; ++i) 1154 ShuffleMask[i] = StartIndex + i; 1155 1156 // Permute high half of result. 1157 StartIndex = HighHalfSelect ? HalfSize : 0; 1158 StartIndex += NumElts; 1159 for (unsigned i = 0; i < HalfSize; ++i) 1160 ShuffleMask[i + HalfSize] = StartIndex + i; 1161 1162 return Builder.CreateShuffleVector(V0, V1, ShuffleMask); 1163 } 1164 1165 /// Decode XOP integer vector comparison intrinsics. 1166 static Value *simplifyX86vpcom(const IntrinsicInst &II, 1167 InstCombiner::BuilderTy &Builder, 1168 bool IsSigned) { 1169 if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 1170 uint64_t Imm = CInt->getZExtValue() & 0x7; 1171 VectorType *VecTy = cast<VectorType>(II.getType()); 1172 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; 1173 1174 switch (Imm) { 1175 case 0x0: 1176 Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; 1177 break; 1178 case 0x1: 1179 Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; 1180 break; 1181 case 0x2: 1182 Pred = IsSigned ? 
ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; 1183 break; 1184 case 0x3: 1185 Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; 1186 break; 1187 case 0x4: 1188 Pred = ICmpInst::ICMP_EQ; break; 1189 case 0x5: 1190 Pred = ICmpInst::ICMP_NE; break; 1191 case 0x6: 1192 return ConstantInt::getSigned(VecTy, 0); // FALSE 1193 case 0x7: 1194 return ConstantInt::getSigned(VecTy, -1); // TRUE 1195 } 1196 1197 if (Value *Cmp = Builder.CreateICmp(Pred, II.getArgOperand(0), 1198 II.getArgOperand(1))) 1199 return Builder.CreateSExtOrTrunc(Cmp, VecTy); 1200 } 1201 return nullptr; 1202 } 1203 1204 // Emit a select instruction and appropriate bitcasts to help simplify 1205 // masked intrinsics. 1206 static Value *emitX86MaskSelect(Value *Mask, Value *Op0, Value *Op1, 1207 InstCombiner::BuilderTy &Builder) { 1208 unsigned VWidth = Op0->getType()->getVectorNumElements(); 1209 1210 // If the mask is all ones we don't need the select. But we need to check 1211 // only the bit thats will be used in case VWidth is less than 8. 1212 if (auto *C = dyn_cast<ConstantInt>(Mask)) 1213 if (C->getValue().zextOrTrunc(VWidth).isAllOnesValue()) 1214 return Op0; 1215 1216 auto *MaskTy = VectorType::get(Builder.getInt1Ty(), 1217 cast<IntegerType>(Mask->getType())->getBitWidth()); 1218 Mask = Builder.CreateBitCast(Mask, MaskTy); 1219 1220 // If we have less than 8 elements, then the starting mask was an i8 and 1221 // we need to extract down to the right number of elements. 
1222 if (VWidth < 8) { 1223 uint32_t Indices[4]; 1224 for (unsigned i = 0; i != VWidth; ++i) 1225 Indices[i] = i; 1226 Mask = Builder.CreateShuffleVector(Mask, Mask, 1227 makeArrayRef(Indices, VWidth), 1228 "extract"); 1229 } 1230 1231 return Builder.CreateSelect(Mask, Op0, Op1); 1232 } 1233 1234 static Value *simplifyMinnumMaxnum(const IntrinsicInst &II) { 1235 Value *Arg0 = II.getArgOperand(0); 1236 Value *Arg1 = II.getArgOperand(1); 1237 1238 // fmin(x, x) -> x 1239 if (Arg0 == Arg1) 1240 return Arg0; 1241 1242 const auto *C1 = dyn_cast<ConstantFP>(Arg1); 1243 1244 // fmin(x, nan) -> x 1245 if (C1 && C1->isNaN()) 1246 return Arg0; 1247 1248 // This is the value because if undef were NaN, we would return the other 1249 // value and cannot return a NaN unless both operands are. 1250 // 1251 // fmin(undef, x) -> x 1252 if (isa<UndefValue>(Arg0)) 1253 return Arg1; 1254 1255 // fmin(x, undef) -> x 1256 if (isa<UndefValue>(Arg1)) 1257 return Arg0; 1258 1259 Value *X = nullptr; 1260 Value *Y = nullptr; 1261 if (II.getIntrinsicID() == Intrinsic::minnum) { 1262 // fmin(x, fmin(x, y)) -> fmin(x, y) 1263 // fmin(y, fmin(x, y)) -> fmin(x, y) 1264 if (match(Arg1, m_FMin(m_Value(X), m_Value(Y)))) { 1265 if (Arg0 == X || Arg0 == Y) 1266 return Arg1; 1267 } 1268 1269 // fmin(fmin(x, y), x) -> fmin(x, y) 1270 // fmin(fmin(x, y), y) -> fmin(x, y) 1271 if (match(Arg0, m_FMin(m_Value(X), m_Value(Y)))) { 1272 if (Arg1 == X || Arg1 == Y) 1273 return Arg0; 1274 } 1275 1276 // TODO: fmin(nnan x, inf) -> x 1277 // TODO: fmin(nnan ninf x, flt_max) -> x 1278 if (C1 && C1->isInfinity()) { 1279 // fmin(x, -inf) -> -inf 1280 if (C1->isNegative()) 1281 return Arg1; 1282 } 1283 } else { 1284 assert(II.getIntrinsicID() == Intrinsic::maxnum); 1285 // fmax(x, fmax(x, y)) -> fmax(x, y) 1286 // fmax(y, fmax(x, y)) -> fmax(x, y) 1287 if (match(Arg1, m_FMax(m_Value(X), m_Value(Y)))) { 1288 if (Arg0 == X || Arg0 == Y) 1289 return Arg1; 1290 } 1291 1292 // fmax(fmax(x, y), x) -> fmax(x, y) 1293 // 
fmax(fmax(x, y), y) -> fmax(x, y) 1294 if (match(Arg0, m_FMax(m_Value(X), m_Value(Y)))) { 1295 if (Arg1 == X || Arg1 == Y) 1296 return Arg0; 1297 } 1298 1299 // TODO: fmax(nnan x, -inf) -> x 1300 // TODO: fmax(nnan ninf x, -flt_max) -> x 1301 if (C1 && C1->isInfinity()) { 1302 // fmax(x, inf) -> inf 1303 if (!C1->isNegative()) 1304 return Arg1; 1305 } 1306 } 1307 return nullptr; 1308 } 1309 1310 static bool maskIsAllOneOrUndef(Value *Mask) { 1311 auto *ConstMask = dyn_cast<Constant>(Mask); 1312 if (!ConstMask) 1313 return false; 1314 if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask)) 1315 return true; 1316 for (unsigned I = 0, E = ConstMask->getType()->getVectorNumElements(); I != E; 1317 ++I) { 1318 if (auto *MaskElt = ConstMask->getAggregateElement(I)) 1319 if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt)) 1320 continue; 1321 return false; 1322 } 1323 return true; 1324 } 1325 1326 static Value *simplifyMaskedLoad(const IntrinsicInst &II, 1327 InstCombiner::BuilderTy &Builder) { 1328 // If the mask is all ones or undefs, this is a plain vector load of the 1st 1329 // argument. 1330 if (maskIsAllOneOrUndef(II.getArgOperand(2))) { 1331 Value *LoadPtr = II.getArgOperand(0); 1332 unsigned Alignment = cast<ConstantInt>(II.getArgOperand(1))->getZExtValue(); 1333 return Builder.CreateAlignedLoad(LoadPtr, Alignment, "unmaskedload"); 1334 } 1335 1336 return nullptr; 1337 } 1338 1339 static Instruction *simplifyMaskedStore(IntrinsicInst &II, InstCombiner &IC) { 1340 auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3)); 1341 if (!ConstMask) 1342 return nullptr; 1343 1344 // If the mask is all zeros, this instruction does nothing. 1345 if (ConstMask->isNullValue()) 1346 return IC.eraseInstFromFunction(II); 1347 1348 // If the mask is all ones, this is a plain vector store of the 1st argument. 
1349 if (ConstMask->isAllOnesValue()) { 1350 Value *StorePtr = II.getArgOperand(1); 1351 unsigned Alignment = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue(); 1352 return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment); 1353 } 1354 1355 return nullptr; 1356 } 1357 1358 static Instruction *simplifyMaskedGather(IntrinsicInst &II, InstCombiner &IC) { 1359 // If the mask is all zeros, return the "passthru" argument of the gather. 1360 auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2)); 1361 if (ConstMask && ConstMask->isNullValue()) 1362 return IC.replaceInstUsesWith(II, II.getArgOperand(3)); 1363 1364 return nullptr; 1365 } 1366 1367 static Instruction *simplifyMaskedScatter(IntrinsicInst &II, InstCombiner &IC) { 1368 // If the mask is all zeros, a scatter does nothing. 1369 auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3)); 1370 if (ConstMask && ConstMask->isNullValue()) 1371 return IC.eraseInstFromFunction(II); 1372 1373 return nullptr; 1374 } 1375 1376 static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { 1377 assert((II.getIntrinsicID() == Intrinsic::cttz || 1378 II.getIntrinsicID() == Intrinsic::ctlz) && 1379 "Expected cttz or ctlz intrinsic"); 1380 Value *Op0 = II.getArgOperand(0); 1381 1382 KnownBits Known = IC.computeKnownBits(Op0, 0, &II); 1383 1384 // Create a mask for bits above (ctlz) or below (cttz) the first known one. 1385 bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz; 1386 unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros() 1387 : Known.countMaxLeadingZeros(); 1388 unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros() 1389 : Known.countMinLeadingZeros(); 1390 1391 // If all bits above (ctlz) or below (cttz) the first known one are known 1392 // zero, this value is constant. 1393 // FIXME: This should be in InstSimplify because we're replacing an 1394 // instruction with a constant. 
1395 if (PossibleZeros == DefiniteZeros) { 1396 auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros); 1397 return IC.replaceInstUsesWith(II, C); 1398 } 1399 1400 // If the input to cttz/ctlz is known to be non-zero, 1401 // then change the 'ZeroIsUndef' parameter to 'true' 1402 // because we know the zero behavior can't affect the result. 1403 if (!Known.One.isNullValue() || 1404 isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II, 1405 &IC.getDominatorTree())) { 1406 if (!match(II.getArgOperand(1), m_One())) { 1407 II.setOperand(1, IC.Builder->getTrue()); 1408 return &II; 1409 } 1410 } 1411 1412 // Add range metadata since known bits can't completely reflect what we know. 1413 // TODO: Handle splat vectors. 1414 auto *IT = dyn_cast<IntegerType>(Op0->getType()); 1415 if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) { 1416 Metadata *LowAndHigh[] = { 1417 ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)), 1418 ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))}; 1419 II.setMetadata(LLVMContext::MD_range, 1420 MDNode::get(II.getContext(), LowAndHigh)); 1421 return &II; 1422 } 1423 1424 return nullptr; 1425 } 1426 1427 static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) { 1428 assert(II.getIntrinsicID() == Intrinsic::ctpop && 1429 "Expected ctpop intrinsic"); 1430 Value *Op0 = II.getArgOperand(0); 1431 // FIXME: Try to simplify vectors of integers. 1432 auto *IT = dyn_cast<IntegerType>(Op0->getType()); 1433 if (!IT) 1434 return nullptr; 1435 1436 unsigned BitWidth = IT->getBitWidth(); 1437 KnownBits Known(BitWidth); 1438 IC.computeKnownBits(Op0, Known, 0, &II); 1439 1440 unsigned MinCount = Known.countMinPopulation(); 1441 unsigned MaxCount = Known.countMaxPopulation(); 1442 1443 // Add range metadata since known bits can't completely reflect what we know. 
1444 if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) { 1445 Metadata *LowAndHigh[] = { 1446 ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)), 1447 ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))}; 1448 II.setMetadata(LLVMContext::MD_range, 1449 MDNode::get(II.getContext(), LowAndHigh)); 1450 return &II; 1451 } 1452 1453 return nullptr; 1454 } 1455 1456 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 1457 // XMM register mask efficiently, we could transform all x86 masked intrinsics 1458 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 1459 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { 1460 Value *Ptr = II.getOperand(0); 1461 Value *Mask = II.getOperand(1); 1462 Constant *ZeroVec = Constant::getNullValue(II.getType()); 1463 1464 // Special case a zero mask since that's not a ConstantDataVector. 1465 // This masked load instruction creates a zero vector. 1466 if (isa<ConstantAggregateZero>(Mask)) 1467 return IC.replaceInstUsesWith(II, ZeroVec); 1468 1469 auto *ConstMask = dyn_cast<ConstantDataVector>(Mask); 1470 if (!ConstMask) 1471 return nullptr; 1472 1473 // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic 1474 // to allow target-independent optimizations. 1475 1476 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match 1477 // the LLVM intrinsic definition for the pointer argument. 1478 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 1479 PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); 1480 Value *PtrCast = IC.Builder->CreateBitCast(Ptr, VecPtrTy, "castvec"); 1481 1482 // Second, convert the x86 XMM integer vector mask to a vector of bools based 1483 // on each element's most significant bit (the sign bit). 1484 Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); 1485 1486 // The pass-through vector for an x86 masked load is a zero vector. 
1487 CallInst *NewMaskedLoad = 1488 IC.Builder->CreateMaskedLoad(PtrCast, 1, BoolMask, ZeroVec); 1489 return IC.replaceInstUsesWith(II, NewMaskedLoad); 1490 } 1491 1492 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 1493 // XMM register mask efficiently, we could transform all x86 masked intrinsics 1494 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 1495 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { 1496 Value *Ptr = II.getOperand(0); 1497 Value *Mask = II.getOperand(1); 1498 Value *Vec = II.getOperand(2); 1499 1500 // Special case a zero mask since that's not a ConstantDataVector: 1501 // this masked store instruction does nothing. 1502 if (isa<ConstantAggregateZero>(Mask)) { 1503 IC.eraseInstFromFunction(II); 1504 return true; 1505 } 1506 1507 // The SSE2 version is too weird (eg, unaligned but non-temporal) to do 1508 // anything else at this level. 1509 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) 1510 return false; 1511 1512 auto *ConstMask = dyn_cast<ConstantDataVector>(Mask); 1513 if (!ConstMask) 1514 return false; 1515 1516 // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic 1517 // to allow target-independent optimizations. 1518 1519 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match 1520 // the LLVM intrinsic definition for the pointer argument. 1521 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 1522 PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); 1523 Value *PtrCast = IC.Builder->CreateBitCast(Ptr, VecPtrTy, "castvec"); 1524 1525 // Second, convert the x86 XMM integer vector mask to a vector of bools based 1526 // on each element's most significant bit (the sign bit). 1527 Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); 1528 1529 IC.Builder->CreateMaskedStore(Vec, PtrCast, 1, BoolMask); 1530 1531 // 'Replace uses' doesn't work for stores. 
Erase the original masked store. 1532 IC.eraseInstFromFunction(II); 1533 return true; 1534 } 1535 1536 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs. 1537 // 1538 // A single NaN input is folded to minnum, so we rely on that folding for 1539 // handling NaNs. 1540 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, 1541 const APFloat &Src2) { 1542 APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2); 1543 1544 APFloat::cmpResult Cmp0 = Max3.compare(Src0); 1545 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately"); 1546 if (Cmp0 == APFloat::cmpEqual) 1547 return maxnum(Src1, Src2); 1548 1549 APFloat::cmpResult Cmp1 = Max3.compare(Src1); 1550 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately"); 1551 if (Cmp1 == APFloat::cmpEqual) 1552 return maxnum(Src0, Src2); 1553 1554 return maxnum(Src0, Src1); 1555 } 1556 1557 // Returns true iff the 2 intrinsics have the same operands, limiting the 1558 // comparison to the first NumOperands. 1559 static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E, 1560 unsigned NumOperands) { 1561 assert(I.getNumArgOperands() >= NumOperands && "Not enough operands"); 1562 assert(E.getNumArgOperands() >= NumOperands && "Not enough operands"); 1563 for (unsigned i = 0; i < NumOperands; i++) 1564 if (I.getArgOperand(i) != E.getArgOperand(i)) 1565 return false; 1566 return true; 1567 } 1568 1569 // Remove trivially empty start/end intrinsic ranges, i.e. a start 1570 // immediately followed by an end (ignoring debuginfo or other 1571 // start/end intrinsics in between). 
As this handles only the most trivial 1572 // cases, tracking the nesting level is not needed: 1573 // 1574 // call @llvm.foo.start(i1 0) ; &I 1575 // call @llvm.foo.start(i1 0) 1576 // call @llvm.foo.end(i1 0) ; This one will not be skipped: it will be removed 1577 // call @llvm.foo.end(i1 0) 1578 static bool removeTriviallyEmptyRange(IntrinsicInst &I, unsigned StartID, 1579 unsigned EndID, InstCombiner &IC) { 1580 assert(I.getIntrinsicID() == StartID && 1581 "Start intrinsic does not have expected ID"); 1582 BasicBlock::iterator BI(I), BE(I.getParent()->end()); 1583 for (++BI; BI != BE; ++BI) { 1584 if (auto *E = dyn_cast<IntrinsicInst>(BI)) { 1585 if (isa<DbgInfoIntrinsic>(E) || E->getIntrinsicID() == StartID) 1586 continue; 1587 if (E->getIntrinsicID() == EndID && 1588 haveSameOperands(I, *E, E->getNumArgOperands())) { 1589 IC.eraseInstFromFunction(*E); 1590 IC.eraseInstFromFunction(I); 1591 return true; 1592 } 1593 } 1594 break; 1595 } 1596 1597 return false; 1598 } 1599 1600 // Convert NVVM intrinsics to target-generic LLVM code where possible. 1601 static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) { 1602 // Each NVVM intrinsic we can simplify can be replaced with one of: 1603 // 1604 // * an LLVM intrinsic, 1605 // * an LLVM cast operation, 1606 // * an LLVM binary operation, or 1607 // * ad-hoc LLVM IR for the particular operation. 1608 1609 // Some transformations are only valid when the module's 1610 // flush-denormals-to-zero (ftz) setting is true/false, whereas other 1611 // transformations are valid regardless of the module's ftz setting. 1612 enum FtzRequirementTy { 1613 FTZ_Any, // Any ftz setting is ok. 1614 FTZ_MustBeOn, // Transformation is valid only if ftz is on. 1615 FTZ_MustBeOff, // Transformation is valid only if ftz is off. 1616 }; 1617 // Classes of NVVM intrinsics that can't be replaced one-to-one with a 1618 // target-generic intrinsic, cast op, or binary op but that we can nonetheless 1619 // simplify. 
1620 enum SpecialCase { 1621 SPC_Reciprocal, 1622 }; 1623 1624 // SimplifyAction is a poor-man's variant (plus an additional flag) that 1625 // represents how to replace an NVVM intrinsic with target-generic LLVM IR. 1626 struct SimplifyAction { 1627 // Invariant: At most one of these Optionals has a value. 1628 Optional<Intrinsic::ID> IID; 1629 Optional<Instruction::CastOps> CastOp; 1630 Optional<Instruction::BinaryOps> BinaryOp; 1631 Optional<SpecialCase> Special; 1632 1633 FtzRequirementTy FtzRequirement = FTZ_Any; 1634 1635 SimplifyAction() = default; 1636 1637 SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq) 1638 : IID(IID), FtzRequirement(FtzReq) {} 1639 1640 // Cast operations don't have anything to do with FTZ, so we skip that 1641 // argument. 1642 SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {} 1643 1644 SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq) 1645 : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {} 1646 1647 SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq) 1648 : Special(Special), FtzRequirement(FtzReq) {} 1649 }; 1650 1651 // Try to generate a SimplifyAction describing how to replace our 1652 // IntrinsicInstr with target-generic LLVM IR. 1653 const SimplifyAction Action = [II]() -> SimplifyAction { 1654 switch (II->getIntrinsicID()) { 1655 1656 // NVVM intrinsics that map directly to LLVM intrinsics. 
1657 case Intrinsic::nvvm_ceil_d: 1658 return {Intrinsic::ceil, FTZ_Any}; 1659 case Intrinsic::nvvm_ceil_f: 1660 return {Intrinsic::ceil, FTZ_MustBeOff}; 1661 case Intrinsic::nvvm_ceil_ftz_f: 1662 return {Intrinsic::ceil, FTZ_MustBeOn}; 1663 case Intrinsic::nvvm_fabs_d: 1664 return {Intrinsic::fabs, FTZ_Any}; 1665 case Intrinsic::nvvm_fabs_f: 1666 return {Intrinsic::fabs, FTZ_MustBeOff}; 1667 case Intrinsic::nvvm_fabs_ftz_f: 1668 return {Intrinsic::fabs, FTZ_MustBeOn}; 1669 case Intrinsic::nvvm_floor_d: 1670 return {Intrinsic::floor, FTZ_Any}; 1671 case Intrinsic::nvvm_floor_f: 1672 return {Intrinsic::floor, FTZ_MustBeOff}; 1673 case Intrinsic::nvvm_floor_ftz_f: 1674 return {Intrinsic::floor, FTZ_MustBeOn}; 1675 case Intrinsic::nvvm_fma_rn_d: 1676 return {Intrinsic::fma, FTZ_Any}; 1677 case Intrinsic::nvvm_fma_rn_f: 1678 return {Intrinsic::fma, FTZ_MustBeOff}; 1679 case Intrinsic::nvvm_fma_rn_ftz_f: 1680 return {Intrinsic::fma, FTZ_MustBeOn}; 1681 case Intrinsic::nvvm_fmax_d: 1682 return {Intrinsic::maxnum, FTZ_Any}; 1683 case Intrinsic::nvvm_fmax_f: 1684 return {Intrinsic::maxnum, FTZ_MustBeOff}; 1685 case Intrinsic::nvvm_fmax_ftz_f: 1686 return {Intrinsic::maxnum, FTZ_MustBeOn}; 1687 case Intrinsic::nvvm_fmin_d: 1688 return {Intrinsic::minnum, FTZ_Any}; 1689 case Intrinsic::nvvm_fmin_f: 1690 return {Intrinsic::minnum, FTZ_MustBeOff}; 1691 case Intrinsic::nvvm_fmin_ftz_f: 1692 return {Intrinsic::minnum, FTZ_MustBeOn}; 1693 case Intrinsic::nvvm_round_d: 1694 return {Intrinsic::round, FTZ_Any}; 1695 case Intrinsic::nvvm_round_f: 1696 return {Intrinsic::round, FTZ_MustBeOff}; 1697 case Intrinsic::nvvm_round_ftz_f: 1698 return {Intrinsic::round, FTZ_MustBeOn}; 1699 case Intrinsic::nvvm_sqrt_rn_d: 1700 return {Intrinsic::sqrt, FTZ_Any}; 1701 case Intrinsic::nvvm_sqrt_f: 1702 // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the 1703 // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts 1704 // the ftz-ness of the surrounding code. 
sqrt_rn_f and sqrt_rn_ftz_f are 1705 // the versions with explicit ftz-ness. 1706 return {Intrinsic::sqrt, FTZ_Any}; 1707 case Intrinsic::nvvm_sqrt_rn_f: 1708 return {Intrinsic::sqrt, FTZ_MustBeOff}; 1709 case Intrinsic::nvvm_sqrt_rn_ftz_f: 1710 return {Intrinsic::sqrt, FTZ_MustBeOn}; 1711 case Intrinsic::nvvm_trunc_d: 1712 return {Intrinsic::trunc, FTZ_Any}; 1713 case Intrinsic::nvvm_trunc_f: 1714 return {Intrinsic::trunc, FTZ_MustBeOff}; 1715 case Intrinsic::nvvm_trunc_ftz_f: 1716 return {Intrinsic::trunc, FTZ_MustBeOn}; 1717 1718 // NVVM intrinsics that map to LLVM cast operations. 1719 // 1720 // Note that llvm's target-generic conversion operators correspond to the rz 1721 // (round to zero) versions of the nvvm conversion intrinsics, even though 1722 // most everything else here uses the rn (round to nearest even) nvvm ops. 1723 case Intrinsic::nvvm_d2i_rz: 1724 case Intrinsic::nvvm_f2i_rz: 1725 case Intrinsic::nvvm_d2ll_rz: 1726 case Intrinsic::nvvm_f2ll_rz: 1727 return {Instruction::FPToSI}; 1728 case Intrinsic::nvvm_d2ui_rz: 1729 case Intrinsic::nvvm_f2ui_rz: 1730 case Intrinsic::nvvm_d2ull_rz: 1731 case Intrinsic::nvvm_f2ull_rz: 1732 return {Instruction::FPToUI}; 1733 case Intrinsic::nvvm_i2d_rz: 1734 case Intrinsic::nvvm_i2f_rz: 1735 case Intrinsic::nvvm_ll2d_rz: 1736 case Intrinsic::nvvm_ll2f_rz: 1737 return {Instruction::SIToFP}; 1738 case Intrinsic::nvvm_ui2d_rz: 1739 case Intrinsic::nvvm_ui2f_rz: 1740 case Intrinsic::nvvm_ull2d_rz: 1741 case Intrinsic::nvvm_ull2f_rz: 1742 return {Instruction::UIToFP}; 1743 1744 // NVVM intrinsics that map to LLVM binary ops. 
1745 case Intrinsic::nvvm_add_rn_d: 1746 return {Instruction::FAdd, FTZ_Any}; 1747 case Intrinsic::nvvm_add_rn_f: 1748 return {Instruction::FAdd, FTZ_MustBeOff}; 1749 case Intrinsic::nvvm_add_rn_ftz_f: 1750 return {Instruction::FAdd, FTZ_MustBeOn}; 1751 case Intrinsic::nvvm_mul_rn_d: 1752 return {Instruction::FMul, FTZ_Any}; 1753 case Intrinsic::nvvm_mul_rn_f: 1754 return {Instruction::FMul, FTZ_MustBeOff}; 1755 case Intrinsic::nvvm_mul_rn_ftz_f: 1756 return {Instruction::FMul, FTZ_MustBeOn}; 1757 case Intrinsic::nvvm_div_rn_d: 1758 return {Instruction::FDiv, FTZ_Any}; 1759 case Intrinsic::nvvm_div_rn_f: 1760 return {Instruction::FDiv, FTZ_MustBeOff}; 1761 case Intrinsic::nvvm_div_rn_ftz_f: 1762 return {Instruction::FDiv, FTZ_MustBeOn}; 1763 1764 // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but 1765 // need special handling. 1766 // 1767 // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just 1768 // as well. 1769 case Intrinsic::nvvm_rcp_rn_d: 1770 return {SPC_Reciprocal, FTZ_Any}; 1771 case Intrinsic::nvvm_rcp_rn_f: 1772 return {SPC_Reciprocal, FTZ_MustBeOff}; 1773 case Intrinsic::nvvm_rcp_rn_ftz_f: 1774 return {SPC_Reciprocal, FTZ_MustBeOn}; 1775 1776 // We do not currently simplify intrinsics that give an approximate answer. 1777 // These include: 1778 // 1779 // - nvvm_cos_approx_{f,ftz_f} 1780 // - nvvm_ex2_approx_{d,f,ftz_f} 1781 // - nvvm_lg2_approx_{d,f,ftz_f} 1782 // - nvvm_sin_approx_{f,ftz_f} 1783 // - nvvm_sqrt_approx_{f,ftz_f} 1784 // - nvvm_rsqrt_approx_{d,f,ftz_f} 1785 // - nvvm_div_approx_{ftz_d,ftz_f,f} 1786 // - nvvm_rcp_approx_ftz_d 1787 // 1788 // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast" 1789 // means that fastmath is enabled in the intrinsic. Unfortunately only 1790 // binary operators (currently) have a fastmath bit in SelectionDAG, so this 1791 // information gets lost and we can't select on it.
1792 // 1793 // TODO: div and rcp are lowered to a binary op, so in theory we could 1794 // lower these to "fast fdiv". 1795 1796 default: 1797 return {}; 1798 } 1799 }(); 1800 1801 // If Action.FtzRequirement is not satisfied by the function's ftz state, we 1802 // can bail out now. (Notice that in the case that II is not one of the NVVM 1803 // intrinsics handled above, we don't have to look up the "nvptx-f32ftz" 1804 // function attribute, as FtzRequirement will be FTZ_Any.) 1805 if (Action.FtzRequirement != FTZ_Any) { 1806 bool FtzEnabled = 1807 II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() == 1808 "true"; 1809 1810 if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn)) 1811 return nullptr; 1812 } 1813 1814 // Simplify to target-generic intrinsic. 1815 if (Action.IID) { 1816 SmallVector<Value *, 4> Args(II->arg_operands()); 1817 // All the target-generic intrinsics currently of interest to us have one 1818 // type argument, equal to that of the nvvm intrinsic's argument. 1819 Type *Tys[] = {II->getArgOperand(0)->getType()}; 1820 return CallInst::Create( 1821 Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); 1822 } 1823 1824 // Simplify to target-generic binary op. 1825 if (Action.BinaryOp) 1826 return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0), 1827 II->getArgOperand(1), II->getName()); 1828 1829 // Simplify to target-generic cast op. 1830 if (Action.CastOp) 1831 return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(), 1832 II->getName()); 1833 1834 // All that's left are the special cases. 1835 if (!Action.Special) 1836 return nullptr; 1837 1838 switch (*Action.Special) { 1839 case SPC_Reciprocal: 1840 // Simplify reciprocal.
1841 return BinaryOperator::Create( 1842 Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1), 1843 II->getArgOperand(0), II->getName()); 1844 } 1845 llvm_unreachable("All SpecialCase enumerators should be handled in switch."); 1846 } 1847 /* Visitor for va_start: passes I to removeTriviallyEmptyRange together with the vastart/vaend intrinsic pair and always returns nullptr. NOTE(review): presumably the helper drops a va_start whose va_end follows with nothing in between; it is defined elsewhere, so confirm there. */ 1848 Instruction *InstCombiner::visitVAStartInst(VAStartInst &I) { 1849 removeTriviallyEmptyRange(I, Intrinsic::vastart, Intrinsic::vaend, *this); 1850 return nullptr; 1851 } 1852 /* Visitor for va_copy: passes I to removeTriviallyEmptyRange together with the vacopy/vaend intrinsic pair and always returns nullptr. NOTE(review): presumably the helper drops a va_copy whose va_end follows with nothing in between; it is defined elsewhere, so confirm there. */ 1853 Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) { 1854 removeTriviallyEmptyRange(I, Intrinsic::vacopy, Intrinsic::vaend, *this); 1855 return nullptr; 1856 } 1857 1858 /// CallInst simplification. This mostly only handles folding of intrinsic 1859 /// instructions. For normal calls, it allows visitCallSite to do the heavy 1860 /// lifting. 1861 Instruction *InstCombiner::visitCallInst(CallInst &CI) { 1862 auto Args = CI.arg_operands(); 1863 if (Value *V = SimplifyCall(&CI, CI.getCalledValue(), Args.begin(), 1864 Args.end(), SQ.getWithInstruction(&CI))) 1865 return replaceInstUsesWith(CI, V); 1866 1867 if (isFreeCall(&CI, &TLI)) 1868 return visitFree(CI); 1869 1870 // If the caller function is nounwind, mark the call as nounwind, even if the 1871 // callee isn't. 1872 if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) { 1873 CI.setDoesNotThrow(); 1874 return &CI; 1875 } 1876 1877 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI); 1878 if (!II) return visitCallSite(&CI); 1879 1880 // Intrinsics cannot occur in an invoke, so handle them here instead of in 1881 // visitCallSite. 1882 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) { 1883 bool Changed = false; 1884 1885 // memmove/cpy/set of zero bytes is a noop. 1886 if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) { 1887 if (NumBytes->isNullValue()) 1888 return eraseInstFromFunction(CI); 1889 1890 if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) 1891 if (CI->getZExtValue() == 1) { 1892 // Replace the instruction with just byte operations.
We would 1893 // transform other cases to loads/stores, but we don't know if 1894 // alignment is sufficient. 1895 } 1896 } 1897 1898 // No other transformations apply to volatile transfers. 1899 if (MI->isVolatile()) 1900 return nullptr; 1901 1902 // If we have a memmove and the source operation is a constant global, 1903 // then the source and dest pointers can't alias, so we can change this 1904 // into a call to memcpy. 1905 if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) { 1906 if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource())) 1907 if (GVSrc->isConstant()) { 1908 Module *M = CI.getModule(); 1909 Intrinsic::ID MemCpyID = Intrinsic::memcpy; 1910 Type *Tys[3] = { CI.getArgOperand(0)->getType(), 1911 CI.getArgOperand(1)->getType(), 1912 CI.getArgOperand(2)->getType() }; 1913 CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys)); 1914 Changed = true; 1915 } 1916 } 1917 1918 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { 1919 // memmove(x,x,size) -> noop. 1920 if (MTI->getSource() == MTI->getDest()) 1921 return eraseInstFromFunction(CI); 1922 } 1923 1924 // If we can determine a pointer alignment that is bigger than currently 1925 // set, update the alignment. 
1926 if (isa<MemTransferInst>(MI)) { 1927 if (Instruction *I = SimplifyMemTransfer(MI)) 1928 return I; 1929 } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) { 1930 if (Instruction *I = SimplifyMemSet(MSI)) 1931 return I; 1932 } 1933 1934 if (Changed) return II; 1935 } 1936 1937 if (auto *AMI = dyn_cast<ElementUnorderedAtomicMemCpyInst>(II)) { 1938 if (Constant *C = dyn_cast<Constant>(AMI->getLength())) 1939 if (C->isNullValue()) 1940 return eraseInstFromFunction(*AMI); 1941 1942 if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI)) 1943 return I; 1944 } 1945 1946 if (Instruction *I = SimplifyNVVMIntrinsic(II, *this)) 1947 return I; 1948 1949 auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, 1950 unsigned DemandedWidth) { 1951 APInt UndefElts(Width, 0); 1952 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); 1953 return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); 1954 }; 1955 1956 switch (II->getIntrinsicID()) { 1957 default: break; 1958 case Intrinsic::objectsize: 1959 if (ConstantInt *N = 1960 lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false)) 1961 return replaceInstUsesWith(CI, N); 1962 return nullptr; 1963 1964 case Intrinsic::bswap: { 1965 Value *IIOperand = II->getArgOperand(0); 1966 Value *X = nullptr; 1967 1968 // bswap(bswap(x)) -> x 1969 if (match(IIOperand, m_BSwap(m_Value(X)))) 1970 return replaceInstUsesWith(CI, X); 1971 1972 // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) 1973 if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) { 1974 unsigned C = X->getType()->getPrimitiveSizeInBits() - 1975 IIOperand->getType()->getPrimitiveSizeInBits(); 1976 Value *CV = ConstantInt::get(X->getType(), C); 1977 Value *V = Builder->CreateLShr(X, CV); 1978 return new TruncInst(V, IIOperand->getType()); 1979 } 1980 break; 1981 } 1982 1983 case Intrinsic::bitreverse: { 1984 Value *IIOperand = II->getArgOperand(0); 1985 Value *X = nullptr; 1986 1987 // bitreverse(bitreverse(x)) -> x 1988 if 
(match(IIOperand, m_Intrinsic<Intrinsic::bitreverse>(m_Value(X)))) 1989 return replaceInstUsesWith(CI, X); 1990 break; 1991 } 1992 1993 case Intrinsic::masked_load: 1994 if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II, *Builder)) 1995 return replaceInstUsesWith(CI, SimplifiedMaskedOp); 1996 break; 1997 case Intrinsic::masked_store: 1998 return simplifyMaskedStore(*II, *this); 1999 case Intrinsic::masked_gather: 2000 return simplifyMaskedGather(*II, *this); 2001 case Intrinsic::masked_scatter: 2002 return simplifyMaskedScatter(*II, *this); 2003 2004 case Intrinsic::powi: 2005 if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2006 // powi(x, 0) -> 1.0 2007 if (Power->isZero()) 2008 return replaceInstUsesWith(CI, ConstantFP::get(CI.getType(), 1.0)); 2009 // powi(x, 1) -> x 2010 if (Power->isOne()) 2011 return replaceInstUsesWith(CI, II->getArgOperand(0)); 2012 // powi(x, -1) -> 1/x 2013 if (Power->isAllOnesValue()) 2014 return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0), 2015 II->getArgOperand(0)); 2016 } 2017 break; 2018 2019 case Intrinsic::cttz: 2020 case Intrinsic::ctlz: 2021 if (auto *I = foldCttzCtlz(*II, *this)) 2022 return I; 2023 break; 2024 2025 case Intrinsic::ctpop: 2026 if (auto *I = foldCtpop(*II, *this)) 2027 return I; 2028 break; 2029 2030 case Intrinsic::uadd_with_overflow: 2031 case Intrinsic::sadd_with_overflow: 2032 case Intrinsic::umul_with_overflow: 2033 case Intrinsic::smul_with_overflow: 2034 if (isa<Constant>(II->getArgOperand(0)) && 2035 !isa<Constant>(II->getArgOperand(1))) { 2036 // Canonicalize constants into the RHS. 
2037 Value *LHS = II->getArgOperand(0); 2038 II->setArgOperand(0, II->getArgOperand(1)); 2039 II->setArgOperand(1, LHS); 2040 return II; 2041 } 2042 LLVM_FALLTHROUGH; 2043 2044 case Intrinsic::usub_with_overflow: 2045 case Intrinsic::ssub_with_overflow: { 2046 OverflowCheckFlavor OCF = 2047 IntrinsicIDToOverflowCheckFlavor(II->getIntrinsicID()); 2048 assert(OCF != OCF_INVALID && "unexpected!"); 2049 2050 Value *OperationResult = nullptr; 2051 Constant *OverflowResult = nullptr; 2052 if (OptimizeOverflowCheck(OCF, II->getArgOperand(0), II->getArgOperand(1), 2053 *II, OperationResult, OverflowResult)) 2054 return CreateOverflowTuple(II, OperationResult, OverflowResult); 2055 2056 break; 2057 } 2058 2059 case Intrinsic::minnum: 2060 case Intrinsic::maxnum: { 2061 Value *Arg0 = II->getArgOperand(0); 2062 Value *Arg1 = II->getArgOperand(1); 2063 // Canonicalize constants to the RHS. 2064 if (isa<ConstantFP>(Arg0) && !isa<ConstantFP>(Arg1)) { 2065 II->setArgOperand(0, Arg1); 2066 II->setArgOperand(1, Arg0); 2067 return II; 2068 } 2069 if (Value *V = simplifyMinnumMaxnum(*II)) 2070 return replaceInstUsesWith(*II, V); 2071 break; 2072 } 2073 case Intrinsic::fmuladd: { 2074 // Canonicalize fast fmuladd to the separate fmul + fadd. 2075 if (II->hasUnsafeAlgebra()) { 2076 BuilderTy::FastMathFlagGuard Guard(*Builder); 2077 Builder->setFastMathFlags(II->getFastMathFlags()); 2078 Value *Mul = Builder->CreateFMul(II->getArgOperand(0), 2079 II->getArgOperand(1)); 2080 Value *Add = Builder->CreateFAdd(Mul, II->getArgOperand(2)); 2081 Add->takeName(II); 2082 return replaceInstUsesWith(*II, Add); 2083 } 2084 2085 LLVM_FALLTHROUGH; 2086 } 2087 case Intrinsic::fma: { 2088 Value *Src0 = II->getArgOperand(0); 2089 Value *Src1 = II->getArgOperand(1); 2090 2091 // Canonicalize constants into the RHS. 
2092 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 2093 II->setArgOperand(0, Src1); 2094 II->setArgOperand(1, Src0); 2095 std::swap(Src0, Src1); 2096 } 2097 2098 Value *LHS = nullptr; 2099 Value *RHS = nullptr; 2100 2101 // fma fneg(x), fneg(y), z -> fma x, y, z 2102 if (match(Src0, m_FNeg(m_Value(LHS))) && 2103 match(Src1, m_FNeg(m_Value(RHS)))) { 2104 II->setArgOperand(0, LHS); 2105 II->setArgOperand(1, RHS); 2106 return II; 2107 } 2108 2109 // fma fabs(x), fabs(x), z -> fma x, x, z 2110 if (match(Src0, m_Intrinsic<Intrinsic::fabs>(m_Value(LHS))) && 2111 match(Src1, m_Intrinsic<Intrinsic::fabs>(m_Value(RHS))) && LHS == RHS) { 2112 II->setArgOperand(0, LHS); 2113 II->setArgOperand(1, RHS); 2114 return II; 2115 } 2116 2117 // fma x, 1, z -> fadd x, z 2118 if (match(Src1, m_FPOne())) { 2119 Instruction *RI = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2)); 2120 RI->copyFastMathFlags(II); 2121 return RI; 2122 } 2123 2124 break; 2125 } 2126 case Intrinsic::fabs: { 2127 Value *Cond; 2128 Constant *LHS, *RHS; 2129 if (match(II->getArgOperand(0), 2130 m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) { 2131 CallInst *Call0 = Builder->CreateCall(II->getCalledFunction(), {LHS}); 2132 CallInst *Call1 = Builder->CreateCall(II->getCalledFunction(), {RHS}); 2133 return SelectInst::Create(Cond, Call0, Call1); 2134 } 2135 2136 LLVM_FALLTHROUGH; 2137 } 2138 case Intrinsic::ceil: 2139 case Intrinsic::floor: 2140 case Intrinsic::round: 2141 case Intrinsic::nearbyint: 2142 case Intrinsic::rint: 2143 case Intrinsic::trunc: { 2144 Value *ExtSrc; 2145 if (match(II->getArgOperand(0), m_FPExt(m_Value(ExtSrc))) && 2146 II->getArgOperand(0)->hasOneUse()) { 2147 // fabs (fpext x) -> fpext (fabs x) 2148 Value *F = Intrinsic::getDeclaration(II->getModule(), II->getIntrinsicID(), 2149 { ExtSrc->getType() }); 2150 CallInst *NewFabs = Builder->CreateCall(F, ExtSrc); 2151 NewFabs->copyFastMathFlags(II); 2152 NewFabs->takeName(II); 2153 return new FPExtInst(NewFabs, 
II->getType()); 2154 } 2155 2156 break; 2157 } 2158 case Intrinsic::cos: 2159 case Intrinsic::amdgcn_cos: { 2160 Value *SrcSrc; 2161 Value *Src = II->getArgOperand(0); 2162 if (match(Src, m_FNeg(m_Value(SrcSrc))) || 2163 match(Src, m_Intrinsic<Intrinsic::fabs>(m_Value(SrcSrc)))) { 2164 // cos(-x) -> cos(x) 2165 // cos(fabs(x)) -> cos(x) 2166 II->setArgOperand(0, SrcSrc); 2167 return II; 2168 } 2169 2170 break; 2171 } 2172 case Intrinsic::ppc_altivec_lvx: 2173 case Intrinsic::ppc_altivec_lvxl: 2174 // Turn PPC lvx -> load if the pointer is known aligned. 2175 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, 2176 &DT) >= 16) { 2177 Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), 2178 PointerType::getUnqual(II->getType())); 2179 return new LoadInst(Ptr); 2180 } 2181 break; 2182 case Intrinsic::ppc_vsx_lxvw4x: 2183 case Intrinsic::ppc_vsx_lxvd2x: { 2184 // Turn PPC VSX loads into normal loads. 2185 Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), 2186 PointerType::getUnqual(II->getType())); 2187 return new LoadInst(Ptr, Twine(""), false, 1); 2188 } 2189 case Intrinsic::ppc_altivec_stvx: 2190 case Intrinsic::ppc_altivec_stvxl: 2191 // Turn stvx -> store if the pointer is known aligned. 2192 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, 2193 &DT) >= 16) { 2194 Type *OpPtrTy = 2195 PointerType::getUnqual(II->getArgOperand(0)->getType()); 2196 Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); 2197 return new StoreInst(II->getArgOperand(0), Ptr); 2198 } 2199 break; 2200 case Intrinsic::ppc_vsx_stxvw4x: 2201 case Intrinsic::ppc_vsx_stxvd2x: { 2202 // Turn PPC VSX stores into normal stores. 
2203 Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); 2204 Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); 2205 return new StoreInst(II->getArgOperand(0), Ptr, false, 1); 2206 } 2207 case Intrinsic::ppc_qpx_qvlfs: 2208 // Turn PPC QPX qvlfs -> load if the pointer is known aligned. 2209 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, 2210 &DT) >= 16) { 2211 Type *VTy = VectorType::get(Builder->getFloatTy(), 2212 II->getType()->getVectorNumElements()); 2213 Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), 2214 PointerType::getUnqual(VTy)); 2215 Value *Load = Builder->CreateLoad(Ptr); 2216 return new FPExtInst(Load, II->getType()); 2217 } 2218 break; 2219 case Intrinsic::ppc_qpx_qvlfd: 2220 // Turn PPC QPX qvlfd -> load if the pointer is known aligned. 2221 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC, 2222 &DT) >= 32) { 2223 Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), 2224 PointerType::getUnqual(II->getType())); 2225 return new LoadInst(Ptr); 2226 } 2227 break; 2228 case Intrinsic::ppc_qpx_qvstfs: 2229 // Turn PPC QPX qvstfs -> store if the pointer is known aligned. 2230 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, 2231 &DT) >= 16) { 2232 Type *VTy = VectorType::get(Builder->getFloatTy(), 2233 II->getArgOperand(0)->getType()->getVectorNumElements()); 2234 Value *TOp = Builder->CreateFPTrunc(II->getArgOperand(0), VTy); 2235 Type *OpPtrTy = PointerType::getUnqual(VTy); 2236 Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); 2237 return new StoreInst(TOp, Ptr); 2238 } 2239 break; 2240 case Intrinsic::ppc_qpx_qvstfd: 2241 // Turn PPC QPX qvstfd -> store if the pointer is known aligned. 
2242 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, &AC, 2243 &DT) >= 32) { 2244 Type *OpPtrTy = 2245 PointerType::getUnqual(II->getArgOperand(0)->getType()); 2246 Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); 2247 return new StoreInst(II->getArgOperand(0), Ptr); 2248 } 2249 break; 2250 2251 case Intrinsic::x86_vcvtph2ps_128: 2252 case Intrinsic::x86_vcvtph2ps_256: { 2253 auto Arg = II->getArgOperand(0); 2254 auto ArgType = cast<VectorType>(Arg->getType()); 2255 auto RetType = cast<VectorType>(II->getType()); 2256 unsigned ArgWidth = ArgType->getNumElements(); 2257 unsigned RetWidth = RetType->getNumElements(); 2258 assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths"); 2259 assert(ArgType->isIntOrIntVectorTy() && 2260 ArgType->getScalarSizeInBits() == 16 && 2261 "CVTPH2PS input type should be 16-bit integer vector"); 2262 assert(RetType->getScalarType()->isFloatTy() && 2263 "CVTPH2PS output type should be 32-bit float vector"); 2264 2265 // Constant folding: Convert to generic half to single conversion. 2266 if (isa<ConstantAggregateZero>(Arg)) 2267 return replaceInstUsesWith(*II, ConstantAggregateZero::get(RetType)); 2268 2269 if (isa<ConstantDataVector>(Arg)) { 2270 auto VectorHalfAsShorts = Arg; 2271 if (RetWidth < ArgWidth) { 2272 SmallVector<uint32_t, 8> SubVecMask; 2273 for (unsigned i = 0; i != RetWidth; ++i) 2274 SubVecMask.push_back((int)i); 2275 VectorHalfAsShorts = Builder->CreateShuffleVector( 2276 Arg, UndefValue::get(ArgType), SubVecMask); 2277 } 2278 2279 auto VectorHalfType = 2280 VectorType::get(Type::getHalfTy(II->getContext()), RetWidth); 2281 auto VectorHalfs = 2282 Builder->CreateBitCast(VectorHalfAsShorts, VectorHalfType); 2283 auto VectorFloats = Builder->CreateFPExt(VectorHalfs, RetType); 2284 return replaceInstUsesWith(*II, VectorFloats); 2285 } 2286 2287 // We only use the lowest lanes of the argument. 
2288 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) { 2289 II->setArgOperand(0, V); 2290 return II; 2291 } 2292 break; 2293 } 2294 2295 case Intrinsic::x86_sse_cvtss2si: 2296 case Intrinsic::x86_sse_cvtss2si64: 2297 case Intrinsic::x86_sse_cvttss2si: 2298 case Intrinsic::x86_sse_cvttss2si64: 2299 case Intrinsic::x86_sse2_cvtsd2si: 2300 case Intrinsic::x86_sse2_cvtsd2si64: 2301 case Intrinsic::x86_sse2_cvttsd2si: 2302 case Intrinsic::x86_sse2_cvttsd2si64: 2303 case Intrinsic::x86_avx512_vcvtss2si32: 2304 case Intrinsic::x86_avx512_vcvtss2si64: 2305 case Intrinsic::x86_avx512_vcvtss2usi32: 2306 case Intrinsic::x86_avx512_vcvtss2usi64: 2307 case Intrinsic::x86_avx512_vcvtsd2si32: 2308 case Intrinsic::x86_avx512_vcvtsd2si64: 2309 case Intrinsic::x86_avx512_vcvtsd2usi32: 2310 case Intrinsic::x86_avx512_vcvtsd2usi64: 2311 case Intrinsic::x86_avx512_cvttss2si: 2312 case Intrinsic::x86_avx512_cvttss2si64: 2313 case Intrinsic::x86_avx512_cvttss2usi: 2314 case Intrinsic::x86_avx512_cvttss2usi64: 2315 case Intrinsic::x86_avx512_cvttsd2si: 2316 case Intrinsic::x86_avx512_cvttsd2si64: 2317 case Intrinsic::x86_avx512_cvttsd2usi: 2318 case Intrinsic::x86_avx512_cvttsd2usi64: { 2319 // These intrinsics only demand the 0th element of their input vectors. If 2320 // we can simplify the input based on that, do so now. 
2321 Value *Arg = II->getArgOperand(0); 2322 unsigned VWidth = Arg->getType()->getVectorNumElements(); 2323 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { 2324 II->setArgOperand(0, V); 2325 return II; 2326 } 2327 break; 2328 } 2329 2330 case Intrinsic::x86_mmx_pmovmskb: 2331 case Intrinsic::x86_sse_movmsk_ps: 2332 case Intrinsic::x86_sse2_movmsk_pd: 2333 case Intrinsic::x86_sse2_pmovmskb_128: 2334 case Intrinsic::x86_avx_movmsk_pd_256: 2335 case Intrinsic::x86_avx_movmsk_ps_256: 2336 case Intrinsic::x86_avx2_pmovmskb: { 2337 if (Value *V = simplifyX86movmsk(*II, *Builder)) 2338 return replaceInstUsesWith(*II, V); 2339 break; 2340 } 2341 2342 case Intrinsic::x86_sse_comieq_ss: 2343 case Intrinsic::x86_sse_comige_ss: 2344 case Intrinsic::x86_sse_comigt_ss: 2345 case Intrinsic::x86_sse_comile_ss: 2346 case Intrinsic::x86_sse_comilt_ss: 2347 case Intrinsic::x86_sse_comineq_ss: 2348 case Intrinsic::x86_sse_ucomieq_ss: 2349 case Intrinsic::x86_sse_ucomige_ss: 2350 case Intrinsic::x86_sse_ucomigt_ss: 2351 case Intrinsic::x86_sse_ucomile_ss: 2352 case Intrinsic::x86_sse_ucomilt_ss: 2353 case Intrinsic::x86_sse_ucomineq_ss: 2354 case Intrinsic::x86_sse2_comieq_sd: 2355 case Intrinsic::x86_sse2_comige_sd: 2356 case Intrinsic::x86_sse2_comigt_sd: 2357 case Intrinsic::x86_sse2_comile_sd: 2358 case Intrinsic::x86_sse2_comilt_sd: 2359 case Intrinsic::x86_sse2_comineq_sd: 2360 case Intrinsic::x86_sse2_ucomieq_sd: 2361 case Intrinsic::x86_sse2_ucomige_sd: 2362 case Intrinsic::x86_sse2_ucomigt_sd: 2363 case Intrinsic::x86_sse2_ucomile_sd: 2364 case Intrinsic::x86_sse2_ucomilt_sd: 2365 case Intrinsic::x86_sse2_ucomineq_sd: 2366 case Intrinsic::x86_avx512_vcomi_ss: 2367 case Intrinsic::x86_avx512_vcomi_sd: 2368 case Intrinsic::x86_avx512_mask_cmp_ss: 2369 case Intrinsic::x86_avx512_mask_cmp_sd: { 2370 // These intrinsics only demand the 0th element of their input vectors. If 2371 // we can simplify the input based on that, do so now. 
2372 bool MadeChange = false; 2373 Value *Arg0 = II->getArgOperand(0); 2374 Value *Arg1 = II->getArgOperand(1); 2375 unsigned VWidth = Arg0->getType()->getVectorNumElements(); 2376 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { 2377 II->setArgOperand(0, V); 2378 MadeChange = true; 2379 } 2380 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { 2381 II->setArgOperand(1, V); 2382 MadeChange = true; 2383 } 2384 if (MadeChange) 2385 return II; 2386 break; 2387 } 2388 case Intrinsic::x86_avx512_mask_cmp_pd_128: 2389 case Intrinsic::x86_avx512_mask_cmp_pd_256: 2390 case Intrinsic::x86_avx512_mask_cmp_pd_512: 2391 case Intrinsic::x86_avx512_mask_cmp_ps_128: 2392 case Intrinsic::x86_avx512_mask_cmp_ps_256: 2393 case Intrinsic::x86_avx512_mask_cmp_ps_512: { 2394 // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a) 2395 Value *Arg0 = II->getArgOperand(0); 2396 Value *Arg1 = II->getArgOperand(1); 2397 bool Arg0IsZero = match(Arg0, m_Zero()); 2398 if (Arg0IsZero) 2399 std::swap(Arg0, Arg1); 2400 Value *A, *B; 2401 // This fold requires only the NINF(not +/- inf) since inf minus 2402 // inf is nan. 2403 // NSZ(No Signed Zeros) is not needed because zeros of any sign are 2404 // equal for both compares. 2405 // NNAN is not needed because nans compare the same for both compares. 2406 // The compare intrinsic uses the above assumptions and therefore 2407 // doesn't require additional flags. 
2408 if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) && 2409 match(Arg1, m_Zero()) && 2410 cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) { 2411 if (Arg0IsZero) 2412 std::swap(A, B); 2413 II->setArgOperand(0, A); 2414 II->setArgOperand(1, B); 2415 return II; 2416 } 2417 break; 2418 } 2419 2420 case Intrinsic::x86_avx512_mask_add_ps_512: 2421 case Intrinsic::x86_avx512_mask_div_ps_512: 2422 case Intrinsic::x86_avx512_mask_mul_ps_512: 2423 case Intrinsic::x86_avx512_mask_sub_ps_512: 2424 case Intrinsic::x86_avx512_mask_add_pd_512: 2425 case Intrinsic::x86_avx512_mask_div_pd_512: 2426 case Intrinsic::x86_avx512_mask_mul_pd_512: 2427 case Intrinsic::x86_avx512_mask_sub_pd_512: 2428 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2429 // IR operations. 2430 if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) { 2431 if (R->getValue() == 4) { 2432 Value *Arg0 = II->getArgOperand(0); 2433 Value *Arg1 = II->getArgOperand(1); 2434 2435 Value *V; 2436 switch (II->getIntrinsicID()) { 2437 default: llvm_unreachable("Case stmts out of sync!"); 2438 case Intrinsic::x86_avx512_mask_add_ps_512: 2439 case Intrinsic::x86_avx512_mask_add_pd_512: 2440 V = Builder->CreateFAdd(Arg0, Arg1); 2441 break; 2442 case Intrinsic::x86_avx512_mask_sub_ps_512: 2443 case Intrinsic::x86_avx512_mask_sub_pd_512: 2444 V = Builder->CreateFSub(Arg0, Arg1); 2445 break; 2446 case Intrinsic::x86_avx512_mask_mul_ps_512: 2447 case Intrinsic::x86_avx512_mask_mul_pd_512: 2448 V = Builder->CreateFMul(Arg0, Arg1); 2449 break; 2450 case Intrinsic::x86_avx512_mask_div_ps_512: 2451 case Intrinsic::x86_avx512_mask_div_pd_512: 2452 V = Builder->CreateFDiv(Arg0, Arg1); 2453 break; 2454 } 2455 2456 // Create a select for the masking. 
2457 V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2), 2458 *Builder); 2459 return replaceInstUsesWith(*II, V); 2460 } 2461 } 2462 break; 2463 2464 case Intrinsic::x86_avx512_mask_add_ss_round: 2465 case Intrinsic::x86_avx512_mask_div_ss_round: 2466 case Intrinsic::x86_avx512_mask_mul_ss_round: 2467 case Intrinsic::x86_avx512_mask_sub_ss_round: 2468 case Intrinsic::x86_avx512_mask_add_sd_round: 2469 case Intrinsic::x86_avx512_mask_div_sd_round: 2470 case Intrinsic::x86_avx512_mask_mul_sd_round: 2471 case Intrinsic::x86_avx512_mask_sub_sd_round: 2472 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2473 // IR operations. 2474 if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) { 2475 if (R->getValue() == 4) { 2476 // Extract the element as scalars. 2477 Value *Arg0 = II->getArgOperand(0); 2478 Value *Arg1 = II->getArgOperand(1); 2479 Value *LHS = Builder->CreateExtractElement(Arg0, (uint64_t)0); 2480 Value *RHS = Builder->CreateExtractElement(Arg1, (uint64_t)0); 2481 2482 Value *V; 2483 switch (II->getIntrinsicID()) { 2484 default: llvm_unreachable("Case stmts out of sync!"); 2485 case Intrinsic::x86_avx512_mask_add_ss_round: 2486 case Intrinsic::x86_avx512_mask_add_sd_round: 2487 V = Builder->CreateFAdd(LHS, RHS); 2488 break; 2489 case Intrinsic::x86_avx512_mask_sub_ss_round: 2490 case Intrinsic::x86_avx512_mask_sub_sd_round: 2491 V = Builder->CreateFSub(LHS, RHS); 2492 break; 2493 case Intrinsic::x86_avx512_mask_mul_ss_round: 2494 case Intrinsic::x86_avx512_mask_mul_sd_round: 2495 V = Builder->CreateFMul(LHS, RHS); 2496 break; 2497 case Intrinsic::x86_avx512_mask_div_ss_round: 2498 case Intrinsic::x86_avx512_mask_div_sd_round: 2499 V = Builder->CreateFDiv(LHS, RHS); 2500 break; 2501 } 2502 2503 // Handle the masking aspect of the intrinsic. 2504 Value *Mask = II->getArgOperand(3); 2505 auto *C = dyn_cast<ConstantInt>(Mask); 2506 // We don't need a select if we know the mask bit is a 1. 
2507 if (!C || !C->getValue()[0]) { 2508 // Cast the mask to an i1 vector and then extract the lowest element. 2509 auto *MaskTy = VectorType::get(Builder->getInt1Ty(), 2510 cast<IntegerType>(Mask->getType())->getBitWidth()); 2511 Mask = Builder->CreateBitCast(Mask, MaskTy); 2512 Mask = Builder->CreateExtractElement(Mask, (uint64_t)0); 2513 // Extract the lowest element from the passthru operand. 2514 Value *Passthru = Builder->CreateExtractElement(II->getArgOperand(2), 2515 (uint64_t)0); 2516 V = Builder->CreateSelect(Mask, V, Passthru); 2517 } 2518 2519 // Insert the result back into the original argument 0. 2520 V = Builder->CreateInsertElement(Arg0, V, (uint64_t)0); 2521 2522 return replaceInstUsesWith(*II, V); 2523 } 2524 } 2525 LLVM_FALLTHROUGH; 2526 2527 // X86 scalar intrinsics simplified with SimplifyDemandedVectorElts. 2528 case Intrinsic::x86_avx512_mask_max_ss_round: 2529 case Intrinsic::x86_avx512_mask_min_ss_round: 2530 case Intrinsic::x86_avx512_mask_max_sd_round: 2531 case Intrinsic::x86_avx512_mask_min_sd_round: 2532 case Intrinsic::x86_avx512_mask_vfmadd_ss: 2533 case Intrinsic::x86_avx512_mask_vfmadd_sd: 2534 case Intrinsic::x86_avx512_maskz_vfmadd_ss: 2535 case Intrinsic::x86_avx512_maskz_vfmadd_sd: 2536 case Intrinsic::x86_avx512_mask3_vfmadd_ss: 2537 case Intrinsic::x86_avx512_mask3_vfmadd_sd: 2538 case Intrinsic::x86_avx512_mask3_vfmsub_ss: 2539 case Intrinsic::x86_avx512_mask3_vfmsub_sd: 2540 case Intrinsic::x86_avx512_mask3_vfnmsub_ss: 2541 case Intrinsic::x86_avx512_mask3_vfnmsub_sd: 2542 case Intrinsic::x86_fma_vfmadd_ss: 2543 case Intrinsic::x86_fma_vfmsub_ss: 2544 case Intrinsic::x86_fma_vfnmadd_ss: 2545 case Intrinsic::x86_fma_vfnmsub_ss: 2546 case Intrinsic::x86_fma_vfmadd_sd: 2547 case Intrinsic::x86_fma_vfmsub_sd: 2548 case Intrinsic::x86_fma_vfnmadd_sd: 2549 case Intrinsic::x86_fma_vfnmsub_sd: 2550 case Intrinsic::x86_sse_cmp_ss: 2551 case Intrinsic::x86_sse_min_ss: 2552 case Intrinsic::x86_sse_max_ss: 2553 case 
Intrinsic::x86_sse2_cmp_sd: 2554 case Intrinsic::x86_sse2_min_sd: 2555 case Intrinsic::x86_sse2_max_sd: 2556 case Intrinsic::x86_sse41_round_ss: 2557 case Intrinsic::x86_sse41_round_sd: 2558 case Intrinsic::x86_xop_vfrcz_ss: 2559 case Intrinsic::x86_xop_vfrcz_sd: { 2560 unsigned VWidth = II->getType()->getVectorNumElements(); 2561 APInt UndefElts(VWidth, 0); 2562 APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); 2563 if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) { 2564 if (V != II) 2565 return replaceInstUsesWith(*II, V); 2566 return II; 2567 } 2568 break; 2569 } 2570 2571 // Constant fold ashr( <A x Bi>, Ci ). 2572 // Constant fold lshr( <A x Bi>, Ci ). 2573 // Constant fold shl( <A x Bi>, Ci ). 2574 case Intrinsic::x86_sse2_psrai_d: 2575 case Intrinsic::x86_sse2_psrai_w: 2576 case Intrinsic::x86_avx2_psrai_d: 2577 case Intrinsic::x86_avx2_psrai_w: 2578 case Intrinsic::x86_avx512_psrai_q_128: 2579 case Intrinsic::x86_avx512_psrai_q_256: 2580 case Intrinsic::x86_avx512_psrai_d_512: 2581 case Intrinsic::x86_avx512_psrai_q_512: 2582 case Intrinsic::x86_avx512_psrai_w_512: 2583 case Intrinsic::x86_sse2_psrli_d: 2584 case Intrinsic::x86_sse2_psrli_q: 2585 case Intrinsic::x86_sse2_psrli_w: 2586 case Intrinsic::x86_avx2_psrli_d: 2587 case Intrinsic::x86_avx2_psrli_q: 2588 case Intrinsic::x86_avx2_psrli_w: 2589 case Intrinsic::x86_avx512_psrli_d_512: 2590 case Intrinsic::x86_avx512_psrli_q_512: 2591 case Intrinsic::x86_avx512_psrli_w_512: 2592 case Intrinsic::x86_sse2_pslli_d: 2593 case Intrinsic::x86_sse2_pslli_q: 2594 case Intrinsic::x86_sse2_pslli_w: 2595 case Intrinsic::x86_avx2_pslli_d: 2596 case Intrinsic::x86_avx2_pslli_q: 2597 case Intrinsic::x86_avx2_pslli_w: 2598 case Intrinsic::x86_avx512_pslli_d_512: 2599 case Intrinsic::x86_avx512_pslli_q_512: 2600 case Intrinsic::x86_avx512_pslli_w_512: 2601 if (Value *V = simplifyX86immShift(*II, *Builder)) 2602 return replaceInstUsesWith(*II, V); 2603 break; 2604 2605 case 
Intrinsic::x86_sse2_psra_d: 2606 case Intrinsic::x86_sse2_psra_w: 2607 case Intrinsic::x86_avx2_psra_d: 2608 case Intrinsic::x86_avx2_psra_w: 2609 case Intrinsic::x86_avx512_psra_q_128: 2610 case Intrinsic::x86_avx512_psra_q_256: 2611 case Intrinsic::x86_avx512_psra_d_512: 2612 case Intrinsic::x86_avx512_psra_q_512: 2613 case Intrinsic::x86_avx512_psra_w_512: 2614 case Intrinsic::x86_sse2_psrl_d: 2615 case Intrinsic::x86_sse2_psrl_q: 2616 case Intrinsic::x86_sse2_psrl_w: 2617 case Intrinsic::x86_avx2_psrl_d: 2618 case Intrinsic::x86_avx2_psrl_q: 2619 case Intrinsic::x86_avx2_psrl_w: 2620 case Intrinsic::x86_avx512_psrl_d_512: 2621 case Intrinsic::x86_avx512_psrl_q_512: 2622 case Intrinsic::x86_avx512_psrl_w_512: 2623 case Intrinsic::x86_sse2_psll_d: 2624 case Intrinsic::x86_sse2_psll_q: 2625 case Intrinsic::x86_sse2_psll_w: 2626 case Intrinsic::x86_avx2_psll_d: 2627 case Intrinsic::x86_avx2_psll_q: 2628 case Intrinsic::x86_avx2_psll_w: 2629 case Intrinsic::x86_avx512_psll_d_512: 2630 case Intrinsic::x86_avx512_psll_q_512: 2631 case Intrinsic::x86_avx512_psll_w_512: { 2632 if (Value *V = simplifyX86immShift(*II, *Builder)) 2633 return replaceInstUsesWith(*II, V); 2634 2635 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector 2636 // operand to compute the shift amount. 
2637 Value *Arg1 = II->getArgOperand(1); 2638 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && 2639 "Unexpected packed shift size"); 2640 unsigned VWidth = Arg1->getType()->getVectorNumElements(); 2641 2642 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { 2643 II->setArgOperand(1, V); 2644 return II; 2645 } 2646 break; 2647 } 2648 2649 case Intrinsic::x86_avx2_psllv_d: 2650 case Intrinsic::x86_avx2_psllv_d_256: 2651 case Intrinsic::x86_avx2_psllv_q: 2652 case Intrinsic::x86_avx2_psllv_q_256: 2653 case Intrinsic::x86_avx512_psllv_d_512: 2654 case Intrinsic::x86_avx512_psllv_q_512: 2655 case Intrinsic::x86_avx512_psllv_w_128: 2656 case Intrinsic::x86_avx512_psllv_w_256: 2657 case Intrinsic::x86_avx512_psllv_w_512: 2658 case Intrinsic::x86_avx2_psrav_d: 2659 case Intrinsic::x86_avx2_psrav_d_256: 2660 case Intrinsic::x86_avx512_psrav_q_128: 2661 case Intrinsic::x86_avx512_psrav_q_256: 2662 case Intrinsic::x86_avx512_psrav_d_512: 2663 case Intrinsic::x86_avx512_psrav_q_512: 2664 case Intrinsic::x86_avx512_psrav_w_128: 2665 case Intrinsic::x86_avx512_psrav_w_256: 2666 case Intrinsic::x86_avx512_psrav_w_512: 2667 case Intrinsic::x86_avx2_psrlv_d: 2668 case Intrinsic::x86_avx2_psrlv_d_256: 2669 case Intrinsic::x86_avx2_psrlv_q: 2670 case Intrinsic::x86_avx2_psrlv_q_256: 2671 case Intrinsic::x86_avx512_psrlv_d_512: 2672 case Intrinsic::x86_avx512_psrlv_q_512: 2673 case Intrinsic::x86_avx512_psrlv_w_128: 2674 case Intrinsic::x86_avx512_psrlv_w_256: 2675 case Intrinsic::x86_avx512_psrlv_w_512: 2676 if (Value *V = simplifyX86varShift(*II, *Builder)) 2677 return replaceInstUsesWith(*II, V); 2678 break; 2679 2680 case Intrinsic::x86_sse2_pmulu_dq: 2681 case Intrinsic::x86_sse41_pmuldq: 2682 case Intrinsic::x86_avx2_pmul_dq: 2683 case Intrinsic::x86_avx2_pmulu_dq: 2684 case Intrinsic::x86_avx512_pmul_dq_512: 2685 case Intrinsic::x86_avx512_pmulu_dq_512: { 2686 if (Value *V = simplifyX86muldq(*II, *Builder)) 2687 return replaceInstUsesWith(*II, 
V); 2688 2689 unsigned VWidth = II->getType()->getVectorNumElements(); 2690 APInt UndefElts(VWidth, 0); 2691 APInt DemandedElts = APInt::getAllOnesValue(VWidth); 2692 if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) { 2693 if (V != II) 2694 return replaceInstUsesWith(*II, V); 2695 return II; 2696 } 2697 break; 2698 } 2699 2700 case Intrinsic::x86_sse2_packssdw_128: 2701 case Intrinsic::x86_sse2_packsswb_128: 2702 case Intrinsic::x86_avx2_packssdw: 2703 case Intrinsic::x86_avx2_packsswb: 2704 case Intrinsic::x86_avx512_packssdw_512: 2705 case Intrinsic::x86_avx512_packsswb_512: 2706 if (Value *V = simplifyX86pack(*II, *this, *Builder, true)) 2707 return replaceInstUsesWith(*II, V); 2708 break; 2709 2710 case Intrinsic::x86_sse2_packuswb_128: 2711 case Intrinsic::x86_sse41_packusdw: 2712 case Intrinsic::x86_avx2_packusdw: 2713 case Intrinsic::x86_avx2_packuswb: 2714 case Intrinsic::x86_avx512_packusdw_512: 2715 case Intrinsic::x86_avx512_packuswb_512: 2716 if (Value *V = simplifyX86pack(*II, *this, *Builder, false)) 2717 return replaceInstUsesWith(*II, V); 2718 break; 2719 2720 case Intrinsic::x86_pclmulqdq: { 2721 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) { 2722 unsigned Imm = C->getZExtValue(); 2723 2724 bool MadeChange = false; 2725 Value *Arg0 = II->getArgOperand(0); 2726 Value *Arg1 = II->getArgOperand(1); 2727 unsigned VWidth = Arg0->getType()->getVectorNumElements(); 2728 APInt DemandedElts(VWidth, 0); 2729 2730 APInt UndefElts1(VWidth, 0); 2731 DemandedElts = (Imm & 0x01) ? 2 : 1; 2732 if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts, 2733 UndefElts1)) { 2734 II->setArgOperand(0, V); 2735 MadeChange = true; 2736 } 2737 2738 APInt UndefElts2(VWidth, 0); 2739 DemandedElts = (Imm & 0x10) ? 2 : 1; 2740 if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts, 2741 UndefElts2)) { 2742 II->setArgOperand(1, V); 2743 MadeChange = true; 2744 } 2745 2746 // If both input elements are undef, the result is undef. 
2747 if (UndefElts1[(Imm & 0x01) ? 1 : 0] || 2748 UndefElts2[(Imm & 0x10) ? 1 : 0]) 2749 return replaceInstUsesWith(*II, 2750 ConstantAggregateZero::get(II->getType())); 2751 2752 if (MadeChange) 2753 return II; 2754 } 2755 break; 2756 } 2757 2758 case Intrinsic::x86_sse41_insertps: 2759 if (Value *V = simplifyX86insertps(*II, *Builder)) 2760 return replaceInstUsesWith(*II, V); 2761 break; 2762 2763 case Intrinsic::x86_sse4a_extrq: { 2764 Value *Op0 = II->getArgOperand(0); 2765 Value *Op1 = II->getArgOperand(1); 2766 unsigned VWidth0 = Op0->getType()->getVectorNumElements(); 2767 unsigned VWidth1 = Op1->getType()->getVectorNumElements(); 2768 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2769 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2770 VWidth1 == 16 && "Unexpected operand sizes"); 2771 2772 // See if we're dealing with constant values. 2773 Constant *C1 = dyn_cast<Constant>(Op1); 2774 ConstantInt *CILength = 2775 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 2776 : nullptr; 2777 ConstantInt *CIIndex = 2778 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2779 : nullptr; 2780 2781 // Attempt to simplify to a constant, shuffle vector or EXTRQI call. 2782 if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder)) 2783 return replaceInstUsesWith(*II, V); 2784 2785 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector 2786 // operands and the lowest 16-bits of the second. 2787 bool MadeChange = false; 2788 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2789 II->setArgOperand(0, V); 2790 MadeChange = true; 2791 } 2792 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { 2793 II->setArgOperand(1, V); 2794 MadeChange = true; 2795 } 2796 if (MadeChange) 2797 return II; 2798 break; 2799 } 2800 2801 case Intrinsic::x86_sse4a_extrqi: { 2802 // EXTRQI: Extract Length bits starting from Index. 
Zero pad the remaining 2803 // bits of the lower 64-bits. The upper 64-bits are undefined. 2804 Value *Op0 = II->getArgOperand(0); 2805 unsigned VWidth = Op0->getType()->getVectorNumElements(); 2806 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2807 "Unexpected operand size"); 2808 2809 // See if we're dealing with constant values. 2810 ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1)); 2811 ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2)); 2812 2813 // Attempt to simplify to a constant or shuffle vector. 2814 if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder)) 2815 return replaceInstUsesWith(*II, V); 2816 2817 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector 2818 // operand. 2819 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2820 II->setArgOperand(0, V); 2821 return II; 2822 } 2823 break; 2824 } 2825 2826 case Intrinsic::x86_sse4a_insertq: { 2827 Value *Op0 = II->getArgOperand(0); 2828 Value *Op1 = II->getArgOperand(1); 2829 unsigned VWidth = Op0->getType()->getVectorNumElements(); 2830 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2831 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2832 Op1->getType()->getVectorNumElements() == 2 && 2833 "Unexpected operand size"); 2834 2835 // See if we're dealing with constant values. 2836 Constant *C1 = dyn_cast<Constant>(Op1); 2837 ConstantInt *CI11 = 2838 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2839 : nullptr; 2840 2841 // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 2842 if (CI11) { 2843 const APInt &V11 = CI11->getValue(); 2844 APInt Len = V11.zextOrTrunc(6); 2845 APInt Idx = V11.lshr(8).zextOrTrunc(6); 2846 if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder)) 2847 return replaceInstUsesWith(*II, V); 2848 } 2849 2850 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector 2851 // operand. 
2852 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2853 II->setArgOperand(0, V); 2854 return II; 2855 } 2856 break; 2857 } 2858 2859 case Intrinsic::x86_sse4a_insertqi: { 2860 // INSERTQI: Extract lowest Length bits from lower half of second source and 2861 // insert over first source starting at Index bit. The upper 64-bits are 2862 // undefined. 2863 Value *Op0 = II->getArgOperand(0); 2864 Value *Op1 = II->getArgOperand(1); 2865 unsigned VWidth0 = Op0->getType()->getVectorNumElements(); 2866 unsigned VWidth1 = Op1->getType()->getVectorNumElements(); 2867 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2868 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2869 VWidth1 == 2 && "Unexpected operand sizes"); 2870 2871 // See if we're dealing with constant values. 2872 ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2)); 2873 ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3)); 2874 2875 // Attempt to simplify to a constant or shuffle vector. 2876 if (CILength && CIIndex) { 2877 APInt Len = CILength->getValue().zextOrTrunc(6); 2878 APInt Idx = CIIndex->getValue().zextOrTrunc(6); 2879 if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder)) 2880 return replaceInstUsesWith(*II, V); 2881 } 2882 2883 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector 2884 // operands. 
2885 bool MadeChange = false; 2886 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2887 II->setArgOperand(0, V); 2888 MadeChange = true; 2889 } 2890 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { 2891 II->setArgOperand(1, V); 2892 MadeChange = true; 2893 } 2894 if (MadeChange) 2895 return II; 2896 break; 2897 } 2898 2899 case Intrinsic::x86_sse41_pblendvb: 2900 case Intrinsic::x86_sse41_blendvps: 2901 case Intrinsic::x86_sse41_blendvpd: 2902 case Intrinsic::x86_avx_blendv_ps_256: 2903 case Intrinsic::x86_avx_blendv_pd_256: 2904 case Intrinsic::x86_avx2_pblendvb: { 2905 // Convert blendv* to vector selects if the mask is constant. 2906 // This optimization is convoluted because the intrinsic is defined as 2907 // getting a vector of floats or doubles for the ps and pd versions. 2908 // FIXME: That should be changed. 2909 2910 Value *Op0 = II->getArgOperand(0); 2911 Value *Op1 = II->getArgOperand(1); 2912 Value *Mask = II->getArgOperand(2); 2913 2914 // fold (blend A, A, Mask) -> A 2915 if (Op0 == Op1) 2916 return replaceInstUsesWith(CI, Op0); 2917 2918 // Zero Mask - select 1st argument. 2919 if (isa<ConstantAggregateZero>(Mask)) 2920 return replaceInstUsesWith(CI, Op0); 2921 2922 // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 
2923 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { 2924 Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); 2925 return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); 2926 } 2927 break; 2928 } 2929 2930 case Intrinsic::x86_ssse3_pshuf_b_128: 2931 case Intrinsic::x86_avx2_pshuf_b: 2932 case Intrinsic::x86_avx512_pshuf_b_512: 2933 if (Value *V = simplifyX86pshufb(*II, *Builder)) 2934 return replaceInstUsesWith(*II, V); 2935 break; 2936 2937 case Intrinsic::x86_avx_vpermilvar_ps: 2938 case Intrinsic::x86_avx_vpermilvar_ps_256: 2939 case Intrinsic::x86_avx512_vpermilvar_ps_512: 2940 case Intrinsic::x86_avx_vpermilvar_pd: 2941 case Intrinsic::x86_avx_vpermilvar_pd_256: 2942 case Intrinsic::x86_avx512_vpermilvar_pd_512: 2943 if (Value *V = simplifyX86vpermilvar(*II, *Builder)) 2944 return replaceInstUsesWith(*II, V); 2945 break; 2946 2947 case Intrinsic::x86_avx2_permd: 2948 case Intrinsic::x86_avx2_permps: 2949 if (Value *V = simplifyX86vpermv(*II, *Builder)) 2950 return replaceInstUsesWith(*II, V); 2951 break; 2952 2953 case Intrinsic::x86_avx512_mask_permvar_df_256: 2954 case Intrinsic::x86_avx512_mask_permvar_df_512: 2955 case Intrinsic::x86_avx512_mask_permvar_di_256: 2956 case Intrinsic::x86_avx512_mask_permvar_di_512: 2957 case Intrinsic::x86_avx512_mask_permvar_hi_128: 2958 case Intrinsic::x86_avx512_mask_permvar_hi_256: 2959 case Intrinsic::x86_avx512_mask_permvar_hi_512: 2960 case Intrinsic::x86_avx512_mask_permvar_qi_128: 2961 case Intrinsic::x86_avx512_mask_permvar_qi_256: 2962 case Intrinsic::x86_avx512_mask_permvar_qi_512: 2963 case Intrinsic::x86_avx512_mask_permvar_sf_256: 2964 case Intrinsic::x86_avx512_mask_permvar_sf_512: 2965 case Intrinsic::x86_avx512_mask_permvar_si_256: 2966 case Intrinsic::x86_avx512_mask_permvar_si_512: 2967 if (Value *V = simplifyX86vpermv(*II, *Builder)) { 2968 // We simplified the permuting, now create a select for the masking. 
2969 V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2), 2970 *Builder); 2971 return replaceInstUsesWith(*II, V); 2972 } 2973 break; 2974 2975 case Intrinsic::x86_avx_vperm2f128_pd_256: 2976 case Intrinsic::x86_avx_vperm2f128_ps_256: 2977 case Intrinsic::x86_avx_vperm2f128_si_256: 2978 case Intrinsic::x86_avx2_vperm2i128: 2979 if (Value *V = simplifyX86vperm2(*II, *Builder)) 2980 return replaceInstUsesWith(*II, V); 2981 break; 2982 2983 case Intrinsic::x86_avx_maskload_ps: 2984 case Intrinsic::x86_avx_maskload_pd: 2985 case Intrinsic::x86_avx_maskload_ps_256: 2986 case Intrinsic::x86_avx_maskload_pd_256: 2987 case Intrinsic::x86_avx2_maskload_d: 2988 case Intrinsic::x86_avx2_maskload_q: 2989 case Intrinsic::x86_avx2_maskload_d_256: 2990 case Intrinsic::x86_avx2_maskload_q_256: 2991 if (Instruction *I = simplifyX86MaskedLoad(*II, *this)) 2992 return I; 2993 break; 2994 2995 case Intrinsic::x86_sse2_maskmov_dqu: 2996 case Intrinsic::x86_avx_maskstore_ps: 2997 case Intrinsic::x86_avx_maskstore_pd: 2998 case Intrinsic::x86_avx_maskstore_ps_256: 2999 case Intrinsic::x86_avx_maskstore_pd_256: 3000 case Intrinsic::x86_avx2_maskstore_d: 3001 case Intrinsic::x86_avx2_maskstore_q: 3002 case Intrinsic::x86_avx2_maskstore_d_256: 3003 case Intrinsic::x86_avx2_maskstore_q_256: 3004 if (simplifyX86MaskedStore(*II, *this)) 3005 return nullptr; 3006 break; 3007 3008 case Intrinsic::x86_xop_vpcomb: 3009 case Intrinsic::x86_xop_vpcomd: 3010 case Intrinsic::x86_xop_vpcomq: 3011 case Intrinsic::x86_xop_vpcomw: 3012 if (Value *V = simplifyX86vpcom(*II, *Builder, true)) 3013 return replaceInstUsesWith(*II, V); 3014 break; 3015 3016 case Intrinsic::x86_xop_vpcomub: 3017 case Intrinsic::x86_xop_vpcomud: 3018 case Intrinsic::x86_xop_vpcomuq: 3019 case Intrinsic::x86_xop_vpcomuw: 3020 if (Value *V = simplifyX86vpcom(*II, *Builder, false)) 3021 return replaceInstUsesWith(*II, V); 3022 break; 3023 3024 case Intrinsic::ppc_altivec_vperm: 3025 // Turn vperm(V1,V2,mask) -> 
shuffle(V1,V2,mask) if mask is a constant. 3026 // Note that ppc_altivec_vperm has a big-endian bias, so when creating 3027 // a vectorshuffle for little endian, we must undo the transformation 3028 // performed on vec_perm in altivec.h. That is, we must complement 3029 // the permutation mask with respect to 31 and reverse the order of 3030 // V1 and V2. 3031 if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) { 3032 assert(Mask->getType()->getVectorNumElements() == 16 && 3033 "Bad type for intrinsic!"); 3034 3035 // Check that all of the elements are integer constants or undefs. 3036 bool AllEltsOk = true; 3037 for (unsigned i = 0; i != 16; ++i) { 3038 Constant *Elt = Mask->getAggregateElement(i); 3039 if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) { 3040 AllEltsOk = false; 3041 break; 3042 } 3043 } 3044 3045 if (AllEltsOk) { 3046 // Cast the input vectors to byte vectors. 3047 Value *Op0 = Builder->CreateBitCast(II->getArgOperand(0), 3048 Mask->getType()); 3049 Value *Op1 = Builder->CreateBitCast(II->getArgOperand(1), 3050 Mask->getType()); 3051 Value *Result = UndefValue::get(Op0->getType()); 3052 3053 // Only extract each element once. 3054 Value *ExtractedElts[32]; 3055 memset(ExtractedElts, 0, sizeof(ExtractedElts)); 3056 3057 for (unsigned i = 0; i != 16; ++i) { 3058 if (isa<UndefValue>(Mask->getAggregateElement(i))) 3059 continue; 3060 unsigned Idx = 3061 cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue(); 3062 Idx &= 31; // Match the hardware behavior. 3063 if (DL.isLittleEndian()) 3064 Idx = 31 - Idx; 3065 3066 if (!ExtractedElts[Idx]) { 3067 Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0; 3068 Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1; 3069 ExtractedElts[Idx] = 3070 Builder->CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse, 3071 Builder->getInt32(Idx&15)); 3072 } 3073 3074 // Insert this value into the result vector. 
3075 Result = Builder->CreateInsertElement(Result, ExtractedElts[Idx], 3076 Builder->getInt32(i)); 3077 } 3078 return CastInst::Create(Instruction::BitCast, Result, CI.getType()); 3079 } 3080 } 3081 break; 3082 3083 case Intrinsic::arm_neon_vld1: 3084 case Intrinsic::arm_neon_vld2: 3085 case Intrinsic::arm_neon_vld3: 3086 case Intrinsic::arm_neon_vld4: 3087 case Intrinsic::arm_neon_vld2lane: 3088 case Intrinsic::arm_neon_vld3lane: 3089 case Intrinsic::arm_neon_vld4lane: 3090 case Intrinsic::arm_neon_vst1: 3091 case Intrinsic::arm_neon_vst2: 3092 case Intrinsic::arm_neon_vst3: 3093 case Intrinsic::arm_neon_vst4: 3094 case Intrinsic::arm_neon_vst2lane: 3095 case Intrinsic::arm_neon_vst3lane: 3096 case Intrinsic::arm_neon_vst4lane: { 3097 unsigned MemAlign = 3098 getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT); 3099 unsigned AlignArg = II->getNumArgOperands() - 1; 3100 ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg)); 3101 if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) { 3102 II->setArgOperand(AlignArg, 3103 ConstantInt::get(Type::getInt32Ty(II->getContext()), 3104 MemAlign, false)); 3105 return II; 3106 } 3107 break; 3108 } 3109 3110 case Intrinsic::arm_neon_vmulls: 3111 case Intrinsic::arm_neon_vmullu: 3112 case Intrinsic::aarch64_neon_smull: 3113 case Intrinsic::aarch64_neon_umull: { 3114 Value *Arg0 = II->getArgOperand(0); 3115 Value *Arg1 = II->getArgOperand(1); 3116 3117 // Handle mul by zero first: 3118 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) { 3119 return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType())); 3120 } 3121 3122 // Check for constant LHS & RHS - in this case we just simplify. 
3123 bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu || 3124 II->getIntrinsicID() == Intrinsic::aarch64_neon_umull); 3125 VectorType *NewVT = cast<VectorType>(II->getType()); 3126 if (Constant *CV0 = dyn_cast<Constant>(Arg0)) { 3127 if (Constant *CV1 = dyn_cast<Constant>(Arg1)) { 3128 CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext); 3129 CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext); 3130 3131 return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1)); 3132 } 3133 3134 // Couldn't simplify - canonicalize constant to the RHS. 3135 std::swap(Arg0, Arg1); 3136 } 3137 3138 // Handle mul by one: 3139 if (Constant *CV1 = dyn_cast<Constant>(Arg1)) 3140 if (ConstantInt *Splat = 3141 dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) 3142 if (Splat->isOne()) 3143 return CastInst::CreateIntegerCast(Arg0, II->getType(), 3144 /*isSigned=*/!Zext); 3145 3146 break; 3147 } 3148 case Intrinsic::amdgcn_rcp: { 3149 Value *Src = II->getArgOperand(0); 3150 3151 // TODO: Move to ConstantFolding/InstSimplify? 3152 if (isa<UndefValue>(Src)) 3153 return replaceInstUsesWith(CI, Src); 3154 3155 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { 3156 const APFloat &ArgVal = C->getValueAPF(); 3157 APFloat Val(ArgVal.getSemantics(), 1.0); 3158 APFloat::opStatus Status = Val.divide(ArgVal, 3159 APFloat::rmNearestTiesToEven); 3160 // Only do this if it was exact and therefore not dependent on the 3161 // rounding mode. 3162 if (Status == APFloat::opOK) 3163 return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val)); 3164 } 3165 3166 break; 3167 } 3168 case Intrinsic::amdgcn_rsq: { 3169 Value *Src = II->getArgOperand(0); 3170 3171 // TODO: Move to ConstantFolding/InstSimplify? 
3172 if (isa<UndefValue>(Src)) 3173 return replaceInstUsesWith(CI, Src); 3174 break; 3175 } 3176 case Intrinsic::amdgcn_frexp_mant: 3177 case Intrinsic::amdgcn_frexp_exp: { 3178 Value *Src = II->getArgOperand(0); 3179 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { 3180 int Exp; 3181 APFloat Significand = frexp(C->getValueAPF(), Exp, 3182 APFloat::rmNearestTiesToEven); 3183 3184 if (II->getIntrinsicID() == Intrinsic::amdgcn_frexp_mant) { 3185 return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), 3186 Significand)); 3187 } 3188 3189 // Match instruction special case behavior. 3190 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf) 3191 Exp = 0; 3192 3193 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp)); 3194 } 3195 3196 if (isa<UndefValue>(Src)) 3197 return replaceInstUsesWith(CI, UndefValue::get(II->getType())); 3198 3199 break; 3200 } 3201 case Intrinsic::amdgcn_class: { 3202 enum { 3203 S_NAN = 1 << 0, // Signaling NaN 3204 Q_NAN = 1 << 1, // Quiet NaN 3205 N_INFINITY = 1 << 2, // Negative infinity 3206 N_NORMAL = 1 << 3, // Negative normal 3207 N_SUBNORMAL = 1 << 4, // Negative subnormal 3208 N_ZERO = 1 << 5, // Negative zero 3209 P_ZERO = 1 << 6, // Positive zero 3210 P_SUBNORMAL = 1 << 7, // Positive subnormal 3211 P_NORMAL = 1 << 8, // Positive normal 3212 P_INFINITY = 1 << 9 // Positive infinity 3213 }; 3214 3215 const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL | 3216 N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY; 3217 3218 Value *Src0 = II->getArgOperand(0); 3219 Value *Src1 = II->getArgOperand(1); 3220 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1); 3221 if (!CMask) { 3222 if (isa<UndefValue>(Src0)) 3223 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3224 3225 if (isa<UndefValue>(Src1)) 3226 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); 3227 break; 3228 } 3229 3230 uint32_t Mask = CMask->getZExtValue(); 3231 3232 // If all 
tests are made, it doesn't matter what the value is. 3233 if ((Mask & FullMask) == FullMask) 3234 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true)); 3235 3236 if ((Mask & FullMask) == 0) 3237 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); 3238 3239 if (Mask == (S_NAN | Q_NAN)) { 3240 // Equivalent of isnan. Replace with standard fcmp. 3241 Value *FCmp = Builder->CreateFCmpUNO(Src0, Src0); 3242 FCmp->takeName(II); 3243 return replaceInstUsesWith(*II, FCmp); 3244 } 3245 3246 const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0); 3247 if (!CVal) { 3248 if (isa<UndefValue>(Src0)) 3249 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3250 3251 // Clamp mask to used bits 3252 if ((Mask & FullMask) != Mask) { 3253 CallInst *NewCall = Builder->CreateCall(II->getCalledFunction(), 3254 { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) } 3255 ); 3256 3257 NewCall->takeName(II); 3258 return replaceInstUsesWith(*II, NewCall); 3259 } 3260 3261 break; 3262 } 3263 3264 const APFloat &Val = CVal->getValueAPF(); 3265 3266 bool Result = 3267 ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) || 3268 ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) || 3269 ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) || 3270 ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) || 3271 ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) || 3272 ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) || 3273 ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) || 3274 ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) || 3275 ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) || 3276 ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative()); 3277 3278 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result)); 3279 } 3280 case Intrinsic::amdgcn_cvt_pkrtz: { 3281 Value *Src0 = II->getArgOperand(0); 3282 Value *Src1 = II->getArgOperand(1); 3283 if (const 
ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { 3284 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { 3285 const fltSemantics &HalfSem 3286 = II->getType()->getScalarType()->getFltSemantics(); 3287 bool LosesInfo; 3288 APFloat Val0 = C0->getValueAPF(); 3289 APFloat Val1 = C1->getValueAPF(); 3290 Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); 3291 Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); 3292 3293 Constant *Folded = ConstantVector::get({ 3294 ConstantFP::get(II->getContext(), Val0), 3295 ConstantFP::get(II->getContext(), Val1) }); 3296 return replaceInstUsesWith(*II, Folded); 3297 } 3298 } 3299 3300 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) 3301 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3302 3303 break; 3304 } 3305 case Intrinsic::amdgcn_ubfe: 3306 case Intrinsic::amdgcn_sbfe: { 3307 // Decompose simple cases into standard shifts. 3308 Value *Src = II->getArgOperand(0); 3309 if (isa<UndefValue>(Src)) 3310 return replaceInstUsesWith(*II, Src); 3311 3312 unsigned Width; 3313 Type *Ty = II->getType(); 3314 unsigned IntSize = Ty->getIntegerBitWidth(); 3315 3316 ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3317 if (CWidth) { 3318 Width = CWidth->getZExtValue(); 3319 if ((Width & (IntSize - 1)) == 0) 3320 return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty)); 3321 3322 if (Width >= IntSize) { 3323 // Hardware ignores high bits, so remove those. 
3324 II->setArgOperand(2, ConstantInt::get(CWidth->getType(), 3325 Width & (IntSize - 1))); 3326 return II; 3327 } 3328 } 3329 3330 unsigned Offset; 3331 ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1)); 3332 if (COffset) { 3333 Offset = COffset->getZExtValue(); 3334 if (Offset >= IntSize) { 3335 II->setArgOperand(1, ConstantInt::get(COffset->getType(), 3336 Offset & (IntSize - 1))); 3337 return II; 3338 } 3339 } 3340 3341 bool Signed = II->getIntrinsicID() == Intrinsic::amdgcn_sbfe; 3342 3343 // TODO: Also emit sub if only width is constant. 3344 if (!CWidth && COffset && Offset == 0) { 3345 Constant *KSize = ConstantInt::get(COffset->getType(), IntSize); 3346 Value *ShiftVal = Builder->CreateSub(KSize, II->getArgOperand(2)); 3347 ShiftVal = Builder->CreateZExt(ShiftVal, II->getType()); 3348 3349 Value *Shl = Builder->CreateShl(Src, ShiftVal); 3350 Value *RightShift = Signed ? 3351 Builder->CreateAShr(Shl, ShiftVal) : 3352 Builder->CreateLShr(Shl, ShiftVal); 3353 RightShift->takeName(II); 3354 return replaceInstUsesWith(*II, RightShift); 3355 } 3356 3357 if (!CWidth || !COffset) 3358 break; 3359 3360 // TODO: This allows folding to undef when the hardware has specific 3361 // behavior? 3362 if (Offset + Width < IntSize) { 3363 Value *Shl = Builder->CreateShl(Src, IntSize - Offset - Width); 3364 Value *RightShift = Signed ? 3365 Builder->CreateAShr(Shl, IntSize - Width) : 3366 Builder->CreateLShr(Shl, IntSize - Width); 3367 RightShift->takeName(II); 3368 return replaceInstUsesWith(*II, RightShift); 3369 } 3370 3371 Value *RightShift = Signed ? 3372 Builder->CreateAShr(Src, Offset) : 3373 Builder->CreateLShr(Src, Offset); 3374 3375 RightShift->takeName(II); 3376 return replaceInstUsesWith(*II, RightShift); 3377 } 3378 case Intrinsic::amdgcn_exp: 3379 case Intrinsic::amdgcn_exp_compr: { 3380 ConstantInt *En = dyn_cast<ConstantInt>(II->getArgOperand(1)); 3381 if (!En) // Illegal. 
3382 break; 3383 3384 unsigned EnBits = En->getZExtValue(); 3385 if (EnBits == 0xf) 3386 break; // All inputs enabled. 3387 3388 bool IsCompr = II->getIntrinsicID() == Intrinsic::amdgcn_exp_compr; 3389 bool Changed = false; 3390 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) { 3391 if ((!IsCompr && (EnBits & (1 << I)) == 0) || 3392 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) { 3393 Value *Src = II->getArgOperand(I + 2); 3394 if (!isa<UndefValue>(Src)) { 3395 II->setArgOperand(I + 2, UndefValue::get(Src->getType())); 3396 Changed = true; 3397 } 3398 } 3399 } 3400 3401 if (Changed) 3402 return II; 3403 3404 break; 3405 3406 } 3407 case Intrinsic::amdgcn_fmed3: { 3408 // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled 3409 // for the shader. 3410 3411 Value *Src0 = II->getArgOperand(0); 3412 Value *Src1 = II->getArgOperand(1); 3413 Value *Src2 = II->getArgOperand(2); 3414 3415 bool Swap = false; 3416 // Canonicalize constants to RHS operands. 3417 // 3418 // fmed3(c0, x, c1) -> fmed3(x, c0, c1) 3419 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 3420 std::swap(Src0, Src1); 3421 Swap = true; 3422 } 3423 3424 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) { 3425 std::swap(Src1, Src2); 3426 Swap = true; 3427 } 3428 3429 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 3430 std::swap(Src0, Src1); 3431 Swap = true; 3432 } 3433 3434 if (Swap) { 3435 II->setArgOperand(0, Src0); 3436 II->setArgOperand(1, Src1); 3437 II->setArgOperand(2, Src2); 3438 return II; 3439 } 3440 3441 if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) { 3442 CallInst *NewCall = Builder->CreateMinNum(Src0, Src1); 3443 NewCall->copyFastMathFlags(II); 3444 NewCall->takeName(II); 3445 return replaceInstUsesWith(*II, NewCall); 3446 } 3447 3448 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { 3449 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { 3450 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) { 3451 APFloat Result = 
fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(), 3452 C2->getValueAPF()); 3453 return replaceInstUsesWith(*II, 3454 ConstantFP::get(Builder->getContext(), Result)); 3455 } 3456 } 3457 } 3458 3459 break; 3460 } 3461 case Intrinsic::amdgcn_icmp: 3462 case Intrinsic::amdgcn_fcmp: { 3463 const ConstantInt *CC = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3464 if (!CC) 3465 break; 3466 3467 // Guard against invalid arguments. 3468 int64_t CCVal = CC->getZExtValue(); 3469 bool IsInteger = II->getIntrinsicID() == Intrinsic::amdgcn_icmp; 3470 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE || 3471 CCVal > CmpInst::LAST_ICMP_PREDICATE)) || 3472 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE || 3473 CCVal > CmpInst::LAST_FCMP_PREDICATE))) 3474 break; 3475 3476 Value *Src0 = II->getArgOperand(0); 3477 Value *Src1 = II->getArgOperand(1); 3478 3479 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) { 3480 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) { 3481 Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1); 3482 if (CCmp->isNullValue()) { 3483 return replaceInstUsesWith( 3484 *II, ConstantExpr::getSExt(CCmp, II->getType())); 3485 } 3486 3487 // The result of V_ICMP/V_FCMP assembly instructions (which this 3488 // intrinsic exposes) is one bit per thread, masked with the EXEC 3489 // register (which contains the bitmask of live threads). So a 3490 // comparison that always returns true is the same as a read of the 3491 // EXEC register. 
3492 Value *NewF = Intrinsic::getDeclaration( 3493 II->getModule(), Intrinsic::read_register, II->getType()); 3494 Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")}; 3495 MDNode *MD = MDNode::get(II->getContext(), MDArgs); 3496 Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)}; 3497 CallInst *NewCall = Builder->CreateCall(NewF, Args); 3498 NewCall->addAttribute(AttributeList::FunctionIndex, 3499 Attribute::Convergent); 3500 NewCall->takeName(II); 3501 return replaceInstUsesWith(*II, NewCall); 3502 } 3503 3504 // Canonicalize constants to RHS. 3505 CmpInst::Predicate SwapPred 3506 = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal)); 3507 II->setArgOperand(0, Src1); 3508 II->setArgOperand(1, Src0); 3509 II->setArgOperand(2, ConstantInt::get(CC->getType(), 3510 static_cast<int>(SwapPred))); 3511 return II; 3512 } 3513 3514 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE) 3515 break; 3516 3517 // Canonicalize compare eq with true value to compare != 0 3518 // llvm.amdgcn.icmp(zext (i1 x), 1, eq) 3519 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne) 3520 // llvm.amdgcn.icmp(sext (i1 x), -1, eq) 3521 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne) 3522 Value *ExtSrc; 3523 if (CCVal == CmpInst::ICMP_EQ && 3524 ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) || 3525 (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) && 3526 ExtSrc->getType()->isIntegerTy(1)) { 3527 II->setArgOperand(1, ConstantInt::getNullValue(Src1->getType())); 3528 II->setArgOperand(2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE)); 3529 return II; 3530 } 3531 3532 CmpInst::Predicate SrcPred; 3533 Value *SrcLHS; 3534 Value *SrcRHS; 3535 3536 // Fold compare eq/ne with 0 from a compare result as the predicate to the 3537 // intrinsic. The typical use is a wave vote function in the library, which 3538 // will be fed from a user code condition compared with 0. Fold in the 3539 // redundant compare. 
3540 3541 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne) 3542 // -> llvm.amdgcn.[if]cmp(a, b, pred) 3543 // 3544 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq) 3545 // -> llvm.amdgcn.[if]cmp(a, b, inv pred) 3546 if (match(Src1, m_Zero()) && 3547 match(Src0, 3548 m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) { 3549 if (CCVal == CmpInst::ICMP_EQ) 3550 SrcPred = CmpInst::getInversePredicate(SrcPred); 3551 3552 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ? 3553 Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp; 3554 3555 Value *NewF = Intrinsic::getDeclaration(II->getModule(), NewIID, 3556 SrcLHS->getType()); 3557 Value *Args[] = { SrcLHS, SrcRHS, 3558 ConstantInt::get(CC->getType(), SrcPred) }; 3559 CallInst *NewCall = Builder->CreateCall(NewF, Args); 3560 NewCall->takeName(II); 3561 return replaceInstUsesWith(*II, NewCall); 3562 } 3563 3564 break; 3565 } 3566 case Intrinsic::stackrestore: { 3567 // If the save is right next to the restore, remove the restore. This can 3568 // happen when variable allocas are DCE'd. 3569 if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { 3570 if (SS->getIntrinsicID() == Intrinsic::stacksave) { 3571 if (&*++SS->getIterator() == II) 3572 return eraseInstFromFunction(CI); 3573 } 3574 } 3575 3576 // Scan down this block to see if there is another stack restore in the 3577 // same block without an intervening call/alloca. 3578 BasicBlock::iterator BI(II); 3579 TerminatorInst *TI = II->getParent()->getTerminator(); 3580 bool CannotRemove = false; 3581 for (++BI; &*BI != TI; ++BI) { 3582 if (isa<AllocaInst>(BI)) { 3583 CannotRemove = true; 3584 break; 3585 } 3586 if (CallInst *BCI = dyn_cast<CallInst>(BI)) { 3587 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(BCI)) { 3588 // If there is a stackrestore below this one, remove this one. 
3589 if (II->getIntrinsicID() == Intrinsic::stackrestore) 3590 return eraseInstFromFunction(CI); 3591 3592 // Bail if we cross over an intrinsic with side effects, such as 3593 // llvm.stacksave, llvm.read_register, or llvm.setjmp. 3594 if (II->mayHaveSideEffects()) { 3595 CannotRemove = true; 3596 break; 3597 } 3598 } else { 3599 // If we found a non-intrinsic call, we can't remove the stack 3600 // restore. 3601 CannotRemove = true; 3602 break; 3603 } 3604 } 3605 } 3606 3607 // If the stack restore is in a return, resume, or unwind block and if there 3608 // are no allocas or calls between the restore and the return, nuke the 3609 // restore. 3610 if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI))) 3611 return eraseInstFromFunction(CI); 3612 break; 3613 } 3614 case Intrinsic::lifetime_start: 3615 // Asan needs to poison memory to detect invalid access which is possible 3616 // even for empty lifetime range. 3617 if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress)) 3618 break; 3619 3620 if (removeTriviallyEmptyRange(*II, Intrinsic::lifetime_start, 3621 Intrinsic::lifetime_end, *this)) 3622 return nullptr; 3623 break; 3624 case Intrinsic::assume: { 3625 Value *IIOperand = II->getArgOperand(0); 3626 // Remove an assume if it is immediately followed by an identical assume. 3627 if (match(II->getNextNode(), 3628 m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand)))) 3629 return eraseInstFromFunction(CI); 3630 3631 // Canonicalize assume(a && b) -> assume(a); assume(b); 3632 // Note: New assumption intrinsics created here are registered by 3633 // the InstCombineIRInserter object. 
3634 Value *AssumeIntrinsic = II->getCalledValue(), *A, *B; 3635 if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) { 3636 Builder->CreateCall(AssumeIntrinsic, A, II->getName()); 3637 Builder->CreateCall(AssumeIntrinsic, B, II->getName()); 3638 return eraseInstFromFunction(*II); 3639 } 3640 // assume(!(a || b)) -> assume(!a); assume(!b); 3641 if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) { 3642 Builder->CreateCall(AssumeIntrinsic, Builder->CreateNot(A), 3643 II->getName()); 3644 Builder->CreateCall(AssumeIntrinsic, Builder->CreateNot(B), 3645 II->getName()); 3646 return eraseInstFromFunction(*II); 3647 } 3648 3649 // assume( (load addr) != null ) -> add 'nonnull' metadata to load 3650 // (if assume is valid at the load) 3651 CmpInst::Predicate Pred; 3652 Instruction *LHS; 3653 if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) && 3654 Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load && 3655 LHS->getType()->isPointerTy() && 3656 isValidAssumeForContext(II, LHS, &DT)) { 3657 MDNode *MD = MDNode::get(II->getContext(), None); 3658 LHS->setMetadata(LLVMContext::MD_nonnull, MD); 3659 return eraseInstFromFunction(*II); 3660 3661 // TODO: apply nonnull return attributes to calls and invokes 3662 // TODO: apply range metadata for range check patterns? 3663 } 3664 3665 // If there is a dominating assume with the same condition as this one, 3666 // then this one is redundant, and should be removed. 3667 KnownBits Known(1); 3668 computeKnownBits(IIOperand, Known, 0, II); 3669 if (Known.isAllOnes()) 3670 return eraseInstFromFunction(*II); 3671 3672 // Update the cache of affected values for this assumption (we might be 3673 // here because we just simplified the condition). 
3674 AC.updateAffectedValues(II); 3675 break; 3676 } 3677 case Intrinsic::experimental_gc_relocate: { 3678 // Translate facts known about a pointer before relocating into 3679 // facts about the relocate value, while being careful to 3680 // preserve relocation semantics. 3681 Value *DerivedPtr = cast<GCRelocateInst>(II)->getDerivedPtr(); 3682 3683 // Remove the relocation if unused, note that this check is required 3684 // to prevent the cases below from looping forever. 3685 if (II->use_empty()) 3686 return eraseInstFromFunction(*II); 3687 3688 // Undef is undef, even after relocation. 3689 // TODO: provide a hook for this in GCStrategy. This is clearly legal for 3690 // most practical collectors, but there was discussion in the review thread 3691 // about whether it was legal for all possible collectors. 3692 if (isa<UndefValue>(DerivedPtr)) 3693 // Use undef of gc_relocate's type to replace it. 3694 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3695 3696 if (auto *PT = dyn_cast<PointerType>(II->getType())) { 3697 // The relocation of null will be null for most any collector. 3698 // TODO: provide a hook for this in GCStrategy. There might be some 3699 // weird collector this property does not hold for. 3700 if (isa<ConstantPointerNull>(DerivedPtr)) 3701 // Use null-pointer of gc_relocate's type to replace it. 3702 return replaceInstUsesWith(*II, ConstantPointerNull::get(PT)); 3703 3704 // isKnownNonNull -> nonnull attribute 3705 if (isKnownNonNullAt(DerivedPtr, II, &DT)) 3706 II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); 3707 } 3708 3709 // TODO: bitcast(relocate(p)) -> relocate(bitcast(p)) 3710 // Canonicalize on the type from the uses to the defs 3711 3712 // TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...) 3713 break; 3714 } 3715 3716 case Intrinsic::experimental_guard: { 3717 // Is this guard followed by another guard? 
3718 Instruction *NextInst = II->getNextNode(); 3719 Value *NextCond = nullptr; 3720 if (match(NextInst, 3721 m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) { 3722 Value *CurrCond = II->getArgOperand(0); 3723 3724 // Remove a guard that it is immediately preceded by an identical guard. 3725 if (CurrCond == NextCond) 3726 return eraseInstFromFunction(*NextInst); 3727 3728 // Otherwise canonicalize guard(a); guard(b) -> guard(a & b). 3729 II->setArgOperand(0, Builder->CreateAnd(CurrCond, NextCond)); 3730 return eraseInstFromFunction(*NextInst); 3731 } 3732 break; 3733 } 3734 } 3735 return visitCallSite(II); 3736 } 3737 3738 // Fence instruction simplification 3739 Instruction *InstCombiner::visitFenceInst(FenceInst &FI) { 3740 // Remove identical consecutive fences. 3741 if (auto *NFI = dyn_cast<FenceInst>(FI.getNextNode())) 3742 if (FI.isIdenticalTo(NFI)) 3743 return eraseInstFromFunction(FI); 3744 return nullptr; 3745 } 3746 3747 // InvokeInst simplification 3748 // 3749 Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) { 3750 return visitCallSite(&II); 3751 } 3752 3753 /// If this cast does not affect the value passed through the varargs area, we 3754 /// can eliminate the use of the cast. 3755 static bool isSafeToEliminateVarargsCast(const CallSite CS, 3756 const DataLayout &DL, 3757 const CastInst *const CI, 3758 const int ix) { 3759 if (!CI->isLosslessCast()) 3760 return false; 3761 3762 // If this is a GC intrinsic, avoid munging types. We need types for 3763 // statepoint reconstruction in SelectionDAG. 3764 // TODO: This is probably something which should be expanded to all 3765 // intrinsics since the entire point of intrinsics is that 3766 // they are understandable by the optimizer. 3767 if (isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS)) 3768 return false; 3769 3770 // The size of ByVal or InAlloca arguments is derived from the type, so we 3771 // can't change to a type with a different size. 
If the size were 3772 // passed explicitly we could avoid this check. 3773 if (!CS.isByValOrInAllocaArgument(ix)) 3774 return true; 3775 3776 Type* SrcTy = 3777 cast<PointerType>(CI->getOperand(0)->getType())->getElementType(); 3778 Type* DstTy = cast<PointerType>(CI->getType())->getElementType(); 3779 if (!SrcTy->isSized() || !DstTy->isSized()) 3780 return false; 3781 if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy)) 3782 return false; 3783 return true; 3784 } 3785 3786 Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) { 3787 if (!CI->getCalledFunction()) return nullptr; 3788 3789 auto InstCombineRAUW = [this](Instruction *From, Value *With) { 3790 replaceInstUsesWith(*From, With); 3791 }; 3792 LibCallSimplifier Simplifier(DL, &TLI, InstCombineRAUW); 3793 if (Value *With = Simplifier.optimizeCall(CI)) { 3794 ++NumSimplified; 3795 return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With); 3796 } 3797 3798 return nullptr; 3799 } 3800 3801 static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) { 3802 // Strip off at most one level of pointer casts, looking for an alloca. This 3803 // is good enough in practice and simpler than handling any number of casts. 3804 Value *Underlying = TrampMem->stripPointerCasts(); 3805 if (Underlying != TrampMem && 3806 (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem)) 3807 return nullptr; 3808 if (!isa<AllocaInst>(Underlying)) 3809 return nullptr; 3810 3811 IntrinsicInst *InitTrampoline = nullptr; 3812 for (User *U : TrampMem->users()) { 3813 IntrinsicInst *II = dyn_cast<IntrinsicInst>(U); 3814 if (!II) 3815 return nullptr; 3816 if (II->getIntrinsicID() == Intrinsic::init_trampoline) { 3817 if (InitTrampoline) 3818 // More than one init_trampoline writes to this value. Give up. 3819 return nullptr; 3820 InitTrampoline = II; 3821 continue; 3822 } 3823 if (II->getIntrinsicID() == Intrinsic::adjust_trampoline) 3824 // Allow any number of calls to adjust.trampoline. 
3825 continue; 3826 return nullptr; 3827 } 3828 3829 // No call to init.trampoline found. 3830 if (!InitTrampoline) 3831 return nullptr; 3832 3833 // Check that the alloca is being used in the expected way. 3834 if (InitTrampoline->getOperand(0) != TrampMem) 3835 return nullptr; 3836 3837 return InitTrampoline; 3838 } 3839 3840 static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp, 3841 Value *TrampMem) { 3842 // Visit all the previous instructions in the basic block, and try to find a 3843 // init.trampoline which has a direct path to the adjust.trampoline. 3844 for (BasicBlock::iterator I = AdjustTramp->getIterator(), 3845 E = AdjustTramp->getParent()->begin(); 3846 I != E;) { 3847 Instruction *Inst = &*--I; 3848 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) 3849 if (II->getIntrinsicID() == Intrinsic::init_trampoline && 3850 II->getOperand(0) == TrampMem) 3851 return II; 3852 if (Inst->mayWriteToMemory()) 3853 return nullptr; 3854 } 3855 return nullptr; 3856 } 3857 3858 // Given a call to llvm.adjust.trampoline, find and return the corresponding 3859 // call to llvm.init.trampoline if the call to the trampoline can be optimized 3860 // to a direct call to a function. Otherwise return NULL. 3861 // 3862 static IntrinsicInst *findInitTrampoline(Value *Callee) { 3863 Callee = Callee->stripPointerCasts(); 3864 IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee); 3865 if (!AdjustTramp || 3866 AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline) 3867 return nullptr; 3868 3869 Value *TrampMem = AdjustTramp->getOperand(0); 3870 3871 if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem)) 3872 return IT; 3873 if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem)) 3874 return IT; 3875 return nullptr; 3876 } 3877 3878 /// Improvements for call and invoke instructions. 
3879 Instruction *InstCombiner::visitCallSite(CallSite CS) { 3880 if (isAllocLikeFn(CS.getInstruction(), &TLI)) 3881 return visitAllocSite(*CS.getInstruction()); 3882 3883 bool Changed = false; 3884 3885 // Mark any parameters that are known to be non-null with the nonnull 3886 // attribute. This is helpful for inlining calls to functions with null 3887 // checks on their arguments. 3888 SmallVector<unsigned, 4> ArgNos; 3889 unsigned ArgNo = 0; 3890 3891 for (Value *V : CS.args()) { 3892 if (V->getType()->isPointerTy() && 3893 !CS.paramHasAttr(ArgNo, Attribute::NonNull) && 3894 isKnownNonNullAt(V, CS.getInstruction(), &DT)) 3895 ArgNos.push_back(ArgNo); 3896 ArgNo++; 3897 } 3898 3899 assert(ArgNo == CS.arg_size() && "sanity check"); 3900 3901 if (!ArgNos.empty()) { 3902 AttributeList AS = CS.getAttributes(); 3903 LLVMContext &Ctx = CS.getInstruction()->getContext(); 3904 AS = AS.addParamAttribute(Ctx, ArgNos, 3905 Attribute::get(Ctx, Attribute::NonNull)); 3906 CS.setAttributes(AS); 3907 Changed = true; 3908 } 3909 3910 // If the callee is a pointer to a function, attempt to move any casts to the 3911 // arguments of the call/invoke. 3912 Value *Callee = CS.getCalledValue(); 3913 if (!isa<Function>(Callee) && transformConstExprCastCall(CS)) 3914 return nullptr; 3915 3916 if (Function *CalleeF = dyn_cast<Function>(Callee)) { 3917 // Remove the convergent attr on calls when the callee is not convergent. 3918 if (CS.isConvergent() && !CalleeF->isConvergent() && 3919 !CalleeF->isIntrinsic()) { 3920 DEBUG(dbgs() << "Removing convergent attr from instr " 3921 << CS.getInstruction() << "\n"); 3922 CS.setNotConvergent(); 3923 return CS.getInstruction(); 3924 } 3925 3926 // If the call and callee calling conventions don't match, this call must 3927 // be unreachable, as the call is undefined. 3928 if (CalleeF->getCallingConv() != CS.getCallingConv() && 3929 // Only do this for calls to a function with a body. 
A prototype may 3930 // not actually end up matching the implementation's calling conv for a 3931 // variety of reasons (e.g. it may be written in assembly). 3932 !CalleeF->isDeclaration()) { 3933 Instruction *OldCall = CS.getInstruction(); 3934 new StoreInst(ConstantInt::getTrue(Callee->getContext()), 3935 UndefValue::get(Type::getInt1PtrTy(Callee->getContext())), 3936 OldCall); 3937 // If OldCall does not return void then replaceAllUsesWith undef. 3938 // This allows ValueHandlers and custom metadata to adjust itself. 3939 if (!OldCall->getType()->isVoidTy()) 3940 replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType())); 3941 if (isa<CallInst>(OldCall)) 3942 return eraseInstFromFunction(*OldCall); 3943 3944 // We cannot remove an invoke, because it would change the CFG, just 3945 // change the callee to a null pointer. 3946 cast<InvokeInst>(OldCall)->setCalledFunction( 3947 Constant::getNullValue(CalleeF->getType())); 3948 return nullptr; 3949 } 3950 } 3951 3952 if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { 3953 // If CS does not return void then replaceAllUsesWith undef. 3954 // This allows ValueHandlers and custom metadata to adjust itself. 3955 if (!CS.getInstruction()->getType()->isVoidTy()) 3956 replaceInstUsesWith(*CS.getInstruction(), 3957 UndefValue::get(CS.getInstruction()->getType())); 3958 3959 if (isa<InvokeInst>(CS.getInstruction())) { 3960 // Can't remove an invoke because we cannot change the CFG. 3961 return nullptr; 3962 } 3963 3964 // This instruction is not reachable, just remove it. We insert a store to 3965 // undef so that we know that this code is not reachable, despite the fact 3966 // that we can't modify the CFG here. 
3967 new StoreInst(ConstantInt::getTrue(Callee->getContext()), 3968 UndefValue::get(Type::getInt1PtrTy(Callee->getContext())), 3969 CS.getInstruction()); 3970 3971 return eraseInstFromFunction(*CS.getInstruction()); 3972 } 3973 3974 if (IntrinsicInst *II = findInitTrampoline(Callee)) 3975 return transformCallThroughTrampoline(CS, II); 3976 3977 PointerType *PTy = cast<PointerType>(Callee->getType()); 3978 FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); 3979 if (FTy->isVarArg()) { 3980 int ix = FTy->getNumParams(); 3981 // See if we can optimize any arguments passed through the varargs area of 3982 // the call. 3983 for (CallSite::arg_iterator I = CS.arg_begin() + FTy->getNumParams(), 3984 E = CS.arg_end(); I != E; ++I, ++ix) { 3985 CastInst *CI = dyn_cast<CastInst>(*I); 3986 if (CI && isSafeToEliminateVarargsCast(CS, DL, CI, ix)) { 3987 *I = CI->getOperand(0); 3988 Changed = true; 3989 } 3990 } 3991 } 3992 3993 if (isa<InlineAsm>(Callee) && !CS.doesNotThrow()) { 3994 // Inline asm calls cannot throw - mark them 'nounwind'. 3995 CS.setDoesNotThrow(); 3996 Changed = true; 3997 } 3998 3999 // Try to optimize the call if possible, we require DataLayout for most of 4000 // this. None of these calls are seen as possibly dead so go ahead and 4001 // delete the instruction now. 4002 if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) { 4003 Instruction *I = tryOptimizeCall(CI); 4004 // If we changed something return the result, etc. Otherwise let 4005 // the fallthrough check. 4006 if (I) return eraseInstFromFunction(*I); 4007 } 4008 4009 return Changed ? CS.getInstruction() : nullptr; 4010 } 4011 4012 /// If the callee is a constexpr cast of a function, attempt to move the cast to 4013 /// the arguments of the call/invoke. 
4014 bool InstCombiner::transformConstExprCastCall(CallSite CS) { 4015 auto *Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); 4016 if (!Callee) 4017 return false; 4018 4019 // The prototype of a thunk is a lie. Don't directly call such a function. 4020 if (Callee->hasFnAttribute("thunk")) 4021 return false; 4022 4023 Instruction *Caller = CS.getInstruction(); 4024 const AttributeList &CallerPAL = CS.getAttributes(); 4025 4026 // Okay, this is a cast from a function to a different type. Unless doing so 4027 // would cause a type conversion of one of our arguments, change this call to 4028 // be a direct call with arguments casted to the appropriate types. 4029 // 4030 FunctionType *FT = Callee->getFunctionType(); 4031 Type *OldRetTy = Caller->getType(); 4032 Type *NewRetTy = FT->getReturnType(); 4033 4034 // Check to see if we are changing the return type... 4035 if (OldRetTy != NewRetTy) { 4036 4037 if (NewRetTy->isStructTy()) 4038 return false; // TODO: Handle multiple return values. 4039 4040 if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) { 4041 if (Callee->isDeclaration()) 4042 return false; // Cannot transform this return value. 4043 4044 if (!Caller->use_empty() && 4045 // void -> non-void is handled specially 4046 !NewRetTy->isVoidTy()) 4047 return false; // Cannot transform this return value. 4048 } 4049 4050 if (!CallerPAL.isEmpty() && !Caller->use_empty()) { 4051 AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); 4052 if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy))) 4053 return false; // Attribute not compatible with transformed value. 4054 } 4055 4056 // If the callsite is an invoke instruction, and the return value is used by 4057 // a PHI node in a successor, we cannot change the return type of the call 4058 // because there is no place to put the cast instruction (without breaking 4059 // the critical edge). Bail out in this case. 
4060 if (!Caller->use_empty()) 4061 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) 4062 for (User *U : II->users()) 4063 if (PHINode *PN = dyn_cast<PHINode>(U)) 4064 if (PN->getParent() == II->getNormalDest() || 4065 PN->getParent() == II->getUnwindDest()) 4066 return false; 4067 } 4068 4069 unsigned NumActualArgs = CS.arg_size(); 4070 unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs); 4071 4072 // Prevent us turning: 4073 // declare void @takes_i32_inalloca(i32* inalloca) 4074 // call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0) 4075 // 4076 // into: 4077 // call void @takes_i32_inalloca(i32* null) 4078 // 4079 // Similarly, avoid folding away bitcasts of byval calls. 4080 if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) || 4081 Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal)) 4082 return false; 4083 4084 CallSite::arg_iterator AI = CS.arg_begin(); 4085 for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) { 4086 Type *ParamTy = FT->getParamType(i); 4087 Type *ActTy = (*AI)->getType(); 4088 4089 if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL)) 4090 return false; // Cannot transform this parameter value. 4091 4092 if (AttrBuilder(CallerPAL.getParamAttributes(i)) 4093 .overlaps(AttributeFuncs::typeIncompatible(ParamTy))) 4094 return false; // Attribute not compatible with transformed value. 4095 4096 if (CS.isInAllocaArgument(i)) 4097 return false; // Cannot transform to and from inalloca. 4098 4099 // If the parameter is passed as a byval argument, then we have to have a 4100 // sized type and the sized type has to have the same size as the old type. 
4101 if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) { 4102 PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy); 4103 if (!ParamPTy || !ParamPTy->getElementType()->isSized()) 4104 return false; 4105 4106 Type *CurElTy = ActTy->getPointerElementType(); 4107 if (DL.getTypeAllocSize(CurElTy) != 4108 DL.getTypeAllocSize(ParamPTy->getElementType())) 4109 return false; 4110 } 4111 } 4112 4113 if (Callee->isDeclaration()) { 4114 // Do not delete arguments unless we have a function body. 4115 if (FT->getNumParams() < NumActualArgs && !FT->isVarArg()) 4116 return false; 4117 4118 // If the callee is just a declaration, don't change the varargsness of the 4119 // call. We don't want to introduce a varargs call where one doesn't 4120 // already exist. 4121 PointerType *APTy = cast<PointerType>(CS.getCalledValue()->getType()); 4122 if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg()) 4123 return false; 4124 4125 // If both the callee and the cast type are varargs, we still have to make 4126 // sure the number of fixed parameters are the same or we have the same 4127 // ABI issues as if we introduce a varargs call. 4128 if (FT->isVarArg() && 4129 cast<FunctionType>(APTy->getElementType())->isVarArg() && 4130 FT->getNumParams() != 4131 cast<FunctionType>(APTy->getElementType())->getNumParams()) 4132 return false; 4133 } 4134 4135 if (FT->getNumParams() < NumActualArgs && FT->isVarArg() && 4136 !CallerPAL.isEmpty()) { 4137 // In this case we have more arguments than the new function type, but we 4138 // won't be dropping them. Check that these extra arguments have attributes 4139 // that are compatible with being a vararg call argument. 4140 unsigned SRetIdx; 4141 if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) && 4142 SRetIdx > FT->getNumParams()) 4143 return false; 4144 } 4145 4146 // Okay, we decided that this is a safe thing to do: go ahead and start 4147 // inserting cast instructions as necessary. 
4148 SmallVector<Value *, 8> Args; 4149 SmallVector<AttributeSet, 8> ArgAttrs; 4150 Args.reserve(NumActualArgs); 4151 ArgAttrs.reserve(NumActualArgs); 4152 4153 // Get any return attributes. 4154 AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); 4155 4156 // If the return value is not being used, the type may not be compatible 4157 // with the existing attributes. Wipe out any problematic attributes. 4158 RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy)); 4159 4160 AI = CS.arg_begin(); 4161 for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) { 4162 Type *ParamTy = FT->getParamType(i); 4163 4164 Value *NewArg = *AI; 4165 if ((*AI)->getType() != ParamTy) 4166 NewArg = Builder->CreateBitOrPointerCast(*AI, ParamTy); 4167 Args.push_back(NewArg); 4168 4169 // Add any parameter attributes. 4170 ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); 4171 } 4172 4173 // If the function takes more arguments than the call was taking, add them 4174 // now. 4175 for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) { 4176 Args.push_back(Constant::getNullValue(FT->getParamType(i))); 4177 ArgAttrs.push_back(AttributeSet()); 4178 } 4179 4180 // If we are removing arguments to the function, emit an obnoxious warning. 4181 if (FT->getNumParams() < NumActualArgs) { 4182 // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722 4183 if (FT->isVarArg()) { 4184 // Add all of the arguments in their promoted form to the arg list. 4185 for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) { 4186 Type *PTy = getPromotedType((*AI)->getType()); 4187 Value *NewArg = *AI; 4188 if (PTy != (*AI)->getType()) { 4189 // Must promote to pass through va_arg area! 4190 Instruction::CastOps opcode = 4191 CastInst::getCastOpcode(*AI, false, PTy, false); 4192 NewArg = Builder->CreateCast(opcode, *AI, PTy); 4193 } 4194 Args.push_back(NewArg); 4195 4196 // Add any parameter attributes. 
4197 ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); 4198 } 4199 } 4200 } 4201 4202 AttributeSet FnAttrs = CallerPAL.getFnAttributes(); 4203 4204 if (NewRetTy->isVoidTy()) 4205 Caller->setName(""); // Void type should not have a name. 4206 4207 assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) && 4208 "missing argument attributes"); 4209 LLVMContext &Ctx = Callee->getContext(); 4210 AttributeList NewCallerPAL = AttributeList::get( 4211 Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs); 4212 4213 SmallVector<OperandBundleDef, 1> OpBundles; 4214 CS.getOperandBundlesAsDefs(OpBundles); 4215 4216 CallSite NewCS; 4217 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { 4218 NewCS = Builder->CreateInvoke(Callee, II->getNormalDest(), 4219 II->getUnwindDest(), Args, OpBundles); 4220 } else { 4221 NewCS = Builder->CreateCall(Callee, Args, OpBundles); 4222 cast<CallInst>(NewCS.getInstruction()) 4223 ->setTailCallKind(cast<CallInst>(Caller)->getTailCallKind()); 4224 } 4225 NewCS->takeName(Caller); 4226 NewCS.setCallingConv(CS.getCallingConv()); 4227 NewCS.setAttributes(NewCallerPAL); 4228 4229 // Preserve the weight metadata for the new call instruction. The metadata 4230 // is used by SamplePGO to check callsite's hotness. 4231 uint64_t W; 4232 if (Caller->extractProfTotalWeight(W)) 4233 NewCS->setProfWeight(W); 4234 4235 // Insert a cast of the return type as necessary. 4236 Instruction *NC = NewCS.getInstruction(); 4237 Value *NV = NC; 4238 if (OldRetTy != NV->getType() && !Caller->use_empty()) { 4239 if (!NV->getType()->isVoidTy()) { 4240 NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy); 4241 NC->setDebugLoc(Caller->getDebugLoc()); 4242 4243 // If this is an invoke instruction, we should insert it after the first 4244 // non-phi, instruction in the normal successor block. 
4245 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { 4246 BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt(); 4247 InsertNewInstBefore(NC, *I); 4248 } else { 4249 // Otherwise, it's a call, just insert cast right after the call. 4250 InsertNewInstBefore(NC, *Caller); 4251 } 4252 Worklist.AddUsersToWorkList(*Caller); 4253 } else { 4254 NV = UndefValue::get(Caller->getType()); 4255 } 4256 } 4257 4258 if (!Caller->use_empty()) 4259 replaceInstUsesWith(*Caller, NV); 4260 else if (Caller->hasValueHandle()) { 4261 if (OldRetTy == NV->getType()) 4262 ValueHandleBase::ValueIsRAUWd(Caller, NV); 4263 else 4264 // We cannot call ValueIsRAUWd with a different type, and the 4265 // actual tracked value will disappear. 4266 ValueHandleBase::ValueIsDeleted(Caller); 4267 } 4268 4269 eraseInstFromFunction(*Caller); 4270 return true; 4271 } 4272 4273 /// Turn a call to a function created by init_trampoline / adjust_trampoline 4274 /// intrinsic pair into a direct call to the underlying function. 4275 Instruction * 4276 InstCombiner::transformCallThroughTrampoline(CallSite CS, 4277 IntrinsicInst *Tramp) { 4278 Value *Callee = CS.getCalledValue(); 4279 PointerType *PTy = cast<PointerType>(Callee->getType()); 4280 FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); 4281 AttributeList Attrs = CS.getAttributes(); 4282 4283 // If the call already has the 'nest' attribute somewhere then give up - 4284 // otherwise 'nest' would occur twice after splicing in the chain. 
4285 if (Attrs.hasAttrSomewhere(Attribute::Nest)) 4286 return nullptr; 4287 4288 assert(Tramp && 4289 "transformCallThroughTrampoline called with incorrect CallSite."); 4290 4291 Function *NestF =cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts()); 4292 FunctionType *NestFTy = cast<FunctionType>(NestF->getValueType()); 4293 4294 AttributeList NestAttrs = NestF->getAttributes(); 4295 if (!NestAttrs.isEmpty()) { 4296 unsigned NestArgNo = 0; 4297 Type *NestTy = nullptr; 4298 AttributeSet NestAttr; 4299 4300 // Look for a parameter marked with the 'nest' attribute. 4301 for (FunctionType::param_iterator I = NestFTy->param_begin(), 4302 E = NestFTy->param_end(); 4303 I != E; ++NestArgNo, ++I) { 4304 AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo); 4305 if (AS.hasAttribute(Attribute::Nest)) { 4306 // Record the parameter type and any other attributes. 4307 NestTy = *I; 4308 NestAttr = AS; 4309 break; 4310 } 4311 } 4312 4313 if (NestTy) { 4314 Instruction *Caller = CS.getInstruction(); 4315 std::vector<Value*> NewArgs; 4316 std::vector<AttributeSet> NewArgAttrs; 4317 NewArgs.reserve(CS.arg_size() + 1); 4318 NewArgAttrs.reserve(CS.arg_size()); 4319 4320 // Insert the nest argument into the call argument list, which may 4321 // mean appending it. Likewise for attributes. 4322 4323 { 4324 unsigned ArgNo = 0; 4325 CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); 4326 do { 4327 if (ArgNo == NestArgNo) { 4328 // Add the chain argument and attributes. 4329 Value *NestVal = Tramp->getArgOperand(2); 4330 if (NestVal->getType() != NestTy) 4331 NestVal = Builder->CreateBitCast(NestVal, NestTy, "nest"); 4332 NewArgs.push_back(NestVal); 4333 NewArgAttrs.push_back(NestAttr); 4334 } 4335 4336 if (I == E) 4337 break; 4338 4339 // Add the original argument and attributes. 
4340 NewArgs.push_back(*I); 4341 NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo)); 4342 4343 ++ArgNo; 4344 ++I; 4345 } while (true); 4346 } 4347 4348 // The trampoline may have been bitcast to a bogus type (FTy). 4349 // Handle this by synthesizing a new function type, equal to FTy 4350 // with the chain parameter inserted. 4351 4352 std::vector<Type*> NewTypes; 4353 NewTypes.reserve(FTy->getNumParams()+1); 4354 4355 // Insert the chain's type into the list of parameter types, which may 4356 // mean appending it. 4357 { 4358 unsigned ArgNo = 0; 4359 FunctionType::param_iterator I = FTy->param_begin(), 4360 E = FTy->param_end(); 4361 4362 do { 4363 if (ArgNo == NestArgNo) 4364 // Add the chain's type. 4365 NewTypes.push_back(NestTy); 4366 4367 if (I == E) 4368 break; 4369 4370 // Add the original type. 4371 NewTypes.push_back(*I); 4372 4373 ++ArgNo; 4374 ++I; 4375 } while (true); 4376 } 4377 4378 // Replace the trampoline call with a direct call. Let the generic 4379 // code sort out any function type mismatches. 4380 FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes, 4381 FTy->isVarArg()); 4382 Constant *NewCallee = 4383 NestF->getType() == PointerType::getUnqual(NewFTy) ? 
4384 NestF : ConstantExpr::getBitCast(NestF, 4385 PointerType::getUnqual(NewFTy)); 4386 AttributeList NewPAL = 4387 AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(), 4388 Attrs.getRetAttributes(), NewArgAttrs); 4389 4390 SmallVector<OperandBundleDef, 1> OpBundles; 4391 CS.getOperandBundlesAsDefs(OpBundles); 4392 4393 Instruction *NewCaller; 4394 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { 4395 NewCaller = InvokeInst::Create(NewCallee, 4396 II->getNormalDest(), II->getUnwindDest(), 4397 NewArgs, OpBundles); 4398 cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv()); 4399 cast<InvokeInst>(NewCaller)->setAttributes(NewPAL); 4400 } else { 4401 NewCaller = CallInst::Create(NewCallee, NewArgs, OpBundles); 4402 cast<CallInst>(NewCaller)->setTailCallKind( 4403 cast<CallInst>(Caller)->getTailCallKind()); 4404 cast<CallInst>(NewCaller)->setCallingConv( 4405 cast<CallInst>(Caller)->getCallingConv()); 4406 cast<CallInst>(NewCaller)->setAttributes(NewPAL); 4407 } 4408 4409 return NewCaller; 4410 } 4411 } 4412 4413 // Replace the trampoline call with a direct call. Since there is no 'nest' 4414 // parameter, there is no need to adjust the argument list. Let the generic 4415 // code sort out any function type mismatches. 4416 Constant *NewCallee = 4417 NestF->getType() == PTy ? NestF : 4418 ConstantExpr::getBitCast(NestF, PTy); 4419 CS.setCalledFunction(NewCallee); 4420 return CS.getInstruction(); 4421 } 4422