1 //===- InstCombineCalls.cpp -----------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the visitCall, visitInvoke, and visitCallBr functions. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "InstCombineInternal.h" 14 #include "llvm/ADT/APFloat.h" 15 #include "llvm/ADT/APInt.h" 16 #include "llvm/ADT/APSInt.h" 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/FloatingPointMode.h" 19 #include "llvm/ADT/None.h" 20 #include "llvm/ADT/Optional.h" 21 #include "llvm/ADT/STLExtras.h" 22 #include "llvm/ADT/SmallVector.h" 23 #include "llvm/ADT/Statistic.h" 24 #include "llvm/ADT/Twine.h" 25 #include "llvm/Analysis/AssumeBundleQueries.h" 26 #include "llvm/Analysis/AssumptionCache.h" 27 #include "llvm/Analysis/InstructionSimplify.h" 28 #include "llvm/Analysis/Loads.h" 29 #include "llvm/Analysis/MemoryBuiltins.h" 30 #include "llvm/Analysis/ValueTracking.h" 31 #include "llvm/Analysis/VectorUtils.h" 32 #include "llvm/IR/Attributes.h" 33 #include "llvm/IR/BasicBlock.h" 34 #include "llvm/IR/Constant.h" 35 #include "llvm/IR/Constants.h" 36 #include "llvm/IR/DataLayout.h" 37 #include "llvm/IR/DerivedTypes.h" 38 #include "llvm/IR/Function.h" 39 #include "llvm/IR/GlobalVariable.h" 40 #include "llvm/IR/InstrTypes.h" 41 #include "llvm/IR/Instruction.h" 42 #include "llvm/IR/Instructions.h" 43 #include "llvm/IR/IntrinsicInst.h" 44 #include "llvm/IR/Intrinsics.h" 45 #include "llvm/IR/IntrinsicsX86.h" 46 #include "llvm/IR/IntrinsicsARM.h" 47 #include "llvm/IR/IntrinsicsAArch64.h" 48 #include "llvm/IR/IntrinsicsHexagon.h" 49 #include "llvm/IR/IntrinsicsNVPTX.h" 50 #include "llvm/IR/IntrinsicsAMDGPU.h" 51 
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "instcombine"

STATISTIC(NumSimplified, "Number of library calls simplified");

static cl::opt<unsigned> GuardWideningWindow(
    "instcombine-guard-widening-window",
    cl::init(3),
    cl::desc("How wide an instruction window to bypass looking for "
             "another guard"));

/// Promote \p Ty the way it would be promoted when passed through a va_arg
/// area: an integer type narrower than 32 bits becomes i32, and every other
/// type is handed back unchanged.
static Type *getPromotedType(Type *Ty) {
  auto *IntTy = dyn_cast<IntegerType>(Ty);
  // Non-integers and integers that are already at least 32 bits wide need no
  // promotion.
  if (!IntTy || IntTy->getBitWidth() >= 32)
    return Ty;
  return Type::getInt32Ty(Ty->getContext());
}

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
104 static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) { 105 SmallVector<Constant *, 32> BoolVec; 106 IntegerType *BoolTy = Type::getInt1Ty(V->getContext()); 107 for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) { 108 Constant *Elt = V->getElementAsConstant(I); 109 assert((isa<ConstantInt>(Elt) || isa<ConstantFP>(Elt)) && 110 "Unexpected constant data vector element type"); 111 bool Sign = V->getElementType()->isIntegerTy() 112 ? cast<ConstantInt>(Elt)->isNegative() 113 : cast<ConstantFP>(Elt)->isNegative(); 114 BoolVec.push_back(ConstantInt::get(BoolTy, Sign)); 115 } 116 return ConstantVector::get(BoolVec); 117 } 118 119 Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { 120 unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT); 121 unsigned CopyDstAlign = MI->getDestAlignment(); 122 if (CopyDstAlign < DstAlign){ 123 MI->setDestAlignment(DstAlign); 124 return MI; 125 } 126 127 unsigned SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT); 128 unsigned CopySrcAlign = MI->getSourceAlignment(); 129 if (CopySrcAlign < SrcAlign) { 130 MI->setSourceAlignment(SrcAlign); 131 return MI; 132 } 133 134 // If we have a store to a location which is known constant, we can conclude 135 // that the store must be storing the constant value (else the memory 136 // wouldn't be constant), and this must be a noop. 137 if (AA->pointsToConstantMemory(MI->getDest())) { 138 // Set the size of the copy to 0, it will be deleted on the next iteration. 139 MI->setLength(Constant::getNullValue(MI->getLength()->getType())); 140 return MI; 141 } 142 143 // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with 144 // load/store. 145 ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength()); 146 if (!MemOpLength) return nullptr; 147 148 // Source and destination pointer types are always "i8*" for intrinsic. See 149 // if the size is something we can handle with a single primitive load/store. 
150 // A single load+store correctly handles overlapping memory in the memmove 151 // case. 152 uint64_t Size = MemOpLength->getLimitedValue(); 153 assert(Size && "0-sized memory transferring should be removed already."); 154 155 if (Size > 8 || (Size&(Size-1))) 156 return nullptr; // If not 1/2/4/8 bytes, exit. 157 158 // If it is an atomic and alignment is less than the size then we will 159 // introduce the unaligned memory access which will be later transformed 160 // into libcall in CodeGen. This is not evident performance gain so disable 161 // it now. 162 if (isa<AtomicMemTransferInst>(MI)) 163 if (CopyDstAlign < Size || CopySrcAlign < Size) 164 return nullptr; 165 166 // Use an integer load+store unless we can find something better. 167 unsigned SrcAddrSp = 168 cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace(); 169 unsigned DstAddrSp = 170 cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace(); 171 172 IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3); 173 Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp); 174 Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp); 175 176 // If the memcpy has metadata describing the members, see if we can get the 177 // TBAA tag describing our copy. 
178 MDNode *CopyMD = nullptr; 179 if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) { 180 CopyMD = M; 181 } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) { 182 if (M->getNumOperands() == 3 && M->getOperand(0) && 183 mdconst::hasa<ConstantInt>(M->getOperand(0)) && 184 mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() && 185 M->getOperand(1) && 186 mdconst::hasa<ConstantInt>(M->getOperand(1)) && 187 mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() == 188 Size && 189 M->getOperand(2) && isa<MDNode>(M->getOperand(2))) 190 CopyMD = cast<MDNode>(M->getOperand(2)); 191 } 192 193 Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy); 194 Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy); 195 LoadInst *L = Builder.CreateLoad(IntType, Src); 196 // Alignment from the mem intrinsic will be better, so use it. 197 L->setAlignment( 198 MaybeAlign(CopySrcAlign)); // FIXME: Check if we can use Align instead. 199 if (CopyMD) 200 L->setMetadata(LLVMContext::MD_tbaa, CopyMD); 201 MDNode *LoopMemParallelMD = 202 MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access); 203 if (LoopMemParallelMD) 204 L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); 205 MDNode *AccessGroupMD = MI->getMetadata(LLVMContext::MD_access_group); 206 if (AccessGroupMD) 207 L->setMetadata(LLVMContext::MD_access_group, AccessGroupMD); 208 209 StoreInst *S = Builder.CreateStore(L, Dest); 210 // Alignment from the mem intrinsic will be better, so use it. 211 S->setAlignment( 212 MaybeAlign(CopyDstAlign)); // FIXME: Check if we can use Align instead. 
213 if (CopyMD) 214 S->setMetadata(LLVMContext::MD_tbaa, CopyMD); 215 if (LoopMemParallelMD) 216 S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); 217 if (AccessGroupMD) 218 S->setMetadata(LLVMContext::MD_access_group, AccessGroupMD); 219 220 if (auto *MT = dyn_cast<MemTransferInst>(MI)) { 221 // non-atomics can be volatile 222 L->setVolatile(MT->isVolatile()); 223 S->setVolatile(MT->isVolatile()); 224 } 225 if (isa<AtomicMemTransferInst>(MI)) { 226 // atomics have to be unordered 227 L->setOrdering(AtomicOrdering::Unordered); 228 S->setOrdering(AtomicOrdering::Unordered); 229 } 230 231 // Set the size of the copy to 0, it will be deleted on the next iteration. 232 MI->setLength(Constant::getNullValue(MemOpLength->getType())); 233 return MI; 234 } 235 236 Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { 237 const unsigned KnownAlignment = 238 getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); 239 if (MI->getDestAlignment() < KnownAlignment) { 240 MI->setDestAlignment(KnownAlignment); 241 return MI; 242 } 243 244 // If we have a store to a location which is known constant, we can conclude 245 // that the store must be storing the constant value (else the memory 246 // wouldn't be constant), and this must be a noop. 247 if (AA->pointsToConstantMemory(MI->getDest())) { 248 // Set the size of the copy to 0, it will be deleted on the next iteration. 249 MI->setLength(Constant::getNullValue(MI->getLength()->getType())); 250 return MI; 251 } 252 253 // Extract the length and alignment and fill if they are constant. 
254 ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength()); 255 ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue()); 256 if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8)) 257 return nullptr; 258 const uint64_t Len = LenC->getLimitedValue(); 259 assert(Len && "0-sized memory setting should be removed already."); 260 const Align Alignment = assumeAligned(MI->getDestAlignment()); 261 262 // If it is an atomic and alignment is less than the size then we will 263 // introduce the unaligned memory access which will be later transformed 264 // into libcall in CodeGen. This is not evident performance gain so disable 265 // it now. 266 if (isa<AtomicMemSetInst>(MI)) 267 if (Alignment < Len) 268 return nullptr; 269 270 // memset(s,c,n) -> store s, c (for n=1,2,4,8) 271 if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) { 272 Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8. 273 274 Value *Dest = MI->getDest(); 275 unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace(); 276 Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp); 277 Dest = Builder.CreateBitCast(Dest, NewDstPtrTy); 278 279 // Extract the fill value and store. 280 uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL; 281 StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest, 282 MI->isVolatile()); 283 S->setAlignment(Alignment); 284 if (isa<AtomicMemSetInst>(MI)) 285 S->setOrdering(AtomicOrdering::Unordered); 286 287 // Set the size of the copy to 0, it will be deleted on the next iteration. 
288 MI->setLength(Constant::getNullValue(LenC->getType())); 289 return MI; 290 } 291 292 return nullptr; 293 } 294 295 static Value *simplifyX86immShift(const IntrinsicInst &II, 296 InstCombiner::BuilderTy &Builder) { 297 bool LogicalShift = false; 298 bool ShiftLeft = false; 299 bool IsImm = false; 300 301 switch (II.getIntrinsicID()) { 302 default: llvm_unreachable("Unexpected intrinsic!"); 303 case Intrinsic::x86_sse2_psrai_d: 304 case Intrinsic::x86_sse2_psrai_w: 305 case Intrinsic::x86_avx2_psrai_d: 306 case Intrinsic::x86_avx2_psrai_w: 307 case Intrinsic::x86_avx512_psrai_q_128: 308 case Intrinsic::x86_avx512_psrai_q_256: 309 case Intrinsic::x86_avx512_psrai_d_512: 310 case Intrinsic::x86_avx512_psrai_q_512: 311 case Intrinsic::x86_avx512_psrai_w_512: 312 IsImm = true; 313 LLVM_FALLTHROUGH; 314 case Intrinsic::x86_sse2_psra_d: 315 case Intrinsic::x86_sse2_psra_w: 316 case Intrinsic::x86_avx2_psra_d: 317 case Intrinsic::x86_avx2_psra_w: 318 case Intrinsic::x86_avx512_psra_q_128: 319 case Intrinsic::x86_avx512_psra_q_256: 320 case Intrinsic::x86_avx512_psra_d_512: 321 case Intrinsic::x86_avx512_psra_q_512: 322 case Intrinsic::x86_avx512_psra_w_512: 323 LogicalShift = false; 324 ShiftLeft = false; 325 break; 326 case Intrinsic::x86_sse2_psrli_d: 327 case Intrinsic::x86_sse2_psrli_q: 328 case Intrinsic::x86_sse2_psrli_w: 329 case Intrinsic::x86_avx2_psrli_d: 330 case Intrinsic::x86_avx2_psrli_q: 331 case Intrinsic::x86_avx2_psrli_w: 332 case Intrinsic::x86_avx512_psrli_d_512: 333 case Intrinsic::x86_avx512_psrli_q_512: 334 case Intrinsic::x86_avx512_psrli_w_512: 335 IsImm = true; 336 LLVM_FALLTHROUGH; 337 case Intrinsic::x86_sse2_psrl_d: 338 case Intrinsic::x86_sse2_psrl_q: 339 case Intrinsic::x86_sse2_psrl_w: 340 case Intrinsic::x86_avx2_psrl_d: 341 case Intrinsic::x86_avx2_psrl_q: 342 case Intrinsic::x86_avx2_psrl_w: 343 case Intrinsic::x86_avx512_psrl_d_512: 344 case Intrinsic::x86_avx512_psrl_q_512: 345 case Intrinsic::x86_avx512_psrl_w_512: 346 LogicalShift 
= true; 347 ShiftLeft = false; 348 break; 349 case Intrinsic::x86_sse2_pslli_d: 350 case Intrinsic::x86_sse2_pslli_q: 351 case Intrinsic::x86_sse2_pslli_w: 352 case Intrinsic::x86_avx2_pslli_d: 353 case Intrinsic::x86_avx2_pslli_q: 354 case Intrinsic::x86_avx2_pslli_w: 355 case Intrinsic::x86_avx512_pslli_d_512: 356 case Intrinsic::x86_avx512_pslli_q_512: 357 case Intrinsic::x86_avx512_pslli_w_512: 358 IsImm = true; 359 LLVM_FALLTHROUGH; 360 case Intrinsic::x86_sse2_psll_d: 361 case Intrinsic::x86_sse2_psll_q: 362 case Intrinsic::x86_sse2_psll_w: 363 case Intrinsic::x86_avx2_psll_d: 364 case Intrinsic::x86_avx2_psll_q: 365 case Intrinsic::x86_avx2_psll_w: 366 case Intrinsic::x86_avx512_psll_d_512: 367 case Intrinsic::x86_avx512_psll_q_512: 368 case Intrinsic::x86_avx512_psll_w_512: 369 LogicalShift = true; 370 ShiftLeft = true; 371 break; 372 } 373 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 374 375 auto Vec = II.getArgOperand(0); 376 auto Amt = II.getArgOperand(1); 377 auto VT = cast<VectorType>(Vec->getType()); 378 auto SVT = VT->getElementType(); 379 auto AmtVT = Amt->getType(); 380 unsigned VWidth = VT->getNumElements(); 381 unsigned BitWidth = SVT->getPrimitiveSizeInBits(); 382 383 // If the shift amount is guaranteed to be in-range we can replace it with a 384 // generic shift. If its guaranteed to be out of range, logical shifts combine to 385 // zero and arithmetic shifts are clamped to (BitWidth - 1). 386 if (IsImm) { 387 assert(AmtVT ->isIntegerTy(32) && 388 "Unexpected shift-by-immediate type"); 389 KnownBits KnownAmtBits = 390 llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); 391 if (KnownAmtBits.getMaxValue().ult(BitWidth)) { 392 Amt = Builder.CreateZExtOrTrunc(Amt, SVT); 393 Amt = Builder.CreateVectorSplat(VWidth, Amt); 394 return (LogicalShift ? (ShiftLeft ? 
Builder.CreateShl(Vec, Amt) 395 : Builder.CreateLShr(Vec, Amt)) 396 : Builder.CreateAShr(Vec, Amt)); 397 } 398 if (KnownAmtBits.getMinValue().uge(BitWidth)) { 399 if (LogicalShift) 400 return ConstantAggregateZero::get(VT); 401 Amt = ConstantInt::get(SVT, BitWidth - 1); 402 return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); 403 } 404 } else { 405 // Ensure the first element has an in-range value and the rest of the 406 // elements in the bottom 64 bits are zero. 407 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && 408 cast<VectorType>(AmtVT)->getElementType() == SVT && 409 "Unexpected shift-by-scalar type"); 410 unsigned NumAmtElts = cast<VectorType>(AmtVT)->getNumElements(); 411 APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); 412 APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); 413 KnownBits KnownLowerBits = llvm::computeKnownBits( 414 Amt, DemandedLower, II.getModule()->getDataLayout()); 415 KnownBits KnownUpperBits = llvm::computeKnownBits( 416 Amt, DemandedUpper, II.getModule()->getDataLayout()); 417 if (KnownLowerBits.getMaxValue().ult(BitWidth) && 418 (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) { 419 SmallVector<uint32_t, 16> ZeroSplat(VWidth, 0); 420 Amt = Builder.CreateShuffleVector(Amt, Amt, ZeroSplat); 421 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 422 : Builder.CreateLShr(Vec, Amt)) 423 : Builder.CreateAShr(Vec, Amt)); 424 } 425 } 426 427 // Simplify if count is constant vector. 428 auto CDV = dyn_cast<ConstantDataVector>(Amt); 429 if (!CDV) 430 return nullptr; 431 432 // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector 433 // operand to compute the shift amount. 434 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && 435 cast<VectorType>(AmtVT)->getElementType() == SVT && 436 "Unexpected shift-by-scalar type"); 437 438 // Concatenate the sub-elements to create the 64-bit value. 
439 APInt Count(64, 0); 440 for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { 441 unsigned SubEltIdx = (NumSubElts - 1) - i; 442 auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); 443 Count <<= BitWidth; 444 Count |= SubElt->getValue().zextOrTrunc(64); 445 } 446 447 // If shift-by-zero then just return the original value. 448 if (Count.isNullValue()) 449 return Vec; 450 451 // Handle cases when Shift >= BitWidth. 452 if (Count.uge(BitWidth)) { 453 // If LogicalShift - just return zero. 454 if (LogicalShift) 455 return ConstantAggregateZero::get(VT); 456 457 // If ArithmeticShift - clamp Shift to (BitWidth - 1). 458 Count = APInt(64, BitWidth - 1); 459 } 460 461 // Get a constant vector of the same type as the first operand. 462 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); 463 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); 464 465 if (ShiftLeft) 466 return Builder.CreateShl(Vec, ShiftVec); 467 468 if (LogicalShift) 469 return Builder.CreateLShr(Vec, ShiftVec); 470 471 return Builder.CreateAShr(Vec, ShiftVec); 472 } 473 474 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. 475 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out 476 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit). 
477 static Value *simplifyX86varShift(const IntrinsicInst &II, 478 InstCombiner::BuilderTy &Builder) { 479 bool LogicalShift = false; 480 bool ShiftLeft = false; 481 482 switch (II.getIntrinsicID()) { 483 default: llvm_unreachable("Unexpected intrinsic!"); 484 case Intrinsic::x86_avx2_psrav_d: 485 case Intrinsic::x86_avx2_psrav_d_256: 486 case Intrinsic::x86_avx512_psrav_q_128: 487 case Intrinsic::x86_avx512_psrav_q_256: 488 case Intrinsic::x86_avx512_psrav_d_512: 489 case Intrinsic::x86_avx512_psrav_q_512: 490 case Intrinsic::x86_avx512_psrav_w_128: 491 case Intrinsic::x86_avx512_psrav_w_256: 492 case Intrinsic::x86_avx512_psrav_w_512: 493 LogicalShift = false; 494 ShiftLeft = false; 495 break; 496 case Intrinsic::x86_avx2_psrlv_d: 497 case Intrinsic::x86_avx2_psrlv_d_256: 498 case Intrinsic::x86_avx2_psrlv_q: 499 case Intrinsic::x86_avx2_psrlv_q_256: 500 case Intrinsic::x86_avx512_psrlv_d_512: 501 case Intrinsic::x86_avx512_psrlv_q_512: 502 case Intrinsic::x86_avx512_psrlv_w_128: 503 case Intrinsic::x86_avx512_psrlv_w_256: 504 case Intrinsic::x86_avx512_psrlv_w_512: 505 LogicalShift = true; 506 ShiftLeft = false; 507 break; 508 case Intrinsic::x86_avx2_psllv_d: 509 case Intrinsic::x86_avx2_psllv_d_256: 510 case Intrinsic::x86_avx2_psllv_q: 511 case Intrinsic::x86_avx2_psllv_q_256: 512 case Intrinsic::x86_avx512_psllv_d_512: 513 case Intrinsic::x86_avx512_psllv_q_512: 514 case Intrinsic::x86_avx512_psllv_w_128: 515 case Intrinsic::x86_avx512_psllv_w_256: 516 case Intrinsic::x86_avx512_psllv_w_512: 517 LogicalShift = true; 518 ShiftLeft = true; 519 break; 520 } 521 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 522 523 auto Vec = II.getArgOperand(0); 524 auto Amt = II.getArgOperand(1); 525 auto VT = cast<VectorType>(II.getType()); 526 auto SVT = VT->getVectorElementType(); 527 int NumElts = VT->getNumElements(); 528 int BitWidth = SVT->getIntegerBitWidth(); 529 530 // If the shift amount is guaranteed to be in-range we can replace it 
with a 531 // generic shift. 532 APInt UpperBits = 533 APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth)); 534 if (llvm::MaskedValueIsZero(Amt, UpperBits, 535 II.getModule()->getDataLayout())) { 536 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 537 : Builder.CreateLShr(Vec, Amt)) 538 : Builder.CreateAShr(Vec, Amt)); 539 } 540 541 // Simplify if all shift amounts are constant/undef. 542 auto *CShift = dyn_cast<Constant>(Amt); 543 if (!CShift) 544 return nullptr; 545 546 // Collect each element's shift amount. 547 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. 548 bool AnyOutOfRange = false; 549 SmallVector<int, 8> ShiftAmts; 550 for (int I = 0; I < NumElts; ++I) { 551 auto *CElt = CShift->getAggregateElement(I); 552 if (CElt && isa<UndefValue>(CElt)) { 553 ShiftAmts.push_back(-1); 554 continue; 555 } 556 557 auto *COp = dyn_cast_or_null<ConstantInt>(CElt); 558 if (!COp) 559 return nullptr; 560 561 // Handle out of range shifts. 562 // If LogicalShift - set to BitWidth (special case). 563 // If ArithmeticShift - set to (BitWidth - 1) (sign splat). 564 APInt ShiftVal = COp->getValue(); 565 if (ShiftVal.uge(BitWidth)) { 566 AnyOutOfRange = LogicalShift; 567 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); 568 continue; 569 } 570 571 ShiftAmts.push_back((int)ShiftVal.getZExtValue()); 572 } 573 574 // If all elements out of range or UNDEF, return vector of zeros/undefs. 575 // ArithmeticShift should only hit this if they are all UNDEF. 
576 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; 577 if (llvm::all_of(ShiftAmts, OutOfRange)) { 578 SmallVector<Constant *, 8> ConstantVec; 579 for (int Idx : ShiftAmts) { 580 if (Idx < 0) { 581 ConstantVec.push_back(UndefValue::get(SVT)); 582 } else { 583 assert(LogicalShift && "Logical shift expected"); 584 ConstantVec.push_back(ConstantInt::getNullValue(SVT)); 585 } 586 } 587 return ConstantVector::get(ConstantVec); 588 } 589 590 // We can't handle only some out of range values with generic logical shifts. 591 if (AnyOutOfRange) 592 return nullptr; 593 594 // Build the shift amount constant vector. 595 SmallVector<Constant *, 8> ShiftVecAmts; 596 for (int Idx : ShiftAmts) { 597 if (Idx < 0) 598 ShiftVecAmts.push_back(UndefValue::get(SVT)); 599 else 600 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); 601 } 602 auto ShiftVec = ConstantVector::get(ShiftVecAmts); 603 604 if (ShiftLeft) 605 return Builder.CreateShl(Vec, ShiftVec); 606 607 if (LogicalShift) 608 return Builder.CreateLShr(Vec, ShiftVec); 609 610 return Builder.CreateAShr(Vec, ShiftVec); 611 } 612 613 static Value *simplifyX86pack(IntrinsicInst &II, 614 InstCombiner::BuilderTy &Builder, bool IsSigned) { 615 Value *Arg0 = II.getArgOperand(0); 616 Value *Arg1 = II.getArgOperand(1); 617 Type *ResTy = II.getType(); 618 619 // Fast all undef handling. 
620 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1)) 621 return UndefValue::get(ResTy); 622 623 Type *ArgTy = Arg0->getType(); 624 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; 625 unsigned NumSrcElts = ArgTy->getVectorNumElements(); 626 assert(ResTy->getVectorNumElements() == (2 * NumSrcElts) && 627 "Unexpected packing types"); 628 629 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; 630 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); 631 unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); 632 assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && 633 "Unexpected packing types"); 634 635 // Constant folding. 636 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) 637 return nullptr; 638 639 // Clamp Values - signed/unsigned both use signed clamp values, but they 640 // differ on the min/max values. 641 APInt MinValue, MaxValue; 642 if (IsSigned) { 643 // PACKSS: Truncate signed value with signed saturation. 644 // Source values less than dst minint are saturated to minint. 645 // Source values greater than dst maxint are saturated to maxint. 646 MinValue = 647 APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 648 MaxValue = 649 APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 650 } else { 651 // PACKUS: Truncate signed value with unsigned saturation. 652 // Source values less than zero are saturated to zero. 653 // Source values greater than dst maxuint are saturated to maxuint. 
654 MinValue = APInt::getNullValue(SrcScalarSizeInBits); 655 MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); 656 } 657 658 auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); 659 auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); 660 Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); 661 Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); 662 Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); 663 Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); 664 665 // Shuffle clamped args together at the lane level. 666 SmallVector<unsigned, 32> PackMask; 667 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 668 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 669 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); 670 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 671 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); 672 } 673 auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); 674 675 // Truncate to dst size. 676 return Builder.CreateTrunc(Shuffle, ResTy); 677 } 678 679 static Value *simplifyX86movmsk(const IntrinsicInst &II, 680 InstCombiner::BuilderTy &Builder) { 681 Value *Arg = II.getArgOperand(0); 682 Type *ResTy = II.getType(); 683 Type *ArgTy = Arg->getType(); 684 685 // movmsk(undef) -> zero as we must ensure the upper bits are zero. 686 if (isa<UndefValue>(Arg)) 687 return Constant::getNullValue(ResTy); 688 689 // We can't easily peek through x86_mmx types. 690 if (!ArgTy->isVectorTy()) 691 return nullptr; 692 693 // Expand MOVMSK to compare/bitcast/zext: 694 // e.g. 
PMOVMSKB(v16i8 x): 695 // %cmp = icmp slt <16 x i8> %x, zeroinitializer 696 // %int = bitcast <16 x i1> %cmp to i16 697 // %res = zext i16 %int to i32 698 unsigned NumElts = ArgTy->getVectorNumElements(); 699 Type *IntegerVecTy = VectorType::getInteger(cast<VectorType>(ArgTy)); 700 Type *IntegerTy = Builder.getIntNTy(NumElts); 701 702 Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy); 703 Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy)); 704 Res = Builder.CreateBitCast(Res, IntegerTy); 705 Res = Builder.CreateZExtOrTrunc(Res, ResTy); 706 return Res; 707 } 708 709 static Value *simplifyX86addcarry(const IntrinsicInst &II, 710 InstCombiner::BuilderTy &Builder) { 711 Value *CarryIn = II.getArgOperand(0); 712 Value *Op1 = II.getArgOperand(1); 713 Value *Op2 = II.getArgOperand(2); 714 Type *RetTy = II.getType(); 715 Type *OpTy = Op1->getType(); 716 assert(RetTy->getStructElementType(0)->isIntegerTy(8) && 717 RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && 718 "Unexpected types for x86 addcarry"); 719 720 // If carry-in is zero, this is just an unsigned add with overflow. 721 if (match(CarryIn, m_ZeroInt())) { 722 Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, 723 { Op1, Op2 }); 724 // The types have to be adjusted to match the x86 call types. 
Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = UndefValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

/// Attempt to replace an x86 INSERTPS intrinsic whose control byte is a
/// constant with either a zero vector or a single shufflevector.
///
/// Returns the replacement value, or nullptr when the control byte is not a
/// ConstantInt or when the zero mask is used with two distinct sources
/// without overriding the destination lane.
static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  VectorType *VecTy = cast<VectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //    [3:0] - zero mask for each 32-bit lane
  //    [5:4] - select one 32-bit destination lane
  //    [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  // Mask values 0-3 select lanes of the first shuffle operand and 4-7 select
  // lanes of the second shuffle operand (V1 below).
  uint32_t ShuffleMask[4] = { 0, 1, 2, 3 };

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
///
/// \p CILength and \p CIIndex may be null; most folds require both to be
/// present. Returns the replacement value or nullptr if nothing applies.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  // Build the two-element i64 constant vector { Val, undef }.
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  // CI0 is element 0 of Op0 when Op0 is a constant with an integer there.
  Constant *C0 = dyn_cast<Constant>(Op0);
  ConstantInt *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are inserting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
      VectorType *ShufTy = VectorType::get(IntTy8, 16);

      // Mask layout over the 16 result bytes:
      //   [0, Length)  - bytes Index.. of Op0,
      //   [Length, 8)  - bytes of the second (all-zero) shuffle operand,
      //   [8, 16)      - undef.
      SmallVector<Constant *, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(
            Constant::getIntegerValue(IntTy32, APInt(32, i + Index)));
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(
            Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(UndefValue::get(IntTy32));

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask));
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
///
/// \p APLength and \p APIndex are the field length and bit index; only their
/// low six bits are used. Returns the replacement value or nullptr if nothing
/// applies.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Type *IntTy32 = Type::getInt32Ty(II.getContext());
    VectorType *ShufTy = VectorType::get(IntTy8, 16);

    // Mask layout over the 16 result bytes:
    //   [0, Index)               - bytes of Op0,
    //   [Index, Index + Length)  - the low Length bytes of Op1 (i + 16),
    //   [Index + Length, 8)      - bytes of Op0,
    //   [8, 16)                  - undef.
    SmallVector<Constant *, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(
          Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(UndefValue::get(IntTy32));

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ConstantVector::get(ShuffleMask));
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  // CI00 / CI10 are element 0 of Op0 / Op1 when those are integer constants.
  Constant *C0 = dyn_cast<Constant>(Op0);
  Constant *C1 = dyn_cast<Constant>(Op1);
  ConstantInt *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  ConstantInt *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}

/// Attempt to convert pshufb* to shufflevector if the mask is constant.
///
/// Returns nullptr when the mask operand is not a Constant or when any mask
/// element is neither undef nor a ConstantInt.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  Constant *Indexes[64] = {nullptr};

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = UndefValue::get(MaskEltTy);
      continue;
    }

    // Stored as a signed byte so that a set bit 7 reads as a negative value.
    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shufflevector.

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    // (I & 0xF0) is the first element of the 16-byte lane that contains I.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
  }

  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}

/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
///
/// Returns nullptr when the mask operand is not a Constant or when any mask
/// element is neither undef nor a ConstantInt.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
  unsigned NumElts = VecTy->getVectorNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  // Elements per lane: 2 for double elements, 4 otherwise.
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  Constant *Indexes[16] = {nullptr};

  // The intrinsics only read one or two bits, clear the rest.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = UndefValue::get(MaskEltTy);
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select per-lane element index, so
    // shift down to convert to generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128 half. In order to convert to a generic
    // shuffle, we have to make that explicit.
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
  }

  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
  auto V1 = II.getArgOperand(0);
  auto V2 = UndefValue::get(V1->getType());
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}

/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
///
/// Returns nullptr when the mask operand is not a Constant or when any mask
/// element is neither undef nor a ConstantInt.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
  Constant *Indexes[64] = {nullptr};

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = UndefValue::get(MaskEltTy);
      continue;
    }

    // Size is asserted to be a power of two, so this keeps the index within
    // [0, Size).
    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
  }

  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, Size));
  auto V1 = II.getArgOperand(0);
  auto V2 = UndefValue::get(VecTy);
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}

// TODO, Obvious Missing Transforms:
// * Narrow width by halves excluding zero/undef lanes
//
// Operands as used below: 0 = pointer, 1 = alignment (must be a ConstantInt),
// 2 = mask, 3 = value selected where the mask is false.
Value *InstCombiner::simplifyMaskedLoad(IntrinsicInst &II) {
  Value *LoadPtr = II.getArgOperand(0);
  const Align Alignment =
      cast<ConstantInt>(II.getArgOperand(1))->getAlignValue();

  // If the mask is all ones or undefs, this is a plain vector load of the 1st
  // argument.
  if (maskIsAllOneOrUndef(II.getArgOperand(2)))
    return Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
                                     "unmaskedload");

  // If we can unconditionally load from this address, replace with a
  // load/select idiom. TODO: use DT for context sensitive query
  if (isDereferenceableAndAlignedPointer(LoadPtr, II.getType(), Alignment,
                                         II.getModule()->getDataLayout(), &II,
                                         nullptr)) {
    Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
                                         "unmaskedload");
    return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3));
  }

  return nullptr;
}

// TODO, Obvious Missing Transforms:
// * Single constant active lane -> store
// * Narrow width by halves excluding zero/undef lanes
//
// Operands as used below: 0 = stored value, 1 = pointer, 2 = alignment,
// 3 = mask. Nothing is done unless the mask is a Constant.
Instruction *InstCombiner::simplifyMaskedStore(IntrinsicInst &II) {
  auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
  if (!ConstMask)
    return nullptr;

  // If the mask is all zeros, this instruction does nothing.
  if (ConstMask->isNullValue())
    return eraseInstFromFunction(II);

  // If the mask is all ones, this is a plain vector store of the 1st argument.
  if (ConstMask->isAllOnesValue()) {
    Value *StorePtr = II.getArgOperand(1);
    MaybeAlign Alignment(
        cast<ConstantInt>(II.getArgOperand(2))->getZExtValue());
    return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
  }

  // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
  APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
  APInt UndefElts(DemandedElts.getBitWidth(), 0);
  if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0),
                                            DemandedElts, UndefElts))
    return replaceOperand(II, 0, V);

  return nullptr;
}

// TODO, Obvious Missing Transforms:
// * Single constant active lane load -> load
// * Dereferenceable address & few lanes -> scalarize speculative load/selects
// * Adjacent vector addresses -> masked.load
// * Narrow width by halves excluding zero/undef lanes
// * Vector splat address w/known mask -> scalar load
// * Vector incrementing address -> vector masked load
1180 Instruction *InstCombiner::simplifyMaskedGather(IntrinsicInst &II) { 1181 return nullptr; 1182 } 1183 1184 // TODO, Obvious Missing Transforms: 1185 // * Single constant active lane -> store 1186 // * Adjacent vector addresses -> masked.store 1187 // * Narrow store width by halfs excluding zero/undef lanes 1188 // * Vector splat address w/known mask -> scalar store 1189 // * Vector incrementing address -> vector masked store 1190 Instruction *InstCombiner::simplifyMaskedScatter(IntrinsicInst &II) { 1191 auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3)); 1192 if (!ConstMask) 1193 return nullptr; 1194 1195 // If the mask is all zeros, a scatter does nothing. 1196 if (ConstMask->isNullValue()) 1197 return eraseInstFromFunction(II); 1198 1199 // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts 1200 APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask); 1201 APInt UndefElts(DemandedElts.getBitWidth(), 0); 1202 if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0), 1203 DemandedElts, UndefElts)) 1204 return replaceOperand(II, 0, V); 1205 if (Value *V = SimplifyDemandedVectorElts(II.getOperand(1), 1206 DemandedElts, UndefElts)) 1207 return replaceOperand(II, 1, V); 1208 1209 return nullptr; 1210 } 1211 1212 /// This function transforms launder.invariant.group and strip.invariant.group 1213 /// like: 1214 /// launder(launder(%x)) -> launder(%x) (the result is not the argument) 1215 /// launder(strip(%x)) -> launder(%x) 1216 /// strip(strip(%x)) -> strip(%x) (the result is not the argument) 1217 /// strip(launder(%x)) -> strip(%x) 1218 /// This is legal because it preserves the most recent information about 1219 /// the presence or absence of invariant.group. 
static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II,
                                                    InstCombiner &IC) {
  auto *Arg = II.getArgOperand(0);
  auto *StrippedArg = Arg->stripPointerCasts();
  auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups();
  if (StrippedArg == StrippedInvariantGroupsArg)
    return nullptr; // No launders/strips to remove.

  Value *Result = nullptr;

  // Re-create the same kind of intrinsic directly on the fully stripped
  // argument.
  if (II.getIntrinsicID() == Intrinsic::launder_invariant_group)
    Result = IC.Builder.CreateLaunderInvariantGroup(StrippedInvariantGroupsArg);
  else if (II.getIntrinsicID() == Intrinsic::strip_invariant_group)
    Result = IC.Builder.CreateStripInvariantGroup(StrippedInvariantGroupsArg);
  else
    llvm_unreachable(
        "simplifyInvariantGroupIntrinsic only handles launder and strip");
  // Convert the new value back to II's type: fix the address space first,
  // then any remaining type mismatch with a bitcast.
  if (Result->getType()->getPointerAddressSpace() !=
      II.getType()->getPointerAddressSpace())
    Result = IC.Builder.CreateAddrSpaceCast(Result, II.getType());
  if (Result->getType() != II.getType())
    Result = IC.Builder.CreateBitCast(Result, II.getType());

  return cast<Instruction>(Result);
}

/// Fold a cttz or ctlz intrinsic call.
///
/// In order: swaps cttz/ctlz through a bitreverse operand; for cttz, looks
/// through negation and abs/nabs; folds to a constant when known bits pin the
/// count; sets the second argument to true when the operand is known
/// non-zero; and finally attaches range metadata for scalar integer types.
/// Returns nullptr if none of these applies.
static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
  assert((II.getIntrinsicID() == Intrinsic::cttz ||
          II.getIntrinsicID() == Intrinsic::ctlz) &&
         "Expected cttz or ctlz intrinsic");
  bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
  Value *Op0 = II.getArgOperand(0);
  Value *X;
  // ctlz(bitreverse(x)) -> cttz(x)
  // cttz(bitreverse(x)) -> ctlz(x)
  if (match(Op0, m_BitReverse(m_Value(X)))) {
    Intrinsic::ID ID = IsTZ ? Intrinsic::ctlz : Intrinsic::cttz;
    Function *F = Intrinsic::getDeclaration(II.getModule(), ID, II.getType());
    return CallInst::Create(F, {X, II.getArgOperand(1)});
  }

  if (IsTZ) {
    // cttz(-x) -> cttz(x)
    if (match(Op0, m_Neg(m_Value(X))))
      return IC.replaceOperand(II, 0, X);

    // cttz(abs(x)) -> cttz(x)
    // cttz(nabs(x)) -> cttz(x)
    Value *Y;
    SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
    if (SPF == SPF_ABS || SPF == SPF_NABS)
      return IC.replaceOperand(II, 0, X);
  }

  KnownBits Known = IC.computeKnownBits(Op0, 0, &II);

  // Create a mask for bits above (ctlz) or below (cttz) the first known one.
  unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
                                : Known.countMaxLeadingZeros();
  unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
                                : Known.countMinLeadingZeros();

  // If all bits above (ctlz) or below (cttz) the first known one are known
  // zero, this value is constant.
  // FIXME: This should be in InstSimplify because we're replacing an
  // instruction with a constant.
  if (PossibleZeros == DefiniteZeros) {
    auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros);
    return IC.replaceInstUsesWith(II, C);
  }

  // If the input to cttz/ctlz is known to be non-zero,
  // then change the 'ZeroIsUndef' parameter to 'true'
  // because we know the zero behavior can't affect the result.
  if (!Known.One.isNullValue() ||
      isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
                     &IC.getDominatorTree())) {
    if (!match(II.getArgOperand(1), m_One()))
      return IC.replaceOperand(II, 1, IC.Builder.getTrue());
  }

  // Add range metadata since known bits can't completely reflect what we know.
  // TODO: Handle splat vectors.
  // The upper bound is written as PossibleZeros + 1, i.e. the metadata pair
  // is [DefiniteZeros, PossibleZeros + 1).
  auto *IT = dyn_cast<IntegerType>(Op0->getType());
  if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
    Metadata *LowAndHigh[] = {
        ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)),
        ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))};
    II.setMetadata(LLVMContext::MD_range,
                   MDNode::get(II.getContext(), LowAndHigh));
    return &II;
  }

  return nullptr;
}

/// Fold a ctpop intrinsic call.
///
/// Looks through bitreverse/bswap operands, rewrites two patterns in terms of
/// cttz, and for scalar integer types attaches range metadata derived from
/// known bits. Returns nullptr if nothing applies.
static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) {
  assert(II.getIntrinsicID() == Intrinsic::ctpop &&
         "Expected ctpop intrinsic");
  Type *Ty = II.getType();
  unsigned BitWidth = Ty->getScalarSizeInBits();
  Value *Op0 = II.getArgOperand(0);
  Value *X;

  // ctpop(bitreverse(x)) -> ctpop(x)
  // ctpop(bswap(x)) -> ctpop(x)
  if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X))))
    return IC.replaceOperand(II, 0, X);

  // ctpop(x | -x) -> bitwidth - cttz(x, false)
  if (Op0->hasOneUse() &&
      match(Op0, m_c_Or(m_Value(X), m_Neg(m_Deferred(X))))) {
    Function *F =
        Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty);
    auto *Cttz = IC.Builder.CreateCall(F, {X, IC.Builder.getFalse()});
    auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth));
    return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz));
  }

  // ctpop(~x & (x - 1)) -> cttz(x, false)
  if (match(Op0,
            m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) {
    Function *F =
        Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty);
    return CallInst::Create(F, {X, IC.Builder.getFalse()});
  }

  // FIXME: Try to simplify vectors of integers.
  auto *IT = dyn_cast<IntegerType>(Ty);
  if (!IT)
    return nullptr;

  KnownBits Known(BitWidth);
  IC.computeKnownBits(Op0, Known, 0, &II);

  unsigned MinCount = Known.countMinPopulation();
  unsigned MaxCount = Known.countMaxPopulation();

  // Add range metadata since known bits can't completely reflect what we know.
  // The metadata pair is [MinCount, MaxCount + 1).
  if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
    Metadata *LowAndHigh[] = {
        ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)),
        ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))};
    II.setMetadata(LLVMContext::MD_range,
                   MDNode::get(II.getContext(), LowAndHigh));
    return &II;
  }

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
//
// Operands as used below: 0 = pointer, 1 = mask. Returns nullptr unless the
// mask is a ConstantAggregateZero or a ConstantDataVector.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Special case a zero mask since that's not a ConstantDataVector.
  // This masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
  if (!ConstMask)
    return nullptr;

  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
  // to allow target-independent optimizations.

  // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
  // the LLVM intrinsic definition for the pointer argument.
  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
  Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

  // Second, convert the x86 XMM integer vector mask to a vector of bools based
  // on each element's most significant bit (the sign bit).
  Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);

  // The pass-through vector for an x86 masked load is a zero vector.
  CallInst *NewMaskedLoad =
      IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec);
  return IC.replaceInstUsesWith(II, NewMaskedLoad);
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
//
// Operands as used below: 0 = pointer, 1 = mask, 2 = stored vector. Returns
// true when II has been erased (either as a no-op or after emitting an LLVM
// masked store in its place), false when it was left untouched.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Special case a zero mask since that's not a ConstantDataVector:
  // this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
  if (!ConstMask)
    return false;

  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
  // to allow target-independent optimizations.

  // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
  // the LLVM intrinsic definition for the pointer argument.
  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
  PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
  Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

  // Second, convert the x86 XMM integer vector mask to a vector of bools based
  // on each element's most significant bit (the sign bit).
  Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);

  IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

  // 'Replace uses' doesn't work for stores. Erase the original masked store.
  IC.eraseInstFromFunction(II);
  return true;
}

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
//
// The median is found by locating which source equals the maximum of the
// three and returning the larger of the remaining two.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  // Otherwise Src2 is the maximum.
  return maxnum(Src0, Src1);
}

/// Convert a table lookup to shufflevector if the mask is constant.
/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
/// which case we could lower the shufflevector with rev64 instructions
/// as it's actually a byte reverse.
///
/// Returns nullptr unless the result is <8 x i8> and every mask element is a
/// ConstantInt smaller than 8.
static Value *simplifyNeonTbl1(const IntrinsicInst &II,
                               InstCombiner::BuilderTy &Builder) {
  // Bail out if the mask is not a constant.
  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
  if (!C)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();

  // Only perform this transformation for <8 x i8> vector types.
  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
    return nullptr;

  uint32_t Indexes[8];

  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = C->getAggregateElement(I);

    if (!COp || !isa<ConstantInt>(COp))
      return nullptr;

    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();

    // Make sure the mask indices are in range.
    if (Indexes[I] >= NumElts)
      return nullptr;
  }

  auto *ShuffleMask = ConstantDataVector::get(II.getContext(),
                                              makeArrayRef(Indexes));
  auto *V1 = II.getArgOperand(0);
  auto *V2 = Constant::getNullValue(V1->getType());
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}

/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
///
/// Returns nullptr when the alignment operand (argument 1) is not a
/// ConstantInt or when the chosen alignment is not a power of two.
static Value *simplifyNeonVld1(const IntrinsicInst &II,
                               unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) {
  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));

  if (!IntrAlign)
    return nullptr;

  // Use the larger of the intrinsic's alignment operand and MemAlign.
  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign ?
                       MemAlign : IntrAlign->getLimitedValue();

  if (!isPowerOf2_32(Alignment))
    return nullptr;

  // The pointer is cast to a pointer-to-result-type in address space 0.
  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
                                          PointerType::get(II.getType(), 0));
  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
}

// Returns true iff the 2 intrinsics have the same operands, limiting the
// comparison to the first NumOperands.
1536 static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E, 1537 unsigned NumOperands) { 1538 assert(I.getNumArgOperands() >= NumOperands && "Not enough operands"); 1539 assert(E.getNumArgOperands() >= NumOperands && "Not enough operands"); 1540 for (unsigned i = 0; i < NumOperands; i++) 1541 if (I.getArgOperand(i) != E.getArgOperand(i)) 1542 return false; 1543 return true; 1544 } 1545 1546 // Remove trivially empty start/end intrinsic ranges, i.e. a start 1547 // immediately followed by an end (ignoring debuginfo or other 1548 // start/end intrinsics in between). As this handles only the most trivial 1549 // cases, tracking the nesting level is not needed: 1550 // 1551 // call @llvm.foo.start(i1 0) 1552 // call @llvm.foo.start(i1 0) ; This one won't be skipped: it will be removed 1553 // call @llvm.foo.end(i1 0) 1554 // call @llvm.foo.end(i1 0) ; &I 1555 static bool removeTriviallyEmptyRange( 1556 IntrinsicInst &EndI, InstCombiner &IC, 1557 std::function<bool(const IntrinsicInst &)> IsStart) { 1558 // We start from the end intrinsic and scan backwards, so that InstCombine 1559 // has already processed (and potentially removed) all the instructions 1560 // before the end intrinsic. 1561 BasicBlock::reverse_iterator BI(EndI), BE(EndI.getParent()->rend()); 1562 for (; BI != BE; ++BI) { 1563 if (auto *I = dyn_cast<IntrinsicInst>(&*BI)) { 1564 if (isa<DbgInfoIntrinsic>(I) || 1565 I->getIntrinsicID() == EndI.getIntrinsicID()) 1566 continue; 1567 if (IsStart(*I)) { 1568 if (haveSameOperands(EndI, *I, EndI.getNumArgOperands())) { 1569 IC.eraseInstFromFunction(*I); 1570 IC.eraseInstFromFunction(EndI); 1571 return true; 1572 } 1573 // Skip start intrinsics that don't pair with this end intrinsic. 1574 continue; 1575 } 1576 } 1577 break; 1578 } 1579 1580 return false; 1581 } 1582 1583 // Convert NVVM intrinsics to target-generic LLVM code where possible. 
1584 static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) { 1585 // Each NVVM intrinsic we can simplify can be replaced with one of: 1586 // 1587 // * an LLVM intrinsic, 1588 // * an LLVM cast operation, 1589 // * an LLVM binary operation, or 1590 // * ad-hoc LLVM IR for the particular operation. 1591 1592 // Some transformations are only valid when the module's 1593 // flush-denormals-to-zero (ftz) setting is true/false, whereas other 1594 // transformations are valid regardless of the module's ftz setting. 1595 enum FtzRequirementTy { 1596 FTZ_Any, // Any ftz setting is ok. 1597 FTZ_MustBeOn, // Transformation is valid only if ftz is on. 1598 FTZ_MustBeOff, // Transformation is valid only if ftz is off. 1599 }; 1600 // Classes of NVVM intrinsics that can't be replaced one-to-one with a 1601 // target-generic intrinsic, cast op, or binary op but that we can nonetheless 1602 // simplify. 1603 enum SpecialCase { 1604 SPC_Reciprocal, 1605 }; 1606 1607 // SimplifyAction is a poor-man's variant (plus an additional flag) that 1608 // represents how to replace an NVVM intrinsic with target-generic LLVM IR. 1609 struct SimplifyAction { 1610 // Invariant: At most one of these Optionals has a value. 1611 Optional<Intrinsic::ID> IID; 1612 Optional<Instruction::CastOps> CastOp; 1613 Optional<Instruction::BinaryOps> BinaryOp; 1614 Optional<SpecialCase> Special; 1615 1616 FtzRequirementTy FtzRequirement = FTZ_Any; 1617 1618 SimplifyAction() = default; 1619 1620 SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq) 1621 : IID(IID), FtzRequirement(FtzReq) {} 1622 1623 // Cast operations don't have anything to do with FTZ, so we skip that 1624 // argument. 
1625 SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {} 1626 1627 SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq) 1628 : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {} 1629 1630 SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq) 1631 : Special(Special), FtzRequirement(FtzReq) {} 1632 }; 1633 1634 // Try to generate a SimplifyAction describing how to replace our 1635 // IntrinsicInstr with target-generic LLVM IR. 1636 const SimplifyAction Action = [II]() -> SimplifyAction { 1637 switch (II->getIntrinsicID()) { 1638 // NVVM intrinsics that map directly to LLVM intrinsics. 1639 case Intrinsic::nvvm_ceil_d: 1640 return {Intrinsic::ceil, FTZ_Any}; 1641 case Intrinsic::nvvm_ceil_f: 1642 return {Intrinsic::ceil, FTZ_MustBeOff}; 1643 case Intrinsic::nvvm_ceil_ftz_f: 1644 return {Intrinsic::ceil, FTZ_MustBeOn}; 1645 case Intrinsic::nvvm_fabs_d: 1646 return {Intrinsic::fabs, FTZ_Any}; 1647 case Intrinsic::nvvm_fabs_f: 1648 return {Intrinsic::fabs, FTZ_MustBeOff}; 1649 case Intrinsic::nvvm_fabs_ftz_f: 1650 return {Intrinsic::fabs, FTZ_MustBeOn}; 1651 case Intrinsic::nvvm_floor_d: 1652 return {Intrinsic::floor, FTZ_Any}; 1653 case Intrinsic::nvvm_floor_f: 1654 return {Intrinsic::floor, FTZ_MustBeOff}; 1655 case Intrinsic::nvvm_floor_ftz_f: 1656 return {Intrinsic::floor, FTZ_MustBeOn}; 1657 case Intrinsic::nvvm_fma_rn_d: 1658 return {Intrinsic::fma, FTZ_Any}; 1659 case Intrinsic::nvvm_fma_rn_f: 1660 return {Intrinsic::fma, FTZ_MustBeOff}; 1661 case Intrinsic::nvvm_fma_rn_ftz_f: 1662 return {Intrinsic::fma, FTZ_MustBeOn}; 1663 case Intrinsic::nvvm_fmax_d: 1664 return {Intrinsic::maxnum, FTZ_Any}; 1665 case Intrinsic::nvvm_fmax_f: 1666 return {Intrinsic::maxnum, FTZ_MustBeOff}; 1667 case Intrinsic::nvvm_fmax_ftz_f: 1668 return {Intrinsic::maxnum, FTZ_MustBeOn}; 1669 case Intrinsic::nvvm_fmin_d: 1670 return {Intrinsic::minnum, FTZ_Any}; 1671 case Intrinsic::nvvm_fmin_f: 1672 return {Intrinsic::minnum, FTZ_MustBeOff}; 1673 case 
Intrinsic::nvvm_fmin_ftz_f: 1674 return {Intrinsic::minnum, FTZ_MustBeOn}; 1675 case Intrinsic::nvvm_round_d: 1676 return {Intrinsic::round, FTZ_Any}; 1677 case Intrinsic::nvvm_round_f: 1678 return {Intrinsic::round, FTZ_MustBeOff}; 1679 case Intrinsic::nvvm_round_ftz_f: 1680 return {Intrinsic::round, FTZ_MustBeOn}; 1681 case Intrinsic::nvvm_sqrt_rn_d: 1682 return {Intrinsic::sqrt, FTZ_Any}; 1683 case Intrinsic::nvvm_sqrt_f: 1684 // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the 1685 // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts 1686 // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are 1687 // the versions with explicit ftz-ness. 1688 return {Intrinsic::sqrt, FTZ_Any}; 1689 case Intrinsic::nvvm_sqrt_rn_f: 1690 return {Intrinsic::sqrt, FTZ_MustBeOff}; 1691 case Intrinsic::nvvm_sqrt_rn_ftz_f: 1692 return {Intrinsic::sqrt, FTZ_MustBeOn}; 1693 case Intrinsic::nvvm_trunc_d: 1694 return {Intrinsic::trunc, FTZ_Any}; 1695 case Intrinsic::nvvm_trunc_f: 1696 return {Intrinsic::trunc, FTZ_MustBeOff}; 1697 case Intrinsic::nvvm_trunc_ftz_f: 1698 return {Intrinsic::trunc, FTZ_MustBeOn}; 1699 1700 // NVVM intrinsics that map to LLVM cast operations. 1701 // 1702 // Note that llvm's target-generic conversion operators correspond to the rz 1703 // (round to zero) versions of the nvvm conversion intrinsics, even though 1704 // most everything else here uses the rn (round to nearest even) nvvm ops. 
1705 case Intrinsic::nvvm_d2i_rz: 1706 case Intrinsic::nvvm_f2i_rz: 1707 case Intrinsic::nvvm_d2ll_rz: 1708 case Intrinsic::nvvm_f2ll_rz: 1709 return {Instruction::FPToSI}; 1710 case Intrinsic::nvvm_d2ui_rz: 1711 case Intrinsic::nvvm_f2ui_rz: 1712 case Intrinsic::nvvm_d2ull_rz: 1713 case Intrinsic::nvvm_f2ull_rz: 1714 return {Instruction::FPToUI}; 1715 case Intrinsic::nvvm_i2d_rz: 1716 case Intrinsic::nvvm_i2f_rz: 1717 case Intrinsic::nvvm_ll2d_rz: 1718 case Intrinsic::nvvm_ll2f_rz: 1719 return {Instruction::SIToFP}; 1720 case Intrinsic::nvvm_ui2d_rz: 1721 case Intrinsic::nvvm_ui2f_rz: 1722 case Intrinsic::nvvm_ull2d_rz: 1723 case Intrinsic::nvvm_ull2f_rz: 1724 return {Instruction::UIToFP}; 1725 1726 // NVVM intrinsics that map to LLVM binary ops. 1727 case Intrinsic::nvvm_add_rn_d: 1728 return {Instruction::FAdd, FTZ_Any}; 1729 case Intrinsic::nvvm_add_rn_f: 1730 return {Instruction::FAdd, FTZ_MustBeOff}; 1731 case Intrinsic::nvvm_add_rn_ftz_f: 1732 return {Instruction::FAdd, FTZ_MustBeOn}; 1733 case Intrinsic::nvvm_mul_rn_d: 1734 return {Instruction::FMul, FTZ_Any}; 1735 case Intrinsic::nvvm_mul_rn_f: 1736 return {Instruction::FMul, FTZ_MustBeOff}; 1737 case Intrinsic::nvvm_mul_rn_ftz_f: 1738 return {Instruction::FMul, FTZ_MustBeOn}; 1739 case Intrinsic::nvvm_div_rn_d: 1740 return {Instruction::FDiv, FTZ_Any}; 1741 case Intrinsic::nvvm_div_rn_f: 1742 return {Instruction::FDiv, FTZ_MustBeOff}; 1743 case Intrinsic::nvvm_div_rn_ftz_f: 1744 return {Instruction::FDiv, FTZ_MustBeOn}; 1745 1746 // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but 1747 // need special handling. 1748 // 1749 // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just 1750 // as well. 
1751 case Intrinsic::nvvm_rcp_rn_d: 1752 return {SPC_Reciprocal, FTZ_Any}; 1753 case Intrinsic::nvvm_rcp_rn_f: 1754 return {SPC_Reciprocal, FTZ_MustBeOff}; 1755 case Intrinsic::nvvm_rcp_rn_ftz_f: 1756 return {SPC_Reciprocal, FTZ_MustBeOn}; 1757 1758 // We do not currently simplify intrinsics that give an approximate answer. 1759 // These include: 1760 // 1761 // - nvvm_cos_approx_{f,ftz_f} 1762 // - nvvm_ex2_approx_{d,f,ftz_f} 1763 // - nvvm_lg2_approx_{d,f,ftz_f} 1764 // - nvvm_sin_approx_{f,ftz_f} 1765 // - nvvm_sqrt_approx_{f,ftz_f} 1766 // - nvvm_rsqrt_approx_{d,f,ftz_f} 1767 // - nvvm_div_approx_{ftz_d,ftz_f,f} 1768 // - nvvm_rcp_approx_ftz_d 1769 // 1770 // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast" 1771 // means that fastmath is enabled in the intrinsic. Unfortunately only 1772 // binary operators (currently) have a fastmath bit in SelectionDAG, so this 1773 // information gets lost and we can't select on it. 1774 // 1775 // TODO: div and rcp are lowered to a binary op, so these we could in theory 1776 // lower them to "fast fdiv". 1777 1778 default: 1779 return {}; 1780 } 1781 }(); 1782 1783 // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we 1784 // can bail out now. (Notice that in the case that IID is not an NVVM 1785 // intrinsic, we don't have to look up any module metadata, as 1786 // FtzRequirementTy will be FTZ_Any.) 1787 if (Action.FtzRequirement != FTZ_Any) { 1788 StringRef Attr = II->getFunction() 1789 ->getFnAttribute("denormal-fp-math-f32") 1790 .getValueAsString(); 1791 DenormalMode Mode = parseDenormalFPAttribute(Attr); 1792 bool FtzEnabled = Mode.Output != DenormalMode::IEEE; 1793 1794 if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn)) 1795 return nullptr; 1796 } 1797 1798 // Simplify to target-generic intrinsic. 
1799 if (Action.IID) { 1800 SmallVector<Value *, 4> Args(II->arg_operands()); 1801 // All the target-generic intrinsics currently of interest to us have one 1802 // type argument, equal to that of the nvvm intrinsic's argument. 1803 Type *Tys[] = {II->getArgOperand(0)->getType()}; 1804 return CallInst::Create( 1805 Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); 1806 } 1807 1808 // Simplify to target-generic binary op. 1809 if (Action.BinaryOp) 1810 return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0), 1811 II->getArgOperand(1), II->getName()); 1812 1813 // Simplify to target-generic cast op. 1814 if (Action.CastOp) 1815 return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(), 1816 II->getName()); 1817 1818 // All that's left are the special cases. 1819 if (!Action.Special) 1820 return nullptr; 1821 1822 switch (*Action.Special) { 1823 case SPC_Reciprocal: 1824 // Simplify reciprocal. 1825 return BinaryOperator::Create( 1826 Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1), 1827 II->getArgOperand(0), II->getName()); 1828 } 1829 llvm_unreachable("All SpecialCase enumerators should be handled in switch."); 1830 } 1831 1832 Instruction *InstCombiner::visitVAEndInst(VAEndInst &I) { 1833 removeTriviallyEmptyRange(I, *this, [](const IntrinsicInst &I) { 1834 return I.getIntrinsicID() == Intrinsic::vastart || 1835 I.getIntrinsicID() == Intrinsic::vacopy; 1836 }); 1837 return nullptr; 1838 } 1839 1840 static Instruction *canonicalizeConstantArg0ToArg1(CallInst &Call) { 1841 assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap"); 1842 Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1); 1843 if (isa<Constant>(Arg0) && !isa<Constant>(Arg1)) { 1844 Call.setArgOperand(0, Arg1); 1845 Call.setArgOperand(1, Arg0); 1846 return &Call; 1847 } 1848 return nullptr; 1849 } 1850 1851 Instruction *InstCombiner::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) { 1852 
WithOverflowInst *WO = cast<WithOverflowInst>(II); 1853 Value *OperationResult = nullptr; 1854 Constant *OverflowResult = nullptr; 1855 if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(), 1856 WO->getRHS(), *WO, OperationResult, OverflowResult)) 1857 return CreateOverflowTuple(WO, OperationResult, OverflowResult); 1858 return nullptr; 1859 } 1860 1861 /// CallInst simplification. This mostly only handles folding of intrinsic 1862 /// instructions. For normal calls, it allows visitCallBase to do the heavy 1863 /// lifting. 1864 Instruction *InstCombiner::visitCallInst(CallInst &CI) { 1865 // Don't try to simplify calls without uses. It will not do anything useful, 1866 // but will result in the following folds being skipped. 1867 if (!CI.use_empty()) 1868 if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI))) 1869 return replaceInstUsesWith(CI, V); 1870 1871 if (isFreeCall(&CI, &TLI)) 1872 return visitFree(CI); 1873 1874 // If the caller function is nounwind, mark the call as nounwind, even if the 1875 // callee isn't. 1876 if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) { 1877 CI.setDoesNotThrow(); 1878 return &CI; 1879 } 1880 1881 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI); 1882 if (!II) return visitCallBase(CI); 1883 1884 // For atomic unordered mem intrinsics if len is not a positive or 1885 // not a multiple of element size then behavior is undefined. 1886 if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(II)) 1887 if (ConstantInt *NumBytes = dyn_cast<ConstantInt>(AMI->getLength())) 1888 if (NumBytes->getSExtValue() < 0 || 1889 (NumBytes->getZExtValue() % AMI->getElementSizeInBytes() != 0)) { 1890 CreateNonTerminatorUnreachable(AMI); 1891 assert(AMI->getType()->isVoidTy() && 1892 "non void atomic unordered mem intrinsic"); 1893 return eraseInstFromFunction(*AMI); 1894 } 1895 1896 // Intrinsics cannot occur in an invoke or a callbr, so handle them here 1897 // instead of in visitCallBase. 
1898 if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) { 1899 bool Changed = false; 1900 1901 // memmove/cpy/set of zero bytes is a noop. 1902 if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) { 1903 if (NumBytes->isNullValue()) 1904 return eraseInstFromFunction(CI); 1905 1906 if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) 1907 if (CI->getZExtValue() == 1) { 1908 // Replace the instruction with just byte operations. We would 1909 // transform other cases to loads/stores, but we don't know if 1910 // alignment is sufficient. 1911 } 1912 } 1913 1914 // No other transformations apply to volatile transfers. 1915 if (auto *M = dyn_cast<MemIntrinsic>(MI)) 1916 if (M->isVolatile()) 1917 return nullptr; 1918 1919 // If we have a memmove and the source operation is a constant global, 1920 // then the source and dest pointers can't alias, so we can change this 1921 // into a call to memcpy. 1922 if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) { 1923 if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource())) 1924 if (GVSrc->isConstant()) { 1925 Module *M = CI.getModule(); 1926 Intrinsic::ID MemCpyID = 1927 isa<AtomicMemMoveInst>(MMI) 1928 ? Intrinsic::memcpy_element_unordered_atomic 1929 : Intrinsic::memcpy; 1930 Type *Tys[3] = { CI.getArgOperand(0)->getType(), 1931 CI.getArgOperand(1)->getType(), 1932 CI.getArgOperand(2)->getType() }; 1933 CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys)); 1934 Changed = true; 1935 } 1936 } 1937 1938 if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) { 1939 // memmove(x,x,size) -> noop. 1940 if (MTI->getSource() == MTI->getDest()) 1941 return eraseInstFromFunction(CI); 1942 } 1943 1944 // If we can determine a pointer alignment that is bigger than currently 1945 // set, update the alignment. 
1946 if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) { 1947 if (Instruction *I = SimplifyAnyMemTransfer(MTI)) 1948 return I; 1949 } else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) { 1950 if (Instruction *I = SimplifyAnyMemSet(MSI)) 1951 return I; 1952 } 1953 1954 if (Changed) return II; 1955 } 1956 1957 // For vector result intrinsics, use the generic demanded vector support. 1958 if (II->getType()->isVectorTy()) { 1959 auto VWidth = II->getType()->getVectorNumElements(); 1960 APInt UndefElts(VWidth, 0); 1961 APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); 1962 if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) { 1963 if (V != II) 1964 return replaceInstUsesWith(*II, V); 1965 return II; 1966 } 1967 } 1968 1969 if (Instruction *I = SimplifyNVVMIntrinsic(II, *this)) 1970 return I; 1971 1972 auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, 1973 unsigned DemandedWidth) { 1974 APInt UndefElts(Width, 0); 1975 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); 1976 return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); 1977 }; 1978 1979 Intrinsic::ID IID = II->getIntrinsicID(); 1980 switch (IID) { 1981 default: break; 1982 case Intrinsic::objectsize: 1983 if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false)) 1984 return replaceInstUsesWith(CI, V); 1985 return nullptr; 1986 case Intrinsic::bswap: { 1987 Value *IIOperand = II->getArgOperand(0); 1988 Value *X = nullptr; 1989 1990 // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) 1991 if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) { 1992 unsigned C = X->getType()->getPrimitiveSizeInBits() - 1993 IIOperand->getType()->getPrimitiveSizeInBits(); 1994 Value *CV = ConstantInt::get(X->getType(), C); 1995 Value *V = Builder.CreateLShr(X, CV); 1996 return new TruncInst(V, IIOperand->getType()); 1997 } 1998 break; 1999 } 2000 case Intrinsic::masked_load: 2001 if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II)) 2002 return 
replaceInstUsesWith(CI, SimplifiedMaskedOp); 2003 break; 2004 case Intrinsic::masked_store: 2005 return simplifyMaskedStore(*II); 2006 case Intrinsic::masked_gather: 2007 return simplifyMaskedGather(*II); 2008 case Intrinsic::masked_scatter: 2009 return simplifyMaskedScatter(*II); 2010 case Intrinsic::launder_invariant_group: 2011 case Intrinsic::strip_invariant_group: 2012 if (auto *SkippedBarrier = simplifyInvariantGroupIntrinsic(*II, *this)) 2013 return replaceInstUsesWith(*II, SkippedBarrier); 2014 break; 2015 case Intrinsic::powi: 2016 if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2017 // 0 and 1 are handled in instsimplify 2018 2019 // powi(x, -1) -> 1/x 2020 if (Power->isMinusOne()) 2021 return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0), 2022 II->getArgOperand(0)); 2023 // powi(x, 2) -> x*x 2024 if (Power->equalsInt(2)) 2025 return BinaryOperator::CreateFMul(II->getArgOperand(0), 2026 II->getArgOperand(0)); 2027 } 2028 break; 2029 2030 case Intrinsic::cttz: 2031 case Intrinsic::ctlz: 2032 if (auto *I = foldCttzCtlz(*II, *this)) 2033 return I; 2034 break; 2035 2036 case Intrinsic::ctpop: 2037 if (auto *I = foldCtpop(*II, *this)) 2038 return I; 2039 break; 2040 2041 case Intrinsic::fshl: 2042 case Intrinsic::fshr: { 2043 Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1); 2044 Type *Ty = II->getType(); 2045 unsigned BitWidth = Ty->getScalarSizeInBits(); 2046 Constant *ShAmtC; 2047 if (match(II->getArgOperand(2), m_Constant(ShAmtC)) && 2048 !isa<ConstantExpr>(ShAmtC) && !ShAmtC->containsConstantExpression()) { 2049 // Canonicalize a shift amount constant operand to modulo the bit-width. 
2050 Constant *WidthC = ConstantInt::get(Ty, BitWidth); 2051 Constant *ModuloC = ConstantExpr::getURem(ShAmtC, WidthC); 2052 if (ModuloC != ShAmtC) 2053 return replaceOperand(*II, 2, ModuloC); 2054 2055 assert(ConstantExpr::getICmp(ICmpInst::ICMP_UGT, WidthC, ShAmtC) == 2056 ConstantInt::getTrue(CmpInst::makeCmpResultType(Ty)) && 2057 "Shift amount expected to be modulo bitwidth"); 2058 2059 // Canonicalize funnel shift right by constant to funnel shift left. This 2060 // is not entirely arbitrary. For historical reasons, the backend may 2061 // recognize rotate left patterns but miss rotate right patterns. 2062 if (IID == Intrinsic::fshr) { 2063 // fshr X, Y, C --> fshl X, Y, (BitWidth - C) 2064 Constant *LeftShiftC = ConstantExpr::getSub(WidthC, ShAmtC); 2065 Module *Mod = II->getModule(); 2066 Function *Fshl = Intrinsic::getDeclaration(Mod, Intrinsic::fshl, Ty); 2067 return CallInst::Create(Fshl, { Op0, Op1, LeftShiftC }); 2068 } 2069 assert(IID == Intrinsic::fshl && 2070 "All funnel shifts by simple constants should go left"); 2071 2072 // fshl(X, 0, C) --> shl X, C 2073 // fshl(X, undef, C) --> shl X, C 2074 if (match(Op1, m_ZeroInt()) || match(Op1, m_Undef())) 2075 return BinaryOperator::CreateShl(Op0, ShAmtC); 2076 2077 // fshl(0, X, C) --> lshr X, (BW-C) 2078 // fshl(undef, X, C) --> lshr X, (BW-C) 2079 if (match(Op0, m_ZeroInt()) || match(Op0, m_Undef())) 2080 return BinaryOperator::CreateLShr(Op1, 2081 ConstantExpr::getSub(WidthC, ShAmtC)); 2082 2083 // fshl i16 X, X, 8 --> bswap i16 X (reduce to more-specific form) 2084 if (Op0 == Op1 && BitWidth == 16 && match(ShAmtC, m_SpecificInt(8))) { 2085 Module *Mod = II->getModule(); 2086 Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty); 2087 return CallInst::Create(Bswap, { Op0 }); 2088 } 2089 } 2090 2091 // Left or right might be masked. 
2092 if (SimplifyDemandedInstructionBits(*II)) 2093 return &CI; 2094 2095 // The shift amount (operand 2) of a funnel shift is modulo the bitwidth, 2096 // so only the low bits of the shift amount are demanded if the bitwidth is 2097 // a power-of-2. 2098 if (!isPowerOf2_32(BitWidth)) 2099 break; 2100 APInt Op2Demanded = APInt::getLowBitsSet(BitWidth, Log2_32_Ceil(BitWidth)); 2101 KnownBits Op2Known(BitWidth); 2102 if (SimplifyDemandedBits(II, 2, Op2Demanded, Op2Known)) 2103 return &CI; 2104 break; 2105 } 2106 case Intrinsic::uadd_with_overflow: 2107 case Intrinsic::sadd_with_overflow: { 2108 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2109 return I; 2110 if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) 2111 return I; 2112 2113 // Given 2 constant operands whose sum does not overflow: 2114 // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1 2115 // saddo (X +nsw C0), C1 -> saddo X, C0 + C1 2116 Value *X; 2117 const APInt *C0, *C1; 2118 Value *Arg0 = II->getArgOperand(0); 2119 Value *Arg1 = II->getArgOperand(1); 2120 bool IsSigned = IID == Intrinsic::sadd_with_overflow; 2121 bool HasNWAdd = IsSigned ? match(Arg0, m_NSWAdd(m_Value(X), m_APInt(C0))) 2122 : match(Arg0, m_NUWAdd(m_Value(X), m_APInt(C0))); 2123 if (HasNWAdd && match(Arg1, m_APInt(C1))) { 2124 bool Overflow; 2125 APInt NewC = 2126 IsSigned ? 
C1->sadd_ov(*C0, Overflow) : C1->uadd_ov(*C0, Overflow); 2127 if (!Overflow) 2128 return replaceInstUsesWith( 2129 *II, Builder.CreateBinaryIntrinsic( 2130 IID, X, ConstantInt::get(Arg1->getType(), NewC))); 2131 } 2132 break; 2133 } 2134 2135 case Intrinsic::umul_with_overflow: 2136 case Intrinsic::smul_with_overflow: 2137 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2138 return I; 2139 LLVM_FALLTHROUGH; 2140 2141 case Intrinsic::usub_with_overflow: 2142 if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) 2143 return I; 2144 break; 2145 2146 case Intrinsic::ssub_with_overflow: { 2147 if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) 2148 return I; 2149 2150 Constant *C; 2151 Value *Arg0 = II->getArgOperand(0); 2152 Value *Arg1 = II->getArgOperand(1); 2153 // Given a constant C that is not the minimum signed value 2154 // for an integer of a given bit width: 2155 // 2156 // ssubo X, C -> saddo X, -C 2157 if (match(Arg1, m_Constant(C)) && C->isNotMinSignedValue()) { 2158 Value *NegVal = ConstantExpr::getNeg(C); 2159 // Build a saddo call that is equivalent to the discovered 2160 // ssubo call. 2161 return replaceInstUsesWith( 2162 *II, Builder.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow, 2163 Arg0, NegVal)); 2164 } 2165 2166 break; 2167 } 2168 2169 case Intrinsic::uadd_sat: 2170 case Intrinsic::sadd_sat: 2171 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2172 return I; 2173 LLVM_FALLTHROUGH; 2174 case Intrinsic::usub_sat: 2175 case Intrinsic::ssub_sat: { 2176 SaturatingInst *SI = cast<SaturatingInst>(II); 2177 Type *Ty = SI->getType(); 2178 Value *Arg0 = SI->getLHS(); 2179 Value *Arg1 = SI->getRHS(); 2180 2181 // Make use of known overflow information. 
2182 OverflowResult OR = computeOverflow(SI->getBinaryOp(), SI->isSigned(), 2183 Arg0, Arg1, SI); 2184 switch (OR) { 2185 case OverflowResult::MayOverflow: 2186 break; 2187 case OverflowResult::NeverOverflows: 2188 if (SI->isSigned()) 2189 return BinaryOperator::CreateNSW(SI->getBinaryOp(), Arg0, Arg1); 2190 else 2191 return BinaryOperator::CreateNUW(SI->getBinaryOp(), Arg0, Arg1); 2192 case OverflowResult::AlwaysOverflowsLow: { 2193 unsigned BitWidth = Ty->getScalarSizeInBits(); 2194 APInt Min = APSInt::getMinValue(BitWidth, !SI->isSigned()); 2195 return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Min)); 2196 } 2197 case OverflowResult::AlwaysOverflowsHigh: { 2198 unsigned BitWidth = Ty->getScalarSizeInBits(); 2199 APInt Max = APSInt::getMaxValue(BitWidth, !SI->isSigned()); 2200 return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Max)); 2201 } 2202 } 2203 2204 // ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN 2205 Constant *C; 2206 if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) && 2207 C->isNotMinSignedValue()) { 2208 Value *NegVal = ConstantExpr::getNeg(C); 2209 return replaceInstUsesWith( 2210 *II, Builder.CreateBinaryIntrinsic( 2211 Intrinsic::sadd_sat, Arg0, NegVal)); 2212 } 2213 2214 // sat(sat(X + Val2) + Val) -> sat(X + (Val+Val2)) 2215 // sat(sat(X - Val2) - Val) -> sat(X - (Val+Val2)) 2216 // if Val and Val2 have the same sign 2217 if (auto *Other = dyn_cast<IntrinsicInst>(Arg0)) { 2218 Value *X; 2219 const APInt *Val, *Val2; 2220 APInt NewVal; 2221 bool IsUnsigned = 2222 IID == Intrinsic::uadd_sat || IID == Intrinsic::usub_sat; 2223 if (Other->getIntrinsicID() == IID && 2224 match(Arg1, m_APInt(Val)) && 2225 match(Other->getArgOperand(0), m_Value(X)) && 2226 match(Other->getArgOperand(1), m_APInt(Val2))) { 2227 if (IsUnsigned) 2228 NewVal = Val->uadd_sat(*Val2); 2229 else if (Val->isNonNegative() == Val2->isNonNegative()) { 2230 bool Overflow; 2231 NewVal = Val->sadd_ov(*Val2, Overflow); 2232 if (Overflow) { 2233 // Both adds together may 
add more than SignedMaxValue 2234 // without saturating the final result. 2235 break; 2236 } 2237 } else { 2238 // Cannot fold saturated addition with different signs. 2239 break; 2240 } 2241 2242 return replaceInstUsesWith( 2243 *II, Builder.CreateBinaryIntrinsic( 2244 IID, X, ConstantInt::get(II->getType(), NewVal))); 2245 } 2246 } 2247 break; 2248 } 2249 2250 case Intrinsic::minnum: 2251 case Intrinsic::maxnum: 2252 case Intrinsic::minimum: 2253 case Intrinsic::maximum: { 2254 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2255 return I; 2256 Value *Arg0 = II->getArgOperand(0); 2257 Value *Arg1 = II->getArgOperand(1); 2258 Value *X, *Y; 2259 if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) && 2260 (Arg0->hasOneUse() || Arg1->hasOneUse())) { 2261 // If both operands are negated, invert the call and negate the result: 2262 // min(-X, -Y) --> -(max(X, Y)) 2263 // max(-X, -Y) --> -(min(X, Y)) 2264 Intrinsic::ID NewIID; 2265 switch (IID) { 2266 case Intrinsic::maxnum: 2267 NewIID = Intrinsic::minnum; 2268 break; 2269 case Intrinsic::minnum: 2270 NewIID = Intrinsic::maxnum; 2271 break; 2272 case Intrinsic::maximum: 2273 NewIID = Intrinsic::minimum; 2274 break; 2275 case Intrinsic::minimum: 2276 NewIID = Intrinsic::maximum; 2277 break; 2278 default: 2279 llvm_unreachable("unexpected intrinsic ID"); 2280 } 2281 Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II); 2282 Instruction *FNeg = UnaryOperator::CreateFNeg(NewCall); 2283 FNeg->copyIRFlags(II); 2284 return FNeg; 2285 } 2286 2287 // m(m(X, C2), C1) -> m(X, C) 2288 const APFloat *C1, *C2; 2289 if (auto *M = dyn_cast<IntrinsicInst>(Arg0)) { 2290 if (M->getIntrinsicID() == IID && match(Arg1, m_APFloat(C1)) && 2291 ((match(M->getArgOperand(0), m_Value(X)) && 2292 match(M->getArgOperand(1), m_APFloat(C2))) || 2293 (match(M->getArgOperand(1), m_Value(X)) && 2294 match(M->getArgOperand(0), m_APFloat(C2))))) { 2295 APFloat Res(0.0); 2296 switch (IID) { 2297 case 
Intrinsic::maxnum: 2298 Res = maxnum(*C1, *C2); 2299 break; 2300 case Intrinsic::minnum: 2301 Res = minnum(*C1, *C2); 2302 break; 2303 case Intrinsic::maximum: 2304 Res = maximum(*C1, *C2); 2305 break; 2306 case Intrinsic::minimum: 2307 Res = minimum(*C1, *C2); 2308 break; 2309 default: 2310 llvm_unreachable("unexpected intrinsic ID"); 2311 } 2312 Instruction *NewCall = Builder.CreateBinaryIntrinsic( 2313 IID, X, ConstantFP::get(Arg0->getType(), Res)); 2314 NewCall->copyIRFlags(II); 2315 return replaceInstUsesWith(*II, NewCall); 2316 } 2317 } 2318 2319 Value *ExtSrc0; 2320 Value *ExtSrc1; 2321 2322 // minnum (fpext x), (fpext y) -> minnum x, y 2323 // maxnum (fpext x), (fpext y) -> maxnum x, y 2324 if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc0)))) && 2325 match(II->getArgOperand(1), m_OneUse(m_FPExt(m_Value(ExtSrc1)))) && 2326 ExtSrc0->getType() == ExtSrc1->getType()) { 2327 Value *F = Intrinsic::getDeclaration(II->getModule(), II->getIntrinsicID(), 2328 { ExtSrc0->getType() }); 2329 CallInst *NewCall = Builder.CreateCall(F, { ExtSrc0, ExtSrc1 }); 2330 NewCall->copyFastMathFlags(II); 2331 NewCall->takeName(II); 2332 return new FPExtInst(NewCall, II->getType()); 2333 } 2334 2335 break; 2336 } 2337 case Intrinsic::fmuladd: { 2338 // Canonicalize fast fmuladd to the separate fmul + fadd. 2339 if (II->isFast()) { 2340 BuilderTy::FastMathFlagGuard Guard(Builder); 2341 Builder.setFastMathFlags(II->getFastMathFlags()); 2342 Value *Mul = Builder.CreateFMul(II->getArgOperand(0), 2343 II->getArgOperand(1)); 2344 Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2)); 2345 Add->takeName(II); 2346 return replaceInstUsesWith(*II, Add); 2347 } 2348 2349 // Try to simplify the underlying FMul. 
2350 if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1), 2351 II->getFastMathFlags(), 2352 SQ.getWithInstruction(II))) { 2353 auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); 2354 FAdd->copyFastMathFlags(II); 2355 return FAdd; 2356 } 2357 2358 LLVM_FALLTHROUGH; 2359 } 2360 case Intrinsic::fma: { 2361 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2362 return I; 2363 2364 // fma fneg(x), fneg(y), z -> fma x, y, z 2365 Value *Src0 = II->getArgOperand(0); 2366 Value *Src1 = II->getArgOperand(1); 2367 Value *X, *Y; 2368 if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) { 2369 replaceOperand(*II, 0, X); 2370 replaceOperand(*II, 1, Y); 2371 return II; 2372 } 2373 2374 // fma fabs(x), fabs(x), z -> fma x, x, z 2375 if (match(Src0, m_FAbs(m_Value(X))) && 2376 match(Src1, m_FAbs(m_Specific(X)))) { 2377 replaceOperand(*II, 0, X); 2378 replaceOperand(*II, 1, X); 2379 return II; 2380 } 2381 2382 // Try to simplify the underlying FMul. We can only apply simplifications 2383 // that do not require rounding. 2384 if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1), 2385 II->getFastMathFlags(), 2386 SQ.getWithInstruction(II))) { 2387 auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); 2388 FAdd->copyFastMathFlags(II); 2389 return FAdd; 2390 } 2391 2392 break; 2393 } 2394 case Intrinsic::copysign: { 2395 if (SignBitMustBeZero(II->getArgOperand(1), &TLI)) { 2396 // If we know that the sign argument is positive, reduce to FABS: 2397 // copysign X, Pos --> fabs X 2398 Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, 2399 II->getArgOperand(0), II); 2400 return replaceInstUsesWith(*II, Fabs); 2401 } 2402 // TODO: There should be a ValueTracking sibling like SignBitMustBeOne. 
2403 const APFloat *C; 2404 if (match(II->getArgOperand(1), m_APFloat(C)) && C->isNegative()) { 2405 // If we know that the sign argument is negative, reduce to FNABS: 2406 // copysign X, Neg --> fneg (fabs X) 2407 Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, 2408 II->getArgOperand(0), II); 2409 return replaceInstUsesWith(*II, Builder.CreateFNegFMF(Fabs, II)); 2410 } 2411 2412 // Propagate sign argument through nested calls: 2413 // copysign X, (copysign ?, SignArg) --> copysign X, SignArg 2414 Value *SignArg; 2415 if (match(II->getArgOperand(1), 2416 m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(SignArg)))) 2417 return replaceOperand(*II, 1, SignArg); 2418 2419 break; 2420 } 2421 case Intrinsic::fabs: { 2422 Value *Cond; 2423 Constant *LHS, *RHS; 2424 if (match(II->getArgOperand(0), 2425 m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) { 2426 CallInst *Call0 = Builder.CreateCall(II->getCalledFunction(), {LHS}); 2427 CallInst *Call1 = Builder.CreateCall(II->getCalledFunction(), {RHS}); 2428 return SelectInst::Create(Cond, Call0, Call1); 2429 } 2430 2431 LLVM_FALLTHROUGH; 2432 } 2433 case Intrinsic::ceil: 2434 case Intrinsic::floor: 2435 case Intrinsic::round: 2436 case Intrinsic::nearbyint: 2437 case Intrinsic::rint: 2438 case Intrinsic::trunc: { 2439 Value *ExtSrc; 2440 if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) { 2441 // Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x) 2442 Value *NarrowII = Builder.CreateUnaryIntrinsic(IID, ExtSrc, II); 2443 return new FPExtInst(NarrowII, II->getType()); 2444 } 2445 break; 2446 } 2447 case Intrinsic::cos: 2448 case Intrinsic::amdgcn_cos: { 2449 Value *X; 2450 Value *Src = II->getArgOperand(0); 2451 if (match(Src, m_FNeg(m_Value(X))) || match(Src, m_FAbs(m_Value(X)))) { 2452 // cos(-x) -> cos(x) 2453 // cos(fabs(x)) -> cos(x) 2454 return replaceOperand(*II, 0, X); 2455 } 2456 break; 2457 } 2458 case Intrinsic::sin: { 2459 Value *X; 2460 if 
(match(II->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) { 2461 // sin(-x) --> -sin(x) 2462 Value *NewSin = Builder.CreateUnaryIntrinsic(Intrinsic::sin, X, II); 2463 Instruction *FNeg = UnaryOperator::CreateFNeg(NewSin); 2464 FNeg->copyFastMathFlags(II); 2465 return FNeg; 2466 } 2467 break; 2468 } 2469 case Intrinsic::ppc_altivec_lvx: 2470 case Intrinsic::ppc_altivec_lvxl: 2471 // Turn PPC lvx -> load if the pointer is known aligned. 2472 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, 2473 &DT) >= 16) { 2474 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2475 PointerType::getUnqual(II->getType())); 2476 return new LoadInst(II->getType(), Ptr); 2477 } 2478 break; 2479 case Intrinsic::ppc_vsx_lxvw4x: 2480 case Intrinsic::ppc_vsx_lxvd2x: { 2481 // Turn PPC VSX loads into normal loads. 2482 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2483 PointerType::getUnqual(II->getType())); 2484 return new LoadInst(II->getType(), Ptr, Twine(""), false, Align(1)); 2485 } 2486 case Intrinsic::ppc_altivec_stvx: 2487 case Intrinsic::ppc_altivec_stvxl: 2488 // Turn stvx -> store if the pointer is known aligned. 2489 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, 2490 &DT) >= 16) { 2491 Type *OpPtrTy = 2492 PointerType::getUnqual(II->getArgOperand(0)->getType()); 2493 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2494 return new StoreInst(II->getArgOperand(0), Ptr); 2495 } 2496 break; 2497 case Intrinsic::ppc_vsx_stxvw4x: 2498 case Intrinsic::ppc_vsx_stxvd2x: { 2499 // Turn PPC VSX stores into normal stores. 2500 Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); 2501 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2502 return new StoreInst(II->getArgOperand(0), Ptr, false, Align(1)); 2503 } 2504 case Intrinsic::ppc_qpx_qvlfs: 2505 // Turn PPC QPX qvlfs -> load if the pointer is known aligned. 
2506 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, 2507 &DT) >= 16) { 2508 Type *VTy = VectorType::get(Builder.getFloatTy(), 2509 II->getType()->getVectorNumElements()); 2510 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2511 PointerType::getUnqual(VTy)); 2512 Value *Load = Builder.CreateLoad(VTy, Ptr); 2513 return new FPExtInst(Load, II->getType()); 2514 } 2515 break; 2516 case Intrinsic::ppc_qpx_qvlfd: 2517 // Turn PPC QPX qvlfd -> load if the pointer is known aligned. 2518 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC, 2519 &DT) >= 32) { 2520 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2521 PointerType::getUnqual(II->getType())); 2522 return new LoadInst(II->getType(), Ptr); 2523 } 2524 break; 2525 case Intrinsic::ppc_qpx_qvstfs: 2526 // Turn PPC QPX qvstfs -> store if the pointer is known aligned. 2527 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, 2528 &DT) >= 16) { 2529 Type *VTy = VectorType::get(Builder.getFloatTy(), 2530 II->getArgOperand(0)->getType()->getVectorNumElements()); 2531 Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy); 2532 Type *OpPtrTy = PointerType::getUnqual(VTy); 2533 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2534 return new StoreInst(TOp, Ptr); 2535 } 2536 break; 2537 case Intrinsic::ppc_qpx_qvstfd: 2538 // Turn PPC QPX qvstfd -> store if the pointer is known aligned. 
2539 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, &AC, 2540 &DT) >= 32) { 2541 Type *OpPtrTy = 2542 PointerType::getUnqual(II->getArgOperand(0)->getType()); 2543 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2544 return new StoreInst(II->getArgOperand(0), Ptr); 2545 } 2546 break; 2547 2548 case Intrinsic::x86_bmi_bextr_32: 2549 case Intrinsic::x86_bmi_bextr_64: 2550 case Intrinsic::x86_tbm_bextri_u32: 2551 case Intrinsic::x86_tbm_bextri_u64: 2552 // If the RHS is a constant we can try some simplifications. 2553 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2554 uint64_t Shift = C->getZExtValue(); 2555 uint64_t Length = (Shift >> 8) & 0xff; 2556 Shift &= 0xff; 2557 unsigned BitWidth = II->getType()->getIntegerBitWidth(); 2558 // If the length is 0 or the shift is out of range, replace with zero. 2559 if (Length == 0 || Shift >= BitWidth) 2560 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2561 // If the LHS is also a constant, we can completely constant fold this. 2562 if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2563 uint64_t Result = InC->getZExtValue() >> Shift; 2564 if (Length > BitWidth) 2565 Length = BitWidth; 2566 Result &= maskTrailingOnes<uint64_t>(Length); 2567 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2568 } 2569 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we 2570 // are only masking bits that a shift already cleared? 2571 } 2572 break; 2573 2574 case Intrinsic::x86_bmi_bzhi_32: 2575 case Intrinsic::x86_bmi_bzhi_64: 2576 // If the RHS is a constant we can try some simplifications. 
2577 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2578 uint64_t Index = C->getZExtValue() & 0xff; 2579 unsigned BitWidth = II->getType()->getIntegerBitWidth(); 2580 if (Index >= BitWidth) 2581 return replaceInstUsesWith(CI, II->getArgOperand(0)); 2582 if (Index == 0) 2583 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2584 // If the LHS is also a constant, we can completely constant fold this. 2585 if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2586 uint64_t Result = InC->getZExtValue(); 2587 Result &= maskTrailingOnes<uint64_t>(Index); 2588 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2589 } 2590 // TODO should we convert this to an AND if the RHS is constant? 2591 } 2592 break; 2593 case Intrinsic::x86_bmi_pext_32: 2594 case Intrinsic::x86_bmi_pext_64: 2595 if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2596 if (MaskC->isNullValue()) 2597 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2598 if (MaskC->isAllOnesValue()) 2599 return replaceInstUsesWith(CI, II->getArgOperand(0)); 2600 2601 if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2602 uint64_t Src = SrcC->getZExtValue(); 2603 uint64_t Mask = MaskC->getZExtValue(); 2604 uint64_t Result = 0; 2605 uint64_t BitToSet = 1; 2606 2607 while (Mask) { 2608 // Isolate lowest set bit. 2609 uint64_t BitToTest = Mask & -Mask; 2610 if (BitToTest & Src) 2611 Result |= BitToSet; 2612 2613 BitToSet <<= 1; 2614 // Clear lowest set bit. 
2615 Mask &= Mask - 1; 2616 } 2617 2618 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2619 } 2620 } 2621 break; 2622 case Intrinsic::x86_bmi_pdep_32: 2623 case Intrinsic::x86_bmi_pdep_64: 2624 if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2625 if (MaskC->isNullValue()) 2626 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2627 if (MaskC->isAllOnesValue()) 2628 return replaceInstUsesWith(CI, II->getArgOperand(0)); 2629 2630 if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2631 uint64_t Src = SrcC->getZExtValue(); 2632 uint64_t Mask = MaskC->getZExtValue(); 2633 uint64_t Result = 0; 2634 uint64_t BitToTest = 1; 2635 2636 while (Mask) { 2637 // Isolate lowest set bit. 2638 uint64_t BitToSet = Mask & -Mask; 2639 if (BitToTest & Src) 2640 Result |= BitToSet; 2641 2642 BitToTest <<= 1; 2643 // Clear lowest set bit; 2644 Mask &= Mask - 1; 2645 } 2646 2647 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2648 } 2649 } 2650 break; 2651 2652 case Intrinsic::x86_sse_cvtss2si: 2653 case Intrinsic::x86_sse_cvtss2si64: 2654 case Intrinsic::x86_sse_cvttss2si: 2655 case Intrinsic::x86_sse_cvttss2si64: 2656 case Intrinsic::x86_sse2_cvtsd2si: 2657 case Intrinsic::x86_sse2_cvtsd2si64: 2658 case Intrinsic::x86_sse2_cvttsd2si: 2659 case Intrinsic::x86_sse2_cvttsd2si64: 2660 case Intrinsic::x86_avx512_vcvtss2si32: 2661 case Intrinsic::x86_avx512_vcvtss2si64: 2662 case Intrinsic::x86_avx512_vcvtss2usi32: 2663 case Intrinsic::x86_avx512_vcvtss2usi64: 2664 case Intrinsic::x86_avx512_vcvtsd2si32: 2665 case Intrinsic::x86_avx512_vcvtsd2si64: 2666 case Intrinsic::x86_avx512_vcvtsd2usi32: 2667 case Intrinsic::x86_avx512_vcvtsd2usi64: 2668 case Intrinsic::x86_avx512_cvttss2si: 2669 case Intrinsic::x86_avx512_cvttss2si64: 2670 case Intrinsic::x86_avx512_cvttss2usi: 2671 case Intrinsic::x86_avx512_cvttss2usi64: 2672 case Intrinsic::x86_avx512_cvttsd2si: 2673 case 
Intrinsic::x86_avx512_cvttsd2si64: 2674 case Intrinsic::x86_avx512_cvttsd2usi: 2675 case Intrinsic::x86_avx512_cvttsd2usi64: { 2676 // These intrinsics only demand the 0th element of their input vectors. If 2677 // we can simplify the input based on that, do so now. 2678 Value *Arg = II->getArgOperand(0); 2679 unsigned VWidth = Arg->getType()->getVectorNumElements(); 2680 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) 2681 return replaceOperand(*II, 0, V); 2682 break; 2683 } 2684 2685 case Intrinsic::x86_mmx_pmovmskb: 2686 case Intrinsic::x86_sse_movmsk_ps: 2687 case Intrinsic::x86_sse2_movmsk_pd: 2688 case Intrinsic::x86_sse2_pmovmskb_128: 2689 case Intrinsic::x86_avx_movmsk_pd_256: 2690 case Intrinsic::x86_avx_movmsk_ps_256: 2691 case Intrinsic::x86_avx2_pmovmskb: 2692 if (Value *V = simplifyX86movmsk(*II, Builder)) 2693 return replaceInstUsesWith(*II, V); 2694 break; 2695 2696 case Intrinsic::x86_sse_comieq_ss: 2697 case Intrinsic::x86_sse_comige_ss: 2698 case Intrinsic::x86_sse_comigt_ss: 2699 case Intrinsic::x86_sse_comile_ss: 2700 case Intrinsic::x86_sse_comilt_ss: 2701 case Intrinsic::x86_sse_comineq_ss: 2702 case Intrinsic::x86_sse_ucomieq_ss: 2703 case Intrinsic::x86_sse_ucomige_ss: 2704 case Intrinsic::x86_sse_ucomigt_ss: 2705 case Intrinsic::x86_sse_ucomile_ss: 2706 case Intrinsic::x86_sse_ucomilt_ss: 2707 case Intrinsic::x86_sse_ucomineq_ss: 2708 case Intrinsic::x86_sse2_comieq_sd: 2709 case Intrinsic::x86_sse2_comige_sd: 2710 case Intrinsic::x86_sse2_comigt_sd: 2711 case Intrinsic::x86_sse2_comile_sd: 2712 case Intrinsic::x86_sse2_comilt_sd: 2713 case Intrinsic::x86_sse2_comineq_sd: 2714 case Intrinsic::x86_sse2_ucomieq_sd: 2715 case Intrinsic::x86_sse2_ucomige_sd: 2716 case Intrinsic::x86_sse2_ucomigt_sd: 2717 case Intrinsic::x86_sse2_ucomile_sd: 2718 case Intrinsic::x86_sse2_ucomilt_sd: 2719 case Intrinsic::x86_sse2_ucomineq_sd: 2720 case Intrinsic::x86_avx512_vcomi_ss: 2721 case Intrinsic::x86_avx512_vcomi_sd: 2722 case 
Intrinsic::x86_avx512_mask_cmp_ss: 2723 case Intrinsic::x86_avx512_mask_cmp_sd: { 2724 // These intrinsics only demand the 0th element of their input vectors. If 2725 // we can simplify the input based on that, do so now. 2726 bool MadeChange = false; 2727 Value *Arg0 = II->getArgOperand(0); 2728 Value *Arg1 = II->getArgOperand(1); 2729 unsigned VWidth = Arg0->getType()->getVectorNumElements(); 2730 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { 2731 replaceOperand(*II, 0, V); 2732 MadeChange = true; 2733 } 2734 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { 2735 replaceOperand(*II, 1, V); 2736 MadeChange = true; 2737 } 2738 if (MadeChange) 2739 return II; 2740 break; 2741 } 2742 case Intrinsic::x86_avx512_cmp_pd_128: 2743 case Intrinsic::x86_avx512_cmp_pd_256: 2744 case Intrinsic::x86_avx512_cmp_pd_512: 2745 case Intrinsic::x86_avx512_cmp_ps_128: 2746 case Intrinsic::x86_avx512_cmp_ps_256: 2747 case Intrinsic::x86_avx512_cmp_ps_512: { 2748 // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a) 2749 Value *Arg0 = II->getArgOperand(0); 2750 Value *Arg1 = II->getArgOperand(1); 2751 bool Arg0IsZero = match(Arg0, m_PosZeroFP()); 2752 if (Arg0IsZero) 2753 std::swap(Arg0, Arg1); 2754 Value *A, *B; 2755 // This fold requires only the NINF(not +/- inf) since inf minus 2756 // inf is nan. 2757 // NSZ(No Signed Zeros) is not needed because zeros of any sign are 2758 // equal for both compares. 2759 // NNAN is not needed because nans compare the same for both compares. 2760 // The compare intrinsic uses the above assumptions and therefore 2761 // doesn't require additional flags. 
2762 if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) && 2763 match(Arg1, m_PosZeroFP()) && isa<Instruction>(Arg0) && 2764 cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) { 2765 if (Arg0IsZero) 2766 std::swap(A, B); 2767 replaceOperand(*II, 0, A); 2768 replaceOperand(*II, 1, B); 2769 return II; 2770 } 2771 break; 2772 } 2773 2774 case Intrinsic::x86_avx512_add_ps_512: 2775 case Intrinsic::x86_avx512_div_ps_512: 2776 case Intrinsic::x86_avx512_mul_ps_512: 2777 case Intrinsic::x86_avx512_sub_ps_512: 2778 case Intrinsic::x86_avx512_add_pd_512: 2779 case Intrinsic::x86_avx512_div_pd_512: 2780 case Intrinsic::x86_avx512_mul_pd_512: 2781 case Intrinsic::x86_avx512_sub_pd_512: 2782 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2783 // IR operations. 2784 if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(2))) { 2785 if (R->getValue() == 4) { 2786 Value *Arg0 = II->getArgOperand(0); 2787 Value *Arg1 = II->getArgOperand(1); 2788 2789 Value *V; 2790 switch (IID) { 2791 default: llvm_unreachable("Case stmts out of sync!"); 2792 case Intrinsic::x86_avx512_add_ps_512: 2793 case Intrinsic::x86_avx512_add_pd_512: 2794 V = Builder.CreateFAdd(Arg0, Arg1); 2795 break; 2796 case Intrinsic::x86_avx512_sub_ps_512: 2797 case Intrinsic::x86_avx512_sub_pd_512: 2798 V = Builder.CreateFSub(Arg0, Arg1); 2799 break; 2800 case Intrinsic::x86_avx512_mul_ps_512: 2801 case Intrinsic::x86_avx512_mul_pd_512: 2802 V = Builder.CreateFMul(Arg0, Arg1); 2803 break; 2804 case Intrinsic::x86_avx512_div_ps_512: 2805 case Intrinsic::x86_avx512_div_pd_512: 2806 V = Builder.CreateFDiv(Arg0, Arg1); 2807 break; 2808 } 2809 2810 return replaceInstUsesWith(*II, V); 2811 } 2812 } 2813 break; 2814 2815 case Intrinsic::x86_avx512_mask_add_ss_round: 2816 case Intrinsic::x86_avx512_mask_div_ss_round: 2817 case Intrinsic::x86_avx512_mask_mul_ss_round: 2818 case Intrinsic::x86_avx512_mask_sub_ss_round: 2819 case Intrinsic::x86_avx512_mask_add_sd_round: 2820 case 
Intrinsic::x86_avx512_mask_div_sd_round: 2821 case Intrinsic::x86_avx512_mask_mul_sd_round: 2822 case Intrinsic::x86_avx512_mask_sub_sd_round: 2823 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2824 // IR operations. 2825 if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) { 2826 if (R->getValue() == 4) { 2827 // Extract the element as scalars. 2828 Value *Arg0 = II->getArgOperand(0); 2829 Value *Arg1 = II->getArgOperand(1); 2830 Value *LHS = Builder.CreateExtractElement(Arg0, (uint64_t)0); 2831 Value *RHS = Builder.CreateExtractElement(Arg1, (uint64_t)0); 2832 2833 Value *V; 2834 switch (IID) { 2835 default: llvm_unreachable("Case stmts out of sync!"); 2836 case Intrinsic::x86_avx512_mask_add_ss_round: 2837 case Intrinsic::x86_avx512_mask_add_sd_round: 2838 V = Builder.CreateFAdd(LHS, RHS); 2839 break; 2840 case Intrinsic::x86_avx512_mask_sub_ss_round: 2841 case Intrinsic::x86_avx512_mask_sub_sd_round: 2842 V = Builder.CreateFSub(LHS, RHS); 2843 break; 2844 case Intrinsic::x86_avx512_mask_mul_ss_round: 2845 case Intrinsic::x86_avx512_mask_mul_sd_round: 2846 V = Builder.CreateFMul(LHS, RHS); 2847 break; 2848 case Intrinsic::x86_avx512_mask_div_ss_round: 2849 case Intrinsic::x86_avx512_mask_div_sd_round: 2850 V = Builder.CreateFDiv(LHS, RHS); 2851 break; 2852 } 2853 2854 // Handle the masking aspect of the intrinsic. 2855 Value *Mask = II->getArgOperand(3); 2856 auto *C = dyn_cast<ConstantInt>(Mask); 2857 // We don't need a select if we know the mask bit is a 1. 2858 if (!C || !C->getValue()[0]) { 2859 // Cast the mask to an i1 vector and then extract the lowest element. 2860 auto *MaskTy = VectorType::get(Builder.getInt1Ty(), 2861 cast<IntegerType>(Mask->getType())->getBitWidth()); 2862 Mask = Builder.CreateBitCast(Mask, MaskTy); 2863 Mask = Builder.CreateExtractElement(Mask, (uint64_t)0); 2864 // Extract the lowest element from the passthru operand. 
2865 Value *Passthru = Builder.CreateExtractElement(II->getArgOperand(2), 2866 (uint64_t)0); 2867 V = Builder.CreateSelect(Mask, V, Passthru); 2868 } 2869 2870 // Insert the result back into the original argument 0. 2871 V = Builder.CreateInsertElement(Arg0, V, (uint64_t)0); 2872 2873 return replaceInstUsesWith(*II, V); 2874 } 2875 } 2876 break; 2877 2878 // Constant fold ashr( <A x Bi>, Ci ). 2879 // Constant fold lshr( <A x Bi>, Ci ). 2880 // Constant fold shl( <A x Bi>, Ci ). 2881 case Intrinsic::x86_sse2_psrai_d: 2882 case Intrinsic::x86_sse2_psrai_w: 2883 case Intrinsic::x86_avx2_psrai_d: 2884 case Intrinsic::x86_avx2_psrai_w: 2885 case Intrinsic::x86_avx512_psrai_q_128: 2886 case Intrinsic::x86_avx512_psrai_q_256: 2887 case Intrinsic::x86_avx512_psrai_d_512: 2888 case Intrinsic::x86_avx512_psrai_q_512: 2889 case Intrinsic::x86_avx512_psrai_w_512: 2890 case Intrinsic::x86_sse2_psrli_d: 2891 case Intrinsic::x86_sse2_psrli_q: 2892 case Intrinsic::x86_sse2_psrli_w: 2893 case Intrinsic::x86_avx2_psrli_d: 2894 case Intrinsic::x86_avx2_psrli_q: 2895 case Intrinsic::x86_avx2_psrli_w: 2896 case Intrinsic::x86_avx512_psrli_d_512: 2897 case Intrinsic::x86_avx512_psrli_q_512: 2898 case Intrinsic::x86_avx512_psrli_w_512: 2899 case Intrinsic::x86_sse2_pslli_d: 2900 case Intrinsic::x86_sse2_pslli_q: 2901 case Intrinsic::x86_sse2_pslli_w: 2902 case Intrinsic::x86_avx2_pslli_d: 2903 case Intrinsic::x86_avx2_pslli_q: 2904 case Intrinsic::x86_avx2_pslli_w: 2905 case Intrinsic::x86_avx512_pslli_d_512: 2906 case Intrinsic::x86_avx512_pslli_q_512: 2907 case Intrinsic::x86_avx512_pslli_w_512: 2908 if (Value *V = simplifyX86immShift(*II, Builder)) 2909 return replaceInstUsesWith(*II, V); 2910 break; 2911 2912 case Intrinsic::x86_sse2_psra_d: 2913 case Intrinsic::x86_sse2_psra_w: 2914 case Intrinsic::x86_avx2_psra_d: 2915 case Intrinsic::x86_avx2_psra_w: 2916 case Intrinsic::x86_avx512_psra_q_128: 2917 case Intrinsic::x86_avx512_psra_q_256: 2918 case Intrinsic::x86_avx512_psra_d_512: 
2919 case Intrinsic::x86_avx512_psra_q_512: 2920 case Intrinsic::x86_avx512_psra_w_512: 2921 case Intrinsic::x86_sse2_psrl_d: 2922 case Intrinsic::x86_sse2_psrl_q: 2923 case Intrinsic::x86_sse2_psrl_w: 2924 case Intrinsic::x86_avx2_psrl_d: 2925 case Intrinsic::x86_avx2_psrl_q: 2926 case Intrinsic::x86_avx2_psrl_w: 2927 case Intrinsic::x86_avx512_psrl_d_512: 2928 case Intrinsic::x86_avx512_psrl_q_512: 2929 case Intrinsic::x86_avx512_psrl_w_512: 2930 case Intrinsic::x86_sse2_psll_d: 2931 case Intrinsic::x86_sse2_psll_q: 2932 case Intrinsic::x86_sse2_psll_w: 2933 case Intrinsic::x86_avx2_psll_d: 2934 case Intrinsic::x86_avx2_psll_q: 2935 case Intrinsic::x86_avx2_psll_w: 2936 case Intrinsic::x86_avx512_psll_d_512: 2937 case Intrinsic::x86_avx512_psll_q_512: 2938 case Intrinsic::x86_avx512_psll_w_512: { 2939 if (Value *V = simplifyX86immShift(*II, Builder)) 2940 return replaceInstUsesWith(*II, V); 2941 2942 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector 2943 // operand to compute the shift amount. 
2944 Value *Arg1 = II->getArgOperand(1); 2945 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && 2946 "Unexpected packed shift size"); 2947 unsigned VWidth = Arg1->getType()->getVectorNumElements(); 2948 2949 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) 2950 return replaceOperand(*II, 1, V); 2951 break; 2952 } 2953 2954 case Intrinsic::x86_avx2_psllv_d: 2955 case Intrinsic::x86_avx2_psllv_d_256: 2956 case Intrinsic::x86_avx2_psllv_q: 2957 case Intrinsic::x86_avx2_psllv_q_256: 2958 case Intrinsic::x86_avx512_psllv_d_512: 2959 case Intrinsic::x86_avx512_psllv_q_512: 2960 case Intrinsic::x86_avx512_psllv_w_128: 2961 case Intrinsic::x86_avx512_psllv_w_256: 2962 case Intrinsic::x86_avx512_psllv_w_512: 2963 case Intrinsic::x86_avx2_psrav_d: 2964 case Intrinsic::x86_avx2_psrav_d_256: 2965 case Intrinsic::x86_avx512_psrav_q_128: 2966 case Intrinsic::x86_avx512_psrav_q_256: 2967 case Intrinsic::x86_avx512_psrav_d_512: 2968 case Intrinsic::x86_avx512_psrav_q_512: 2969 case Intrinsic::x86_avx512_psrav_w_128: 2970 case Intrinsic::x86_avx512_psrav_w_256: 2971 case Intrinsic::x86_avx512_psrav_w_512: 2972 case Intrinsic::x86_avx2_psrlv_d: 2973 case Intrinsic::x86_avx2_psrlv_d_256: 2974 case Intrinsic::x86_avx2_psrlv_q: 2975 case Intrinsic::x86_avx2_psrlv_q_256: 2976 case Intrinsic::x86_avx512_psrlv_d_512: 2977 case Intrinsic::x86_avx512_psrlv_q_512: 2978 case Intrinsic::x86_avx512_psrlv_w_128: 2979 case Intrinsic::x86_avx512_psrlv_w_256: 2980 case Intrinsic::x86_avx512_psrlv_w_512: 2981 if (Value *V = simplifyX86varShift(*II, Builder)) 2982 return replaceInstUsesWith(*II, V); 2983 break; 2984 2985 case Intrinsic::x86_sse2_packssdw_128: 2986 case Intrinsic::x86_sse2_packsswb_128: 2987 case Intrinsic::x86_avx2_packssdw: 2988 case Intrinsic::x86_avx2_packsswb: 2989 case Intrinsic::x86_avx512_packssdw_512: 2990 case Intrinsic::x86_avx512_packsswb_512: 2991 if (Value *V = simplifyX86pack(*II, Builder, true)) 2992 return replaceInstUsesWith(*II, V); 
2993 break; 2994 2995 case Intrinsic::x86_sse2_packuswb_128: 2996 case Intrinsic::x86_sse41_packusdw: 2997 case Intrinsic::x86_avx2_packusdw: 2998 case Intrinsic::x86_avx2_packuswb: 2999 case Intrinsic::x86_avx512_packusdw_512: 3000 case Intrinsic::x86_avx512_packuswb_512: 3001 if (Value *V = simplifyX86pack(*II, Builder, false)) 3002 return replaceInstUsesWith(*II, V); 3003 break; 3004 3005 case Intrinsic::x86_pclmulqdq: 3006 case Intrinsic::x86_pclmulqdq_256: 3007 case Intrinsic::x86_pclmulqdq_512: { 3008 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) { 3009 unsigned Imm = C->getZExtValue(); 3010 3011 bool MadeChange = false; 3012 Value *Arg0 = II->getArgOperand(0); 3013 Value *Arg1 = II->getArgOperand(1); 3014 unsigned VWidth = Arg0->getType()->getVectorNumElements(); 3015 3016 APInt UndefElts1(VWidth, 0); 3017 APInt DemandedElts1 = APInt::getSplat(VWidth, 3018 APInt(2, (Imm & 0x01) ? 2 : 1)); 3019 if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts1, 3020 UndefElts1)) { 3021 replaceOperand(*II, 0, V); 3022 MadeChange = true; 3023 } 3024 3025 APInt UndefElts2(VWidth, 0); 3026 APInt DemandedElts2 = APInt::getSplat(VWidth, 3027 APInt(2, (Imm & 0x10) ? 2 : 1)); 3028 if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts2, 3029 UndefElts2)) { 3030 replaceOperand(*II, 1, V); 3031 MadeChange = true; 3032 } 3033 3034 // If either input elements are undef, the result is zero. 
3035 if (DemandedElts1.isSubsetOf(UndefElts1) || 3036 DemandedElts2.isSubsetOf(UndefElts2)) 3037 return replaceInstUsesWith(*II, 3038 ConstantAggregateZero::get(II->getType())); 3039 3040 if (MadeChange) 3041 return II; 3042 } 3043 break; 3044 } 3045 3046 case Intrinsic::x86_sse41_insertps: 3047 if (Value *V = simplifyX86insertps(*II, Builder)) 3048 return replaceInstUsesWith(*II, V); 3049 break; 3050 3051 case Intrinsic::x86_sse4a_extrq: { 3052 Value *Op0 = II->getArgOperand(0); 3053 Value *Op1 = II->getArgOperand(1); 3054 unsigned VWidth0 = Op0->getType()->getVectorNumElements(); 3055 unsigned VWidth1 = Op1->getType()->getVectorNumElements(); 3056 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 3057 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 3058 VWidth1 == 16 && "Unexpected operand sizes"); 3059 3060 // See if we're dealing with constant values. 3061 Constant *C1 = dyn_cast<Constant>(Op1); 3062 ConstantInt *CILength = 3063 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 3064 : nullptr; 3065 ConstantInt *CIIndex = 3066 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 3067 : nullptr; 3068 3069 // Attempt to simplify to a constant, shuffle vector or EXTRQI call. 3070 if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) 3071 return replaceInstUsesWith(*II, V); 3072 3073 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector 3074 // operands and the lowest 16-bits of the second. 3075 bool MadeChange = false; 3076 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 3077 replaceOperand(*II, 0, V); 3078 MadeChange = true; 3079 } 3080 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { 3081 replaceOperand(*II, 1, V); 3082 MadeChange = true; 3083 } 3084 if (MadeChange) 3085 return II; 3086 break; 3087 } 3088 3089 case Intrinsic::x86_sse4a_extrqi: { 3090 // EXTRQI: Extract Length bits starting from Index. 
Zero pad the remaining 3091 // bits of the lower 64-bits. The upper 64-bits are undefined. 3092 Value *Op0 = II->getArgOperand(0); 3093 unsigned VWidth = Op0->getType()->getVectorNumElements(); 3094 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 3095 "Unexpected operand size"); 3096 3097 // See if we're dealing with constant values. 3098 ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1)); 3099 ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3100 3101 // Attempt to simplify to a constant or shuffle vector. 3102 if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) 3103 return replaceInstUsesWith(*II, V); 3104 3105 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector 3106 // operand. 3107 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) 3108 return replaceOperand(*II, 0, V); 3109 break; 3110 } 3111 3112 case Intrinsic::x86_sse4a_insertq: { 3113 Value *Op0 = II->getArgOperand(0); 3114 Value *Op1 = II->getArgOperand(1); 3115 unsigned VWidth = Op0->getType()->getVectorNumElements(); 3116 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 3117 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 3118 Op1->getType()->getVectorNumElements() == 2 && 3119 "Unexpected operand size"); 3120 3121 // See if we're dealing with constant values. 3122 Constant *C1 = dyn_cast<Constant>(Op1); 3123 ConstantInt *CI11 = 3124 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 3125 : nullptr; 3126 3127 // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 3128 if (CI11) { 3129 const APInt &V11 = CI11->getValue(); 3130 APInt Len = V11.zextOrTrunc(6); 3131 APInt Idx = V11.lshr(8).zextOrTrunc(6); 3132 if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) 3133 return replaceInstUsesWith(*II, V); 3134 } 3135 3136 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector 3137 // operand. 
3138 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) 3139 return replaceOperand(*II, 0, V); 3140 break; 3141 } 3142 3143 case Intrinsic::x86_sse4a_insertqi: { 3144 // INSERTQI: Extract lowest Length bits from lower half of second source and 3145 // insert over first source starting at Index bit. The upper 64-bits are 3146 // undefined. 3147 Value *Op0 = II->getArgOperand(0); 3148 Value *Op1 = II->getArgOperand(1); 3149 unsigned VWidth0 = Op0->getType()->getVectorNumElements(); 3150 unsigned VWidth1 = Op1->getType()->getVectorNumElements(); 3151 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 3152 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 3153 VWidth1 == 2 && "Unexpected operand sizes"); 3154 3155 // See if we're dealing with constant values. 3156 ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3157 ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3)); 3158 3159 // Attempt to simplify to a constant or shuffle vector. 3160 if (CILength && CIIndex) { 3161 APInt Len = CILength->getValue().zextOrTrunc(6); 3162 APInt Idx = CIIndex->getValue().zextOrTrunc(6); 3163 if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) 3164 return replaceInstUsesWith(*II, V); 3165 } 3166 3167 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector 3168 // operands. 
3169 bool MadeChange = false; 3170 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 3171 replaceOperand(*II, 0, V); 3172 MadeChange = true; 3173 } 3174 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { 3175 replaceOperand(*II, 1, V); 3176 MadeChange = true; 3177 } 3178 if (MadeChange) 3179 return II; 3180 break; 3181 } 3182 3183 case Intrinsic::x86_sse41_pblendvb: 3184 case Intrinsic::x86_sse41_blendvps: 3185 case Intrinsic::x86_sse41_blendvpd: 3186 case Intrinsic::x86_avx_blendv_ps_256: 3187 case Intrinsic::x86_avx_blendv_pd_256: 3188 case Intrinsic::x86_avx2_pblendvb: { 3189 // fold (blend A, A, Mask) -> A 3190 Value *Op0 = II->getArgOperand(0); 3191 Value *Op1 = II->getArgOperand(1); 3192 Value *Mask = II->getArgOperand(2); 3193 if (Op0 == Op1) 3194 return replaceInstUsesWith(CI, Op0); 3195 3196 // Zero Mask - select 1st argument. 3197 if (isa<ConstantAggregateZero>(Mask)) 3198 return replaceInstUsesWith(CI, Op0); 3199 3200 // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 3201 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { 3202 Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); 3203 return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); 3204 } 3205 3206 // Convert to a vector select if we can bypass casts and find a boolean 3207 // vector condition value. 
3208 Value *BoolVec; 3209 Mask = peekThroughBitcast(Mask); 3210 if (match(Mask, m_SExt(m_Value(BoolVec))) && 3211 BoolVec->getType()->isVectorTy() && 3212 BoolVec->getType()->getScalarSizeInBits() == 1) { 3213 assert(Mask->getType()->getPrimitiveSizeInBits() == 3214 II->getType()->getPrimitiveSizeInBits() && 3215 "Not expecting mask and operands with different sizes"); 3216 3217 unsigned NumMaskElts = Mask->getType()->getVectorNumElements(); 3218 unsigned NumOperandElts = II->getType()->getVectorNumElements(); 3219 if (NumMaskElts == NumOperandElts) 3220 return SelectInst::Create(BoolVec, Op1, Op0); 3221 3222 // If the mask has less elements than the operands, each mask bit maps to 3223 // multiple elements of the operands. Bitcast back and forth. 3224 if (NumMaskElts < NumOperandElts) { 3225 Value *CastOp0 = Builder.CreateBitCast(Op0, Mask->getType()); 3226 Value *CastOp1 = Builder.CreateBitCast(Op1, Mask->getType()); 3227 Value *Sel = Builder.CreateSelect(BoolVec, CastOp1, CastOp0); 3228 return new BitCastInst(Sel, II->getType()); 3229 } 3230 } 3231 3232 break; 3233 } 3234 3235 case Intrinsic::x86_ssse3_pshuf_b_128: 3236 case Intrinsic::x86_avx2_pshuf_b: 3237 case Intrinsic::x86_avx512_pshuf_b_512: 3238 if (Value *V = simplifyX86pshufb(*II, Builder)) 3239 return replaceInstUsesWith(*II, V); 3240 break; 3241 3242 case Intrinsic::x86_avx_vpermilvar_ps: 3243 case Intrinsic::x86_avx_vpermilvar_ps_256: 3244 case Intrinsic::x86_avx512_vpermilvar_ps_512: 3245 case Intrinsic::x86_avx_vpermilvar_pd: 3246 case Intrinsic::x86_avx_vpermilvar_pd_256: 3247 case Intrinsic::x86_avx512_vpermilvar_pd_512: 3248 if (Value *V = simplifyX86vpermilvar(*II, Builder)) 3249 return replaceInstUsesWith(*II, V); 3250 break; 3251 3252 case Intrinsic::x86_avx2_permd: 3253 case Intrinsic::x86_avx2_permps: 3254 case Intrinsic::x86_avx512_permvar_df_256: 3255 case Intrinsic::x86_avx512_permvar_df_512: 3256 case Intrinsic::x86_avx512_permvar_di_256: 3257 case Intrinsic::x86_avx512_permvar_di_512: 
3258 case Intrinsic::x86_avx512_permvar_hi_128: 3259 case Intrinsic::x86_avx512_permvar_hi_256: 3260 case Intrinsic::x86_avx512_permvar_hi_512: 3261 case Intrinsic::x86_avx512_permvar_qi_128: 3262 case Intrinsic::x86_avx512_permvar_qi_256: 3263 case Intrinsic::x86_avx512_permvar_qi_512: 3264 case Intrinsic::x86_avx512_permvar_sf_512: 3265 case Intrinsic::x86_avx512_permvar_si_512: 3266 if (Value *V = simplifyX86vpermv(*II, Builder)) 3267 return replaceInstUsesWith(*II, V); 3268 break; 3269 3270 case Intrinsic::x86_avx_maskload_ps: 3271 case Intrinsic::x86_avx_maskload_pd: 3272 case Intrinsic::x86_avx_maskload_ps_256: 3273 case Intrinsic::x86_avx_maskload_pd_256: 3274 case Intrinsic::x86_avx2_maskload_d: 3275 case Intrinsic::x86_avx2_maskload_q: 3276 case Intrinsic::x86_avx2_maskload_d_256: 3277 case Intrinsic::x86_avx2_maskload_q_256: 3278 if (Instruction *I = simplifyX86MaskedLoad(*II, *this)) 3279 return I; 3280 break; 3281 3282 case Intrinsic::x86_sse2_maskmov_dqu: 3283 case Intrinsic::x86_avx_maskstore_ps: 3284 case Intrinsic::x86_avx_maskstore_pd: 3285 case Intrinsic::x86_avx_maskstore_ps_256: 3286 case Intrinsic::x86_avx_maskstore_pd_256: 3287 case Intrinsic::x86_avx2_maskstore_d: 3288 case Intrinsic::x86_avx2_maskstore_q: 3289 case Intrinsic::x86_avx2_maskstore_d_256: 3290 case Intrinsic::x86_avx2_maskstore_q_256: 3291 if (simplifyX86MaskedStore(*II, *this)) 3292 return nullptr; 3293 break; 3294 3295 case Intrinsic::x86_addcarry_32: 3296 case Intrinsic::x86_addcarry_64: 3297 if (Value *V = simplifyX86addcarry(*II, Builder)) 3298 return replaceInstUsesWith(*II, V); 3299 break; 3300 3301 case Intrinsic::ppc_altivec_vperm: 3302 // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. 3303 // Note that ppc_altivec_vperm has a big-endian bias, so when creating 3304 // a vectorshuffle for little endian, we must undo the transformation 3305 // performed on vec_perm in altivec.h. 
That is, we must complement 3306 // the permutation mask with respect to 31 and reverse the order of 3307 // V1 and V2. 3308 if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) { 3309 assert(Mask->getType()->getVectorNumElements() == 16 && 3310 "Bad type for intrinsic!"); 3311 3312 // Check that all of the elements are integer constants or undefs. 3313 bool AllEltsOk = true; 3314 for (unsigned i = 0; i != 16; ++i) { 3315 Constant *Elt = Mask->getAggregateElement(i); 3316 if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) { 3317 AllEltsOk = false; 3318 break; 3319 } 3320 } 3321 3322 if (AllEltsOk) { 3323 // Cast the input vectors to byte vectors. 3324 Value *Op0 = Builder.CreateBitCast(II->getArgOperand(0), 3325 Mask->getType()); 3326 Value *Op1 = Builder.CreateBitCast(II->getArgOperand(1), 3327 Mask->getType()); 3328 Value *Result = UndefValue::get(Op0->getType()); 3329 3330 // Only extract each element once. 3331 Value *ExtractedElts[32]; 3332 memset(ExtractedElts, 0, sizeof(ExtractedElts)); 3333 3334 for (unsigned i = 0; i != 16; ++i) { 3335 if (isa<UndefValue>(Mask->getAggregateElement(i))) 3336 continue; 3337 unsigned Idx = 3338 cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue(); 3339 Idx &= 31; // Match the hardware behavior. 3340 if (DL.isLittleEndian()) 3341 Idx = 31 - Idx; 3342 3343 if (!ExtractedElts[Idx]) { 3344 Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0; 3345 Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1; 3346 ExtractedElts[Idx] = 3347 Builder.CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse, 3348 Builder.getInt32(Idx&15)); 3349 } 3350 3351 // Insert this value into the result vector. 
3352 Result = Builder.CreateInsertElement(Result, ExtractedElts[Idx], 3353 Builder.getInt32(i)); 3354 } 3355 return CastInst::Create(Instruction::BitCast, Result, CI.getType()); 3356 } 3357 } 3358 break; 3359 3360 case Intrinsic::arm_neon_vld1: { 3361 unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), 3362 DL, II, &AC, &DT); 3363 if (Value *V = simplifyNeonVld1(*II, MemAlign, Builder)) 3364 return replaceInstUsesWith(*II, V); 3365 break; 3366 } 3367 3368 case Intrinsic::arm_neon_vld2: 3369 case Intrinsic::arm_neon_vld3: 3370 case Intrinsic::arm_neon_vld4: 3371 case Intrinsic::arm_neon_vld2lane: 3372 case Intrinsic::arm_neon_vld3lane: 3373 case Intrinsic::arm_neon_vld4lane: 3374 case Intrinsic::arm_neon_vst1: 3375 case Intrinsic::arm_neon_vst2: 3376 case Intrinsic::arm_neon_vst3: 3377 case Intrinsic::arm_neon_vst4: 3378 case Intrinsic::arm_neon_vst2lane: 3379 case Intrinsic::arm_neon_vst3lane: 3380 case Intrinsic::arm_neon_vst4lane: { 3381 unsigned MemAlign = 3382 getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT); 3383 unsigned AlignArg = II->getNumArgOperands() - 1; 3384 ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg)); 3385 if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) 3386 return replaceOperand(*II, AlignArg, 3387 ConstantInt::get(Type::getInt32Ty(II->getContext()), 3388 MemAlign, false)); 3389 break; 3390 } 3391 3392 case Intrinsic::arm_neon_vtbl1: 3393 case Intrinsic::aarch64_neon_tbl1: 3394 if (Value *V = simplifyNeonTbl1(*II, Builder)) 3395 return replaceInstUsesWith(*II, V); 3396 break; 3397 3398 case Intrinsic::arm_neon_vmulls: 3399 case Intrinsic::arm_neon_vmullu: 3400 case Intrinsic::aarch64_neon_smull: 3401 case Intrinsic::aarch64_neon_umull: { 3402 Value *Arg0 = II->getArgOperand(0); 3403 Value *Arg1 = II->getArgOperand(1); 3404 3405 // Handle mul by zero first: 3406 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) { 3407 return replaceInstUsesWith(CI, 
ConstantAggregateZero::get(II->getType())); 3408 } 3409 3410 // Check for constant LHS & RHS - in this case we just simplify. 3411 bool Zext = (IID == Intrinsic::arm_neon_vmullu || 3412 IID == Intrinsic::aarch64_neon_umull); 3413 VectorType *NewVT = cast<VectorType>(II->getType()); 3414 if (Constant *CV0 = dyn_cast<Constant>(Arg0)) { 3415 if (Constant *CV1 = dyn_cast<Constant>(Arg1)) { 3416 CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext); 3417 CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext); 3418 3419 return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1)); 3420 } 3421 3422 // Couldn't simplify - canonicalize constant to the RHS. 3423 std::swap(Arg0, Arg1); 3424 } 3425 3426 // Handle mul by one: 3427 if (Constant *CV1 = dyn_cast<Constant>(Arg1)) 3428 if (ConstantInt *Splat = 3429 dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) 3430 if (Splat->isOne()) 3431 return CastInst::CreateIntegerCast(Arg0, II->getType(), 3432 /*isSigned=*/!Zext); 3433 3434 break; 3435 } 3436 case Intrinsic::arm_neon_aesd: 3437 case Intrinsic::arm_neon_aese: 3438 case Intrinsic::aarch64_crypto_aesd: 3439 case Intrinsic::aarch64_crypto_aese: { 3440 Value *DataArg = II->getArgOperand(0); 3441 Value *KeyArg = II->getArgOperand(1); 3442 3443 // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR 3444 Value *Data, *Key; 3445 if (match(KeyArg, m_ZeroInt()) && 3446 match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) { 3447 replaceOperand(*II, 0, Data); 3448 replaceOperand(*II, 1, Key); 3449 return II; 3450 } 3451 break; 3452 } 3453 case Intrinsic::arm_mve_pred_i2v: { 3454 Value *Arg = II->getArgOperand(0); 3455 Value *ArgArg; 3456 if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg))) && 3457 II->getType() == ArgArg->getType()) 3458 return replaceInstUsesWith(*II, ArgArg); 3459 Constant *XorMask; 3460 if (match(Arg, 3461 m_Xor(m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg)), 3462 m_Constant(XorMask))) && 
3463 II->getType() == ArgArg->getType()) { 3464 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) { 3465 if (CI->getValue().trunc(16).isAllOnesValue()) { 3466 auto TrueVector = Builder.CreateVectorSplat( 3467 II->getType()->getVectorNumElements(), Builder.getTrue()); 3468 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); 3469 } 3470 } 3471 } 3472 KnownBits ScalarKnown(32); 3473 if (SimplifyDemandedBits(II, 0, APInt::getLowBitsSet(32, 16), 3474 ScalarKnown, 0)) 3475 return II; 3476 break; 3477 } 3478 case Intrinsic::arm_mve_pred_v2i: { 3479 Value *Arg = II->getArgOperand(0); 3480 Value *ArgArg; 3481 if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(m_Value(ArgArg)))) 3482 return replaceInstUsesWith(*II, ArgArg); 3483 if (!II->getMetadata(LLVMContext::MD_range)) { 3484 Type *IntTy32 = Type::getInt32Ty(II->getContext()); 3485 Metadata *M[] = { 3486 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), 3487 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF)) 3488 }; 3489 II->setMetadata(LLVMContext::MD_range, MDNode::get(II->getContext(), M)); 3490 return II; 3491 } 3492 break; 3493 } 3494 case Intrinsic::arm_mve_vadc: 3495 case Intrinsic::arm_mve_vadc_predicated: { 3496 unsigned CarryOp = 3497 (II->getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; 3498 assert(II->getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && 3499 "Bad type for intrinsic!"); 3500 3501 KnownBits CarryKnown(32); 3502 if (SimplifyDemandedBits(II, CarryOp, APInt::getOneBitSet(32, 29), 3503 CarryKnown)) 3504 return II; 3505 break; 3506 } 3507 case Intrinsic::amdgcn_rcp: { 3508 Value *Src = II->getArgOperand(0); 3509 3510 // TODO: Move to ConstantFolding/InstSimplify? 
3511 if (isa<UndefValue>(Src)) 3512 return replaceInstUsesWith(CI, Src); 3513 3514 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { 3515 const APFloat &ArgVal = C->getValueAPF(); 3516 APFloat Val(ArgVal.getSemantics(), 1); 3517 APFloat::opStatus Status = Val.divide(ArgVal, 3518 APFloat::rmNearestTiesToEven); 3519 // Only do this if it was exact and therefore not dependent on the 3520 // rounding mode. 3521 if (Status == APFloat::opOK) 3522 return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val)); 3523 } 3524 3525 break; 3526 } 3527 case Intrinsic::amdgcn_rsq: { 3528 Value *Src = II->getArgOperand(0); 3529 3530 // TODO: Move to ConstantFolding/InstSimplify? 3531 if (isa<UndefValue>(Src)) 3532 return replaceInstUsesWith(CI, Src); 3533 break; 3534 } 3535 case Intrinsic::amdgcn_frexp_mant: 3536 case Intrinsic::amdgcn_frexp_exp: { 3537 Value *Src = II->getArgOperand(0); 3538 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { 3539 int Exp; 3540 APFloat Significand = frexp(C->getValueAPF(), Exp, 3541 APFloat::rmNearestTiesToEven); 3542 3543 if (IID == Intrinsic::amdgcn_frexp_mant) { 3544 return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), 3545 Significand)); 3546 } 3547 3548 // Match instruction special case behavior. 
3549 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf) 3550 Exp = 0; 3551 3552 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp)); 3553 } 3554 3555 if (isa<UndefValue>(Src)) 3556 return replaceInstUsesWith(CI, UndefValue::get(II->getType())); 3557 3558 break; 3559 } 3560 case Intrinsic::amdgcn_class: { 3561 enum { 3562 S_NAN = 1 << 0, // Signaling NaN 3563 Q_NAN = 1 << 1, // Quiet NaN 3564 N_INFINITY = 1 << 2, // Negative infinity 3565 N_NORMAL = 1 << 3, // Negative normal 3566 N_SUBNORMAL = 1 << 4, // Negative subnormal 3567 N_ZERO = 1 << 5, // Negative zero 3568 P_ZERO = 1 << 6, // Positive zero 3569 P_SUBNORMAL = 1 << 7, // Positive subnormal 3570 P_NORMAL = 1 << 8, // Positive normal 3571 P_INFINITY = 1 << 9 // Positive infinity 3572 }; 3573 3574 const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL | 3575 N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY; 3576 3577 Value *Src0 = II->getArgOperand(0); 3578 Value *Src1 = II->getArgOperand(1); 3579 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1); 3580 if (!CMask) { 3581 if (isa<UndefValue>(Src0)) 3582 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3583 3584 if (isa<UndefValue>(Src1)) 3585 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); 3586 break; 3587 } 3588 3589 uint32_t Mask = CMask->getZExtValue(); 3590 3591 // If all tests are made, it doesn't matter what the value is. 3592 if ((Mask & FullMask) == FullMask) 3593 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true)); 3594 3595 if ((Mask & FullMask) == 0) 3596 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); 3597 3598 if (Mask == (S_NAN | Q_NAN)) { 3599 // Equivalent of isnan. Replace with standard fcmp. 3600 Value *FCmp = Builder.CreateFCmpUNO(Src0, Src0); 3601 FCmp->takeName(II); 3602 return replaceInstUsesWith(*II, FCmp); 3603 } 3604 3605 if (Mask == (N_ZERO | P_ZERO)) { 3606 // Equivalent of == 0. 
3607 Value *FCmp = Builder.CreateFCmpOEQ( 3608 Src0, ConstantFP::get(Src0->getType(), 0.0)); 3609 3610 FCmp->takeName(II); 3611 return replaceInstUsesWith(*II, FCmp); 3612 } 3613 3614 // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other 3615 if (((Mask & S_NAN) || (Mask & Q_NAN)) && isKnownNeverNaN(Src0, &TLI)) 3616 return replaceOperand(*II, 1, ConstantInt::get(Src1->getType(), 3617 Mask & ~(S_NAN | Q_NAN))); 3618 3619 const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0); 3620 if (!CVal) { 3621 if (isa<UndefValue>(Src0)) 3622 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3623 3624 // Clamp mask to used bits 3625 if ((Mask & FullMask) != Mask) { 3626 CallInst *NewCall = Builder.CreateCall(II->getCalledFunction(), 3627 { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) } 3628 ); 3629 3630 NewCall->takeName(II); 3631 return replaceInstUsesWith(*II, NewCall); 3632 } 3633 3634 break; 3635 } 3636 3637 const APFloat &Val = CVal->getValueAPF(); 3638 3639 bool Result = 3640 ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) || 3641 ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) || 3642 ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) || 3643 ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) || 3644 ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) || 3645 ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) || 3646 ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) || 3647 ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) || 3648 ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) || 3649 ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative()); 3650 3651 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result)); 3652 } 3653 case Intrinsic::amdgcn_cvt_pkrtz: { 3654 Value *Src0 = II->getArgOperand(0); 3655 Value *Src1 = II->getArgOperand(1); 3656 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { 3657 if (const ConstantFP *C1 = 
dyn_cast<ConstantFP>(Src1)) { 3658 const fltSemantics &HalfSem 3659 = II->getType()->getScalarType()->getFltSemantics(); 3660 bool LosesInfo; 3661 APFloat Val0 = C0->getValueAPF(); 3662 APFloat Val1 = C1->getValueAPF(); 3663 Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); 3664 Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); 3665 3666 Constant *Folded = ConstantVector::get({ 3667 ConstantFP::get(II->getContext(), Val0), 3668 ConstantFP::get(II->getContext(), Val1) }); 3669 return replaceInstUsesWith(*II, Folded); 3670 } 3671 } 3672 3673 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) 3674 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3675 3676 break; 3677 } 3678 case Intrinsic::amdgcn_cvt_pknorm_i16: 3679 case Intrinsic::amdgcn_cvt_pknorm_u16: 3680 case Intrinsic::amdgcn_cvt_pk_i16: 3681 case Intrinsic::amdgcn_cvt_pk_u16: { 3682 Value *Src0 = II->getArgOperand(0); 3683 Value *Src1 = II->getArgOperand(1); 3684 3685 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) 3686 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3687 3688 break; 3689 } 3690 case Intrinsic::amdgcn_ubfe: 3691 case Intrinsic::amdgcn_sbfe: { 3692 // Decompose simple cases into standard shifts. 3693 Value *Src = II->getArgOperand(0); 3694 if (isa<UndefValue>(Src)) 3695 return replaceInstUsesWith(*II, Src); 3696 3697 unsigned Width; 3698 Type *Ty = II->getType(); 3699 unsigned IntSize = Ty->getIntegerBitWidth(); 3700 3701 ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3702 if (CWidth) { 3703 Width = CWidth->getZExtValue(); 3704 if ((Width & (IntSize - 1)) == 0) 3705 return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty)); 3706 3707 // Hardware ignores high bits, so remove those. 
3708 if (Width >= IntSize) 3709 return replaceOperand(*II, 2, ConstantInt::get(CWidth->getType(), 3710 Width & (IntSize - 1))); 3711 } 3712 3713 unsigned Offset; 3714 ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1)); 3715 if (COffset) { 3716 Offset = COffset->getZExtValue(); 3717 if (Offset >= IntSize) 3718 return replaceOperand(*II, 1, ConstantInt::get(COffset->getType(), 3719 Offset & (IntSize - 1))); 3720 } 3721 3722 bool Signed = IID == Intrinsic::amdgcn_sbfe; 3723 3724 if (!CWidth || !COffset) 3725 break; 3726 3727 // The case of Width == 0 is handled above, which makes this tranformation 3728 // safe. If Width == 0, then the ashr and lshr instructions become poison 3729 // value since the shift amount would be equal to the bit size. 3730 assert(Width != 0); 3731 3732 // TODO: This allows folding to undef when the hardware has specific 3733 // behavior? 3734 if (Offset + Width < IntSize) { 3735 Value *Shl = Builder.CreateShl(Src, IntSize - Offset - Width); 3736 Value *RightShift = Signed ? Builder.CreateAShr(Shl, IntSize - Width) 3737 : Builder.CreateLShr(Shl, IntSize - Width); 3738 RightShift->takeName(II); 3739 return replaceInstUsesWith(*II, RightShift); 3740 } 3741 3742 Value *RightShift = Signed ? Builder.CreateAShr(Src, Offset) 3743 : Builder.CreateLShr(Src, Offset); 3744 3745 RightShift->takeName(II); 3746 return replaceInstUsesWith(*II, RightShift); 3747 } 3748 case Intrinsic::amdgcn_exp: 3749 case Intrinsic::amdgcn_exp_compr: { 3750 ConstantInt *En = cast<ConstantInt>(II->getArgOperand(1)); 3751 unsigned EnBits = En->getZExtValue(); 3752 if (EnBits == 0xf) 3753 break; // All inputs enabled. 3754 3755 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr; 3756 bool Changed = false; 3757 for (int I = 0; I < (IsCompr ? 
2 : 4); ++I) { 3758 if ((!IsCompr && (EnBits & (1 << I)) == 0) || 3759 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) { 3760 Value *Src = II->getArgOperand(I + 2); 3761 if (!isa<UndefValue>(Src)) { 3762 replaceOperand(*II, I + 2, UndefValue::get(Src->getType())); 3763 Changed = true; 3764 } 3765 } 3766 } 3767 3768 if (Changed) 3769 return II; 3770 3771 break; 3772 } 3773 case Intrinsic::amdgcn_fmed3: { 3774 // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled 3775 // for the shader. 3776 3777 Value *Src0 = II->getArgOperand(0); 3778 Value *Src1 = II->getArgOperand(1); 3779 Value *Src2 = II->getArgOperand(2); 3780 3781 // Checking for NaN before canonicalization provides better fidelity when 3782 // mapping other operations onto fmed3 since the order of operands is 3783 // unchanged. 3784 CallInst *NewCall = nullptr; 3785 if (match(Src0, m_NaN()) || isa<UndefValue>(Src0)) { 3786 NewCall = Builder.CreateMinNum(Src1, Src2); 3787 } else if (match(Src1, m_NaN()) || isa<UndefValue>(Src1)) { 3788 NewCall = Builder.CreateMinNum(Src0, Src2); 3789 } else if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) { 3790 NewCall = Builder.CreateMaxNum(Src0, Src1); 3791 } 3792 3793 if (NewCall) { 3794 NewCall->copyFastMathFlags(II); 3795 NewCall->takeName(II); 3796 return replaceInstUsesWith(*II, NewCall); 3797 } 3798 3799 bool Swap = false; 3800 // Canonicalize constants to RHS operands. 
3801 // 3802 // fmed3(c0, x, c1) -> fmed3(x, c0, c1) 3803 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 3804 std::swap(Src0, Src1); 3805 Swap = true; 3806 } 3807 3808 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) { 3809 std::swap(Src1, Src2); 3810 Swap = true; 3811 } 3812 3813 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 3814 std::swap(Src0, Src1); 3815 Swap = true; 3816 } 3817 3818 if (Swap) { 3819 II->setArgOperand(0, Src0); 3820 II->setArgOperand(1, Src1); 3821 II->setArgOperand(2, Src2); 3822 return II; 3823 } 3824 3825 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { 3826 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { 3827 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) { 3828 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(), 3829 C2->getValueAPF()); 3830 return replaceInstUsesWith(*II, 3831 ConstantFP::get(Builder.getContext(), Result)); 3832 } 3833 } 3834 } 3835 3836 break; 3837 } 3838 case Intrinsic::amdgcn_icmp: 3839 case Intrinsic::amdgcn_fcmp: { 3840 const ConstantInt *CC = cast<ConstantInt>(II->getArgOperand(2)); 3841 // Guard against invalid arguments. 
3842 int64_t CCVal = CC->getZExtValue(); 3843 bool IsInteger = IID == Intrinsic::amdgcn_icmp; 3844 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE || 3845 CCVal > CmpInst::LAST_ICMP_PREDICATE)) || 3846 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE || 3847 CCVal > CmpInst::LAST_FCMP_PREDICATE))) 3848 break; 3849 3850 Value *Src0 = II->getArgOperand(0); 3851 Value *Src1 = II->getArgOperand(1); 3852 3853 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) { 3854 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) { 3855 Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1); 3856 if (CCmp->isNullValue()) { 3857 return replaceInstUsesWith( 3858 *II, ConstantExpr::getSExt(CCmp, II->getType())); 3859 } 3860 3861 // The result of V_ICMP/V_FCMP assembly instructions (which this 3862 // intrinsic exposes) is one bit per thread, masked with the EXEC 3863 // register (which contains the bitmask of live threads). So a 3864 // comparison that always returns true is the same as a read of the 3865 // EXEC register. 3866 Function *NewF = Intrinsic::getDeclaration( 3867 II->getModule(), Intrinsic::read_register, II->getType()); 3868 Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")}; 3869 MDNode *MD = MDNode::get(II->getContext(), MDArgs); 3870 Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)}; 3871 CallInst *NewCall = Builder.CreateCall(NewF, Args); 3872 NewCall->addAttribute(AttributeList::FunctionIndex, 3873 Attribute::Convergent); 3874 NewCall->takeName(II); 3875 return replaceInstUsesWith(*II, NewCall); 3876 } 3877 3878 // Canonicalize constants to RHS. 
3879 CmpInst::Predicate SwapPred 3880 = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal)); 3881 II->setArgOperand(0, Src1); 3882 II->setArgOperand(1, Src0); 3883 II->setArgOperand(2, ConstantInt::get(CC->getType(), 3884 static_cast<int>(SwapPred))); 3885 return II; 3886 } 3887 3888 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE) 3889 break; 3890 3891 // Canonicalize compare eq with true value to compare != 0 3892 // llvm.amdgcn.icmp(zext (i1 x), 1, eq) 3893 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne) 3894 // llvm.amdgcn.icmp(sext (i1 x), -1, eq) 3895 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne) 3896 Value *ExtSrc; 3897 if (CCVal == CmpInst::ICMP_EQ && 3898 ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) || 3899 (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) && 3900 ExtSrc->getType()->isIntegerTy(1)) { 3901 replaceOperand(*II, 1, ConstantInt::getNullValue(Src1->getType())); 3902 replaceOperand(*II, 2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE)); 3903 return II; 3904 } 3905 3906 CmpInst::Predicate SrcPred; 3907 Value *SrcLHS; 3908 Value *SrcRHS; 3909 3910 // Fold compare eq/ne with 0 from a compare result as the predicate to the 3911 // intrinsic. The typical use is a wave vote function in the library, which 3912 // will be fed from a user code condition compared with 0. Fold in the 3913 // redundant compare. 3914 3915 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne) 3916 // -> llvm.amdgcn.[if]cmp(a, b, pred) 3917 // 3918 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq) 3919 // -> llvm.amdgcn.[if]cmp(a, b, inv pred) 3920 if (match(Src1, m_Zero()) && 3921 match(Src0, 3922 m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) { 3923 if (CCVal == CmpInst::ICMP_EQ) 3924 SrcPred = CmpInst::getInversePredicate(SrcPred); 3925 3926 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ? 
3927 Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp; 3928 3929 Type *Ty = SrcLHS->getType(); 3930 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) { 3931 // Promote to next legal integer type. 3932 unsigned Width = CmpType->getBitWidth(); 3933 unsigned NewWidth = Width; 3934 3935 // Don't do anything for i1 comparisons. 3936 if (Width == 1) 3937 break; 3938 3939 if (Width <= 16) 3940 NewWidth = 16; 3941 else if (Width <= 32) 3942 NewWidth = 32; 3943 else if (Width <= 64) 3944 NewWidth = 64; 3945 else if (Width > 64) 3946 break; // Can't handle this. 3947 3948 if (Width != NewWidth) { 3949 IntegerType *CmpTy = Builder.getIntNTy(NewWidth); 3950 if (CmpInst::isSigned(SrcPred)) { 3951 SrcLHS = Builder.CreateSExt(SrcLHS, CmpTy); 3952 SrcRHS = Builder.CreateSExt(SrcRHS, CmpTy); 3953 } else { 3954 SrcLHS = Builder.CreateZExt(SrcLHS, CmpTy); 3955 SrcRHS = Builder.CreateZExt(SrcRHS, CmpTy); 3956 } 3957 } 3958 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy()) 3959 break; 3960 3961 Function *NewF = 3962 Intrinsic::getDeclaration(II->getModule(), NewIID, 3963 { II->getType(), 3964 SrcLHS->getType() }); 3965 Value *Args[] = { SrcLHS, SrcRHS, 3966 ConstantInt::get(CC->getType(), SrcPred) }; 3967 CallInst *NewCall = Builder.CreateCall(NewF, Args); 3968 NewCall->takeName(II); 3969 return replaceInstUsesWith(*II, NewCall); 3970 } 3971 3972 break; 3973 } 3974 case Intrinsic::amdgcn_ballot: { 3975 if (auto *Src = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 3976 if (Src->isZero()) { 3977 // amdgcn.ballot(i1 0) is zero. 3978 return replaceInstUsesWith(*II, Constant::getNullValue(II->getType())); 3979 } 3980 3981 if (Src->isOne()) { 3982 // amdgcn.ballot(i1 1) is exec. 
3983 const char *RegName = "exec"; 3984 if (II->getType()->isIntegerTy(32)) 3985 RegName = "exec_lo"; 3986 else if (!II->getType()->isIntegerTy(64)) 3987 break; 3988 3989 Function *NewF = Intrinsic::getDeclaration( 3990 II->getModule(), Intrinsic::read_register, II->getType()); 3991 Metadata *MDArgs[] = {MDString::get(II->getContext(), RegName)}; 3992 MDNode *MD = MDNode::get(II->getContext(), MDArgs); 3993 Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)}; 3994 CallInst *NewCall = Builder.CreateCall(NewF, Args); 3995 NewCall->addAttribute(AttributeList::FunctionIndex, 3996 Attribute::Convergent); 3997 NewCall->takeName(II); 3998 return replaceInstUsesWith(*II, NewCall); 3999 } 4000 } 4001 break; 4002 } 4003 case Intrinsic::amdgcn_wqm_vote: { 4004 // wqm_vote is identity when the argument is constant. 4005 if (!isa<Constant>(II->getArgOperand(0))) 4006 break; 4007 4008 return replaceInstUsesWith(*II, II->getArgOperand(0)); 4009 } 4010 case Intrinsic::amdgcn_kill: { 4011 const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0)); 4012 if (!C || !C->getZExtValue()) 4013 break; 4014 4015 // amdgcn.kill(i1 1) is a no-op 4016 return eraseInstFromFunction(CI); 4017 } 4018 case Intrinsic::amdgcn_update_dpp: { 4019 Value *Old = II->getArgOperand(0); 4020 4021 auto BC = cast<ConstantInt>(II->getArgOperand(5)); 4022 auto RM = cast<ConstantInt>(II->getArgOperand(3)); 4023 auto BM = cast<ConstantInt>(II->getArgOperand(4)); 4024 if (BC->isZeroValue() || 4025 RM->getZExtValue() != 0xF || 4026 BM->getZExtValue() != 0xF || 4027 isa<UndefValue>(Old)) 4028 break; 4029 4030 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value. 4031 return replaceOperand(*II, 0, UndefValue::get(Old->getType())); 4032 } 4033 case Intrinsic::amdgcn_permlane16: 4034 case Intrinsic::amdgcn_permlanex16: { 4035 // Discard vdst_in if it's not going to be read. 
4036 Value *VDstIn = II->getArgOperand(0); 4037 if (isa<UndefValue>(VDstIn)) 4038 break; 4039 4040 ConstantInt *FetchInvalid = cast<ConstantInt>(II->getArgOperand(4)); 4041 ConstantInt *BoundCtrl = cast<ConstantInt>(II->getArgOperand(5)); 4042 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue()) 4043 break; 4044 4045 return replaceOperand(*II, 0, UndefValue::get(VDstIn->getType())); 4046 } 4047 case Intrinsic::amdgcn_readfirstlane: 4048 case Intrinsic::amdgcn_readlane: { 4049 // A constant value is trivially uniform. 4050 if (Constant *C = dyn_cast<Constant>(II->getArgOperand(0))) 4051 return replaceInstUsesWith(*II, C); 4052 4053 // The rest of these may not be safe if the exec may not be the same between 4054 // the def and use. 4055 Value *Src = II->getArgOperand(0); 4056 Instruction *SrcInst = dyn_cast<Instruction>(Src); 4057 if (SrcInst && SrcInst->getParent() != II->getParent()) 4058 break; 4059 4060 // readfirstlane (readfirstlane x) -> readfirstlane x 4061 // readlane (readfirstlane x), y -> readfirstlane x 4062 if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) 4063 return replaceInstUsesWith(*II, Src); 4064 4065 if (IID == Intrinsic::amdgcn_readfirstlane) { 4066 // readfirstlane (readlane x, y) -> readlane x, y 4067 if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>())) 4068 return replaceInstUsesWith(*II, Src); 4069 } else { 4070 // readlane (readlane x, y), y -> readlane x, y 4071 if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>( 4072 m_Value(), m_Specific(II->getArgOperand(1))))) 4073 return replaceInstUsesWith(*II, Src); 4074 } 4075 4076 break; 4077 } 4078 case Intrinsic::hexagon_V6_vandvrt: 4079 case Intrinsic::hexagon_V6_vandvrt_128B: { 4080 // Simplify Q -> V -> Q conversion. 
4081 if (auto Op0 = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { 4082 Intrinsic::ID ID0 = Op0->getIntrinsicID(); 4083 if (ID0 != Intrinsic::hexagon_V6_vandqrt && 4084 ID0 != Intrinsic::hexagon_V6_vandqrt_128B) 4085 break; 4086 Value *Bytes = Op0->getArgOperand(1), *Mask = II->getArgOperand(1); 4087 uint64_t Bytes1 = computeKnownBits(Bytes, 0, Op0).One.getZExtValue(); 4088 uint64_t Mask1 = computeKnownBits(Mask, 0, II).One.getZExtValue(); 4089 // Check if every byte has common bits in Bytes and Mask. 4090 uint64_t C = Bytes1 & Mask1; 4091 if ((C & 0xFF) && (C & 0xFF00) && (C & 0xFF0000) && (C & 0xFF000000)) 4092 return replaceInstUsesWith(*II, Op0->getArgOperand(0)); 4093 } 4094 break; 4095 } 4096 case Intrinsic::stackrestore: { 4097 // If the save is right next to the restore, remove the restore. This can 4098 // happen when variable allocas are DCE'd. 4099 if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { 4100 if (SS->getIntrinsicID() == Intrinsic::stacksave) { 4101 // Skip over debug info. 4102 if (SS->getNextNonDebugInstruction() == II) { 4103 return eraseInstFromFunction(CI); 4104 } 4105 } 4106 } 4107 4108 // Scan down this block to see if there is another stack restore in the 4109 // same block without an intervening call/alloca. 4110 BasicBlock::iterator BI(II); 4111 Instruction *TI = II->getParent()->getTerminator(); 4112 bool CannotRemove = false; 4113 for (++BI; &*BI != TI; ++BI) { 4114 if (isa<AllocaInst>(BI)) { 4115 CannotRemove = true; 4116 break; 4117 } 4118 if (CallInst *BCI = dyn_cast<CallInst>(BI)) { 4119 if (auto *II2 = dyn_cast<IntrinsicInst>(BCI)) { 4120 // If there is a stackrestore below this one, remove this one. 4121 if (II2->getIntrinsicID() == Intrinsic::stackrestore) 4122 return eraseInstFromFunction(CI); 4123 4124 // Bail if we cross over an intrinsic with side effects, such as 4125 // llvm.stacksave, or llvm.read_register. 
4126 if (II2->mayHaveSideEffects()) { 4127 CannotRemove = true; 4128 break; 4129 } 4130 } else { 4131 // If we found a non-intrinsic call, we can't remove the stack 4132 // restore. 4133 CannotRemove = true; 4134 break; 4135 } 4136 } 4137 } 4138 4139 // If the stack restore is in a return, resume, or unwind block and if there 4140 // are no allocas or calls between the restore and the return, nuke the 4141 // restore. 4142 if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI))) 4143 return eraseInstFromFunction(CI); 4144 break; 4145 } 4146 case Intrinsic::lifetime_end: 4147 // Asan needs to poison memory to detect invalid access which is possible 4148 // even for empty lifetime range. 4149 if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) || 4150 II->getFunction()->hasFnAttribute(Attribute::SanitizeMemory) || 4151 II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress)) 4152 break; 4153 4154 if (removeTriviallyEmptyRange(*II, *this, [](const IntrinsicInst &I) { 4155 return I.getIntrinsicID() == Intrinsic::lifetime_start; 4156 })) 4157 return nullptr; 4158 break; 4159 case Intrinsic::assume: { 4160 Value *IIOperand = II->getArgOperand(0); 4161 // Remove an assume if it is followed by an identical assume. 4162 // TODO: Do we need this? Unless there are conflicting assumptions, the 4163 // computeKnownBits(IIOperand) below here eliminates redundant assumes. 4164 Instruction *Next = II->getNextNonDebugInstruction(); 4165 if (match(Next, m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand)))) 4166 return eraseInstFromFunction(CI); 4167 4168 // Canonicalize assume(a && b) -> assume(a); assume(b); 4169 // Note: New assumption intrinsics created here are registered by 4170 // the InstCombineIRInserter object. 
4171 FunctionType *AssumeIntrinsicTy = II->getFunctionType(); 4172 Value *AssumeIntrinsic = II->getCalledValue(); 4173 Value *A, *B; 4174 if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) { 4175 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, II->getName()); 4176 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, B, II->getName()); 4177 return eraseInstFromFunction(*II); 4178 } 4179 // assume(!(a || b)) -> assume(!a); assume(!b); 4180 if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) { 4181 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, 4182 Builder.CreateNot(A), II->getName()); 4183 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, 4184 Builder.CreateNot(B), II->getName()); 4185 return eraseInstFromFunction(*II); 4186 } 4187 4188 // assume( (load addr) != null ) -> add 'nonnull' metadata to load 4189 // (if assume is valid at the load) 4190 CmpInst::Predicate Pred; 4191 Instruction *LHS; 4192 if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) && 4193 Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load && 4194 LHS->getType()->isPointerTy() && 4195 isValidAssumeForContext(II, LHS, &DT)) { 4196 MDNode *MD = MDNode::get(II->getContext(), None); 4197 LHS->setMetadata(LLVMContext::MD_nonnull, MD); 4198 return eraseInstFromFunction(*II); 4199 4200 // TODO: apply nonnull return attributes to calls and invokes 4201 // TODO: apply range metadata for range check patterns? 4202 } 4203 4204 // If there is a dominating assume with the same condition as this one, 4205 // then this one is redundant, and should be removed. 4206 KnownBits Known(1); 4207 computeKnownBits(IIOperand, Known, 0, II); 4208 if (Known.isAllOnes() && isAssumeWithEmptyBundle(*II)) 4209 return eraseInstFromFunction(*II); 4210 4211 // Update the cache of affected values for this assumption (we might be 4212 // here because we just simplified the condition). 
4213 AC.updateAffectedValues(II); 4214 break; 4215 } 4216 case Intrinsic::experimental_gc_relocate: { 4217 auto &GCR = *cast<GCRelocateInst>(II); 4218 4219 // If we have two copies of the same pointer in the statepoint argument 4220 // list, canonicalize to one. This may let us common gc.relocates. 4221 if (GCR.getBasePtr() == GCR.getDerivedPtr() && 4222 GCR.getBasePtrIndex() != GCR.getDerivedPtrIndex()) { 4223 auto *OpIntTy = GCR.getOperand(2)->getType(); 4224 return replaceOperand(*II, 2, 4225 ConstantInt::get(OpIntTy, GCR.getBasePtrIndex())); 4226 } 4227 4228 // Translate facts known about a pointer before relocating into 4229 // facts about the relocate value, while being careful to 4230 // preserve relocation semantics. 4231 Value *DerivedPtr = GCR.getDerivedPtr(); 4232 4233 // Remove the relocation if unused, note that this check is required 4234 // to prevent the cases below from looping forever. 4235 if (II->use_empty()) 4236 return eraseInstFromFunction(*II); 4237 4238 // Undef is undef, even after relocation. 4239 // TODO: provide a hook for this in GCStrategy. This is clearly legal for 4240 // most practical collectors, but there was discussion in the review thread 4241 // about whether it was legal for all possible collectors. 4242 if (isa<UndefValue>(DerivedPtr)) 4243 // Use undef of gc_relocate's type to replace it. 4244 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 4245 4246 if (auto *PT = dyn_cast<PointerType>(II->getType())) { 4247 // The relocation of null will be null for most any collector. 4248 // TODO: provide a hook for this in GCStrategy. There might be some 4249 // weird collector this property does not hold for. 4250 if (isa<ConstantPointerNull>(DerivedPtr)) 4251 // Use null-pointer of gc_relocate's type to replace it. 
4252 return replaceInstUsesWith(*II, ConstantPointerNull::get(PT)); 4253 4254 // isKnownNonNull -> nonnull attribute 4255 if (!II->hasRetAttr(Attribute::NonNull) && 4256 isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT)) { 4257 II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); 4258 return II; 4259 } 4260 } 4261 4262 // TODO: bitcast(relocate(p)) -> relocate(bitcast(p)) 4263 // Canonicalize on the type from the uses to the defs 4264 4265 // TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...) 4266 break; 4267 } 4268 4269 case Intrinsic::experimental_guard: { 4270 // Is this guard followed by another guard? We scan forward over a small 4271 // fixed window of instructions to handle common cases with conditions 4272 // computed between guards. 4273 Instruction *NextInst = II->getNextNonDebugInstruction(); 4274 for (unsigned i = 0; i < GuardWideningWindow; i++) { 4275 // Note: Using context-free form to avoid compile time blow up 4276 if (!isSafeToSpeculativelyExecute(NextInst)) 4277 break; 4278 NextInst = NextInst->getNextNonDebugInstruction(); 4279 } 4280 Value *NextCond = nullptr; 4281 if (match(NextInst, 4282 m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) { 4283 Value *CurrCond = II->getArgOperand(0); 4284 4285 // Remove a guard that it is immediately preceded by an identical guard. 4286 // Otherwise canonicalize guard(a); guard(b) -> guard(a & b). 
4287 if (CurrCond != NextCond) { 4288 Instruction *MoveI = II->getNextNonDebugInstruction(); 4289 while (MoveI != NextInst) { 4290 auto *Temp = MoveI; 4291 MoveI = MoveI->getNextNonDebugInstruction(); 4292 Temp->moveBefore(II); 4293 } 4294 replaceOperand(*II, 0, Builder.CreateAnd(CurrCond, NextCond)); 4295 } 4296 eraseInstFromFunction(*NextInst); 4297 return II; 4298 } 4299 break; 4300 } 4301 } 4302 return visitCallBase(*II); 4303 } 4304 4305 // Fence instruction simplification 4306 Instruction *InstCombiner::visitFenceInst(FenceInst &FI) { 4307 // Remove identical consecutive fences. 4308 Instruction *Next = FI.getNextNonDebugInstruction(); 4309 if (auto *NFI = dyn_cast<FenceInst>(Next)) 4310 if (FI.isIdenticalTo(NFI)) 4311 return eraseInstFromFunction(FI); 4312 return nullptr; 4313 } 4314 4315 // InvokeInst simplification 4316 Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) { 4317 return visitCallBase(II); 4318 } 4319 4320 // CallBrInst simplification 4321 Instruction *InstCombiner::visitCallBrInst(CallBrInst &CBI) { 4322 return visitCallBase(CBI); 4323 } 4324 4325 /// If this cast does not affect the value passed through the varargs area, we 4326 /// can eliminate the use of the cast. 4327 static bool isSafeToEliminateVarargsCast(const CallBase &Call, 4328 const DataLayout &DL, 4329 const CastInst *const CI, 4330 const int ix) { 4331 if (!CI->isLosslessCast()) 4332 return false; 4333 4334 // If this is a GC intrinsic, avoid munging types. We need types for 4335 // statepoint reconstruction in SelectionDAG. 4336 // TODO: This is probably something which should be expanded to all 4337 // intrinsics since the entire point of intrinsics is that 4338 // they are understandable by the optimizer. 4339 if (isStatepoint(&Call) || isGCRelocate(&Call) || isGCResult(&Call)) 4340 return false; 4341 4342 // The size of ByVal or InAlloca arguments is derived from the type, so we 4343 // can't change to a type with a different size. 
If the size were 4344 // passed explicitly we could avoid this check. 4345 if (!Call.isByValOrInAllocaArgument(ix)) 4346 return true; 4347 4348 Type* SrcTy = 4349 cast<PointerType>(CI->getOperand(0)->getType())->getElementType(); 4350 Type *DstTy = Call.isByValArgument(ix) 4351 ? Call.getParamByValType(ix) 4352 : cast<PointerType>(CI->getType())->getElementType(); 4353 if (!SrcTy->isSized() || !DstTy->isSized()) 4354 return false; 4355 if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy)) 4356 return false; 4357 return true; 4358 } 4359 4360 Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) { 4361 if (!CI->getCalledFunction()) return nullptr; 4362 4363 auto InstCombineRAUW = [this](Instruction *From, Value *With) { 4364 replaceInstUsesWith(*From, With); 4365 }; 4366 auto InstCombineErase = [this](Instruction *I) { 4367 eraseInstFromFunction(*I); 4368 }; 4369 LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW, 4370 InstCombineErase); 4371 if (Value *With = Simplifier.optimizeCall(CI, Builder)) { 4372 ++NumSimplified; 4373 return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With); 4374 } 4375 4376 return nullptr; 4377 } 4378 4379 static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) { 4380 // Strip off at most one level of pointer casts, looking for an alloca. This 4381 // is good enough in practice and simpler than handling any number of casts. 4382 Value *Underlying = TrampMem->stripPointerCasts(); 4383 if (Underlying != TrampMem && 4384 (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem)) 4385 return nullptr; 4386 if (!isa<AllocaInst>(Underlying)) 4387 return nullptr; 4388 4389 IntrinsicInst *InitTrampoline = nullptr; 4390 for (User *U : TrampMem->users()) { 4391 IntrinsicInst *II = dyn_cast<IntrinsicInst>(U); 4392 if (!II) 4393 return nullptr; 4394 if (II->getIntrinsicID() == Intrinsic::init_trampoline) { 4395 if (InitTrampoline) 4396 // More than one init_trampoline writes to this value. Give up. 
4397 return nullptr; 4398 InitTrampoline = II; 4399 continue; 4400 } 4401 if (II->getIntrinsicID() == Intrinsic::adjust_trampoline) 4402 // Allow any number of calls to adjust.trampoline. 4403 continue; 4404 return nullptr; 4405 } 4406 4407 // No call to init.trampoline found. 4408 if (!InitTrampoline) 4409 return nullptr; 4410 4411 // Check that the alloca is being used in the expected way. 4412 if (InitTrampoline->getOperand(0) != TrampMem) 4413 return nullptr; 4414 4415 return InitTrampoline; 4416 } 4417 4418 static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp, 4419 Value *TrampMem) { 4420 // Visit all the previous instructions in the basic block, and try to find a 4421 // init.trampoline which has a direct path to the adjust.trampoline. 4422 for (BasicBlock::iterator I = AdjustTramp->getIterator(), 4423 E = AdjustTramp->getParent()->begin(); 4424 I != E;) { 4425 Instruction *Inst = &*--I; 4426 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) 4427 if (II->getIntrinsicID() == Intrinsic::init_trampoline && 4428 II->getOperand(0) == TrampMem) 4429 return II; 4430 if (Inst->mayWriteToMemory()) 4431 return nullptr; 4432 } 4433 return nullptr; 4434 } 4435 4436 // Given a call to llvm.adjust.trampoline, find and return the corresponding 4437 // call to llvm.init.trampoline if the call to the trampoline can be optimized 4438 // to a direct call to a function. Otherwise return NULL. 
4439 static IntrinsicInst *findInitTrampoline(Value *Callee) { 4440 Callee = Callee->stripPointerCasts(); 4441 IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee); 4442 if (!AdjustTramp || 4443 AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline) 4444 return nullptr; 4445 4446 Value *TrampMem = AdjustTramp->getOperand(0); 4447 4448 if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem)) 4449 return IT; 4450 if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem)) 4451 return IT; 4452 return nullptr; 4453 } 4454 4455 static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) { 4456 unsigned NumArgs = Call.getNumArgOperands(); 4457 ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0)); 4458 ConstantInt *Op1C = 4459 (NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1)); 4460 // Bail out if the allocation size is zero (or an invalid alignment of zero 4461 // with aligned_alloc). 4462 if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue())) 4463 return; 4464 4465 if (isMallocLikeFn(&Call, TLI) && Op0C) { 4466 if (isOpNewLikeFn(&Call, TLI)) 4467 Call.addAttribute(AttributeList::ReturnIndex, 4468 Attribute::getWithDereferenceableBytes( 4469 Call.getContext(), Op0C->getZExtValue())); 4470 else 4471 Call.addAttribute(AttributeList::ReturnIndex, 4472 Attribute::getWithDereferenceableOrNullBytes( 4473 Call.getContext(), Op0C->getZExtValue())); 4474 } else if (isAlignedAllocLikeFn(&Call, TLI) && Op1C) { 4475 Call.addAttribute(AttributeList::ReturnIndex, 4476 Attribute::getWithDereferenceableOrNullBytes( 4477 Call.getContext(), Op1C->getZExtValue())); 4478 // Add alignment attribute if alignment is a power of two constant. 
4479 if (Op0C) { 4480 uint64_t AlignmentVal = Op0C->getZExtValue(); 4481 if (llvm::isPowerOf2_64(AlignmentVal)) 4482 Call.addAttribute(AttributeList::ReturnIndex, 4483 Attribute::getWithAlignment(Call.getContext(), 4484 Align(AlignmentVal))); 4485 } 4486 } else if (isReallocLikeFn(&Call, TLI) && Op1C) { 4487 Call.addAttribute(AttributeList::ReturnIndex, 4488 Attribute::getWithDereferenceableOrNullBytes( 4489 Call.getContext(), Op1C->getZExtValue())); 4490 } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) { 4491 bool Overflow; 4492 const APInt &N = Op0C->getValue(); 4493 APInt Size = N.umul_ov(Op1C->getValue(), Overflow); 4494 if (!Overflow) 4495 Call.addAttribute(AttributeList::ReturnIndex, 4496 Attribute::getWithDereferenceableOrNullBytes( 4497 Call.getContext(), Size.getZExtValue())); 4498 } else if (isStrdupLikeFn(&Call, TLI)) { 4499 uint64_t Len = GetStringLength(Call.getOperand(0)); 4500 if (Len) { 4501 // strdup 4502 if (NumArgs == 1) 4503 Call.addAttribute(AttributeList::ReturnIndex, 4504 Attribute::getWithDereferenceableOrNullBytes( 4505 Call.getContext(), Len)); 4506 // strndup 4507 else if (NumArgs == 2 && Op1C) 4508 Call.addAttribute( 4509 AttributeList::ReturnIndex, 4510 Attribute::getWithDereferenceableOrNullBytes( 4511 Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1))); 4512 } 4513 } 4514 } 4515 4516 /// Improvements for call, callbr and invoke instructions. 4517 Instruction *InstCombiner::visitCallBase(CallBase &Call) { 4518 if (isAllocationFn(&Call, &TLI)) 4519 annotateAnyAllocSite(Call, &TLI); 4520 4521 bool Changed = false; 4522 4523 // Mark any parameters that are known to be non-null with the nonnull 4524 // attribute. This is helpful for inlining calls to functions with null 4525 // checks on their arguments. 
4526 SmallVector<unsigned, 4> ArgNos; 4527 unsigned ArgNo = 0; 4528 4529 for (Value *V : Call.args()) { 4530 if (V->getType()->isPointerTy() && 4531 !Call.paramHasAttr(ArgNo, Attribute::NonNull) && 4532 isKnownNonZero(V, DL, 0, &AC, &Call, &DT)) 4533 ArgNos.push_back(ArgNo); 4534 ArgNo++; 4535 } 4536 4537 assert(ArgNo == Call.arg_size() && "sanity check"); 4538 4539 if (!ArgNos.empty()) { 4540 AttributeList AS = Call.getAttributes(); 4541 LLVMContext &Ctx = Call.getContext(); 4542 AS = AS.addParamAttribute(Ctx, ArgNos, 4543 Attribute::get(Ctx, Attribute::NonNull)); 4544 Call.setAttributes(AS); 4545 Changed = true; 4546 } 4547 4548 // If the callee is a pointer to a function, attempt to move any casts to the 4549 // arguments of the call/callbr/invoke. 4550 Value *Callee = Call.getCalledValue(); 4551 if (!isa<Function>(Callee) && transformConstExprCastCall(Call)) 4552 return nullptr; 4553 4554 if (Function *CalleeF = dyn_cast<Function>(Callee)) { 4555 // Remove the convergent attr on calls when the callee is not convergent. 4556 if (Call.isConvergent() && !CalleeF->isConvergent() && 4557 !CalleeF->isIntrinsic()) { 4558 LLVM_DEBUG(dbgs() << "Removing convergent attr from instr " << Call 4559 << "\n"); 4560 Call.setNotConvergent(); 4561 return &Call; 4562 } 4563 4564 // If the call and callee calling conventions don't match, this call must 4565 // be unreachable, as the call is undefined. 4566 if (CalleeF->getCallingConv() != Call.getCallingConv() && 4567 // Only do this for calls to a function with a body. A prototype may 4568 // not actually end up matching the implementation's calling conv for a 4569 // variety of reasons (e.g. it may be written in assembly). 4570 !CalleeF->isDeclaration()) { 4571 Instruction *OldCall = &Call; 4572 CreateNonTerminatorUnreachable(OldCall); 4573 // If OldCall does not return void then replaceAllUsesWith undef. 4574 // This allows ValueHandlers and custom metadata to adjust itself. 
4575 if (!OldCall->getType()->isVoidTy()) 4576 replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType())); 4577 if (isa<CallInst>(OldCall)) 4578 return eraseInstFromFunction(*OldCall); 4579 4580 // We cannot remove an invoke or a callbr, because it would change thexi 4581 // CFG, just change the callee to a null pointer. 4582 cast<CallBase>(OldCall)->setCalledFunction( 4583 CalleeF->getFunctionType(), 4584 Constant::getNullValue(CalleeF->getType())); 4585 return nullptr; 4586 } 4587 } 4588 4589 if ((isa<ConstantPointerNull>(Callee) && 4590 !NullPointerIsDefined(Call.getFunction())) || 4591 isa<UndefValue>(Callee)) { 4592 // If Call does not return void then replaceAllUsesWith undef. 4593 // This allows ValueHandlers and custom metadata to adjust itself. 4594 if (!Call.getType()->isVoidTy()) 4595 replaceInstUsesWith(Call, UndefValue::get(Call.getType())); 4596 4597 if (Call.isTerminator()) { 4598 // Can't remove an invoke or callbr because we cannot change the CFG. 4599 return nullptr; 4600 } 4601 4602 // This instruction is not reachable, just remove it. 4603 CreateNonTerminatorUnreachable(&Call); 4604 return eraseInstFromFunction(Call); 4605 } 4606 4607 if (IntrinsicInst *II = findInitTrampoline(Callee)) 4608 return transformCallThroughTrampoline(Call, *II); 4609 4610 PointerType *PTy = cast<PointerType>(Callee->getType()); 4611 FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); 4612 if (FTy->isVarArg()) { 4613 int ix = FTy->getNumParams(); 4614 // See if we can optimize any arguments passed through the varargs area of 4615 // the call. 4616 for (auto I = Call.arg_begin() + FTy->getNumParams(), E = Call.arg_end(); 4617 I != E; ++I, ++ix) { 4618 CastInst *CI = dyn_cast<CastInst>(*I); 4619 if (CI && isSafeToEliminateVarargsCast(Call, DL, CI, ix)) { 4620 replaceUse(*I, CI->getOperand(0)); 4621 4622 // Update the byval type to match the argument type. 
4623 if (Call.isByValArgument(ix)) { 4624 Call.removeParamAttr(ix, Attribute::ByVal); 4625 Call.addParamAttr( 4626 ix, Attribute::getWithByValType( 4627 Call.getContext(), 4628 CI->getOperand(0)->getType()->getPointerElementType())); 4629 } 4630 Changed = true; 4631 } 4632 } 4633 } 4634 4635 if (isa<InlineAsm>(Callee) && !Call.doesNotThrow()) { 4636 // Inline asm calls cannot throw - mark them 'nounwind'. 4637 Call.setDoesNotThrow(); 4638 Changed = true; 4639 } 4640 4641 // Try to optimize the call if possible, we require DataLayout for most of 4642 // this. None of these calls are seen as possibly dead so go ahead and 4643 // delete the instruction now. 4644 if (CallInst *CI = dyn_cast<CallInst>(&Call)) { 4645 Instruction *I = tryOptimizeCall(CI); 4646 // If we changed something return the result, etc. Otherwise let 4647 // the fallthrough check. 4648 if (I) return eraseInstFromFunction(*I); 4649 } 4650 4651 if (!Call.use_empty() && !Call.isMustTailCall()) 4652 if (Value *ReturnedArg = Call.getReturnedArgOperand()) 4653 return replaceInstUsesWith(Call, ReturnedArg); 4654 4655 if (isAllocLikeFn(&Call, &TLI)) 4656 return visitAllocSite(Call); 4657 4658 return Changed ? &Call : nullptr; 4659 } 4660 4661 /// If the callee is a constexpr cast of a function, attempt to move the cast to 4662 /// the arguments of the call/callbr/invoke. 4663 bool InstCombiner::transformConstExprCastCall(CallBase &Call) { 4664 auto *Callee = dyn_cast<Function>(Call.getCalledValue()->stripPointerCasts()); 4665 if (!Callee) 4666 return false; 4667 4668 // If this is a call to a thunk function, don't remove the cast. Thunks are 4669 // used to transparently forward all incoming parameters and outgoing return 4670 // values, so it's important to leave the cast in place. 4671 if (Callee->hasFnAttribute("thunk")) 4672 return false; 4673 4674 // If this is a musttail call, the callee's prototype must match the caller's 4675 // prototype with the exception of pointee types. 
The code below doesn't 4676 // implement that, so we can't do this transform. 4677 // TODO: Do the transform if it only requires adding pointer casts. 4678 if (Call.isMustTailCall()) 4679 return false; 4680 4681 Instruction *Caller = &Call; 4682 const AttributeList &CallerPAL = Call.getAttributes(); 4683 4684 // Okay, this is a cast from a function to a different type. Unless doing so 4685 // would cause a type conversion of one of our arguments, change this call to 4686 // be a direct call with arguments casted to the appropriate types. 4687 FunctionType *FT = Callee->getFunctionType(); 4688 Type *OldRetTy = Caller->getType(); 4689 Type *NewRetTy = FT->getReturnType(); 4690 4691 // Check to see if we are changing the return type... 4692 if (OldRetTy != NewRetTy) { 4693 4694 if (NewRetTy->isStructTy()) 4695 return false; // TODO: Handle multiple return values. 4696 4697 if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) { 4698 if (Callee->isDeclaration()) 4699 return false; // Cannot transform this return value. 4700 4701 if (!Caller->use_empty() && 4702 // void -> non-void is handled specially 4703 !NewRetTy->isVoidTy()) 4704 return false; // Cannot transform this return value. 4705 } 4706 4707 if (!CallerPAL.isEmpty() && !Caller->use_empty()) { 4708 AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); 4709 if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy))) 4710 return false; // Attribute not compatible with transformed value. 4711 } 4712 4713 // If the callbase is an invoke/callbr instruction, and the return value is 4714 // used by a PHI node in a successor, we cannot change the return type of 4715 // the call because there is no place to put the cast instruction (without 4716 // breaking the critical edge). Bail out in this case. 
4717 if (!Caller->use_empty()) { 4718 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) 4719 for (User *U : II->users()) 4720 if (PHINode *PN = dyn_cast<PHINode>(U)) 4721 if (PN->getParent() == II->getNormalDest() || 4722 PN->getParent() == II->getUnwindDest()) 4723 return false; 4724 // FIXME: Be conservative for callbr to avoid a quadratic search. 4725 if (isa<CallBrInst>(Caller)) 4726 return false; 4727 } 4728 } 4729 4730 unsigned NumActualArgs = Call.arg_size(); 4731 unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs); 4732 4733 // Prevent us turning: 4734 // declare void @takes_i32_inalloca(i32* inalloca) 4735 // call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0) 4736 // 4737 // into: 4738 // call void @takes_i32_inalloca(i32* null) 4739 // 4740 // Similarly, avoid folding away bitcasts of byval calls. 4741 if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) || 4742 Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal)) 4743 return false; 4744 4745 auto AI = Call.arg_begin(); 4746 for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) { 4747 Type *ParamTy = FT->getParamType(i); 4748 Type *ActTy = (*AI)->getType(); 4749 4750 if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL)) 4751 return false; // Cannot transform this parameter value. 4752 4753 if (AttrBuilder(CallerPAL.getParamAttributes(i)) 4754 .overlaps(AttributeFuncs::typeIncompatible(ParamTy))) 4755 return false; // Attribute not compatible with transformed value. 4756 4757 if (Call.isInAllocaArgument(i)) 4758 return false; // Cannot transform to and from inalloca. 4759 4760 // If the parameter is passed as a byval argument, then we have to have a 4761 // sized type and the sized type has to have the same size as the old type. 
4762 if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) { 4763 PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy); 4764 if (!ParamPTy || !ParamPTy->getElementType()->isSized()) 4765 return false; 4766 4767 Type *CurElTy = Call.getParamByValType(i); 4768 if (DL.getTypeAllocSize(CurElTy) != 4769 DL.getTypeAllocSize(ParamPTy->getElementType())) 4770 return false; 4771 } 4772 } 4773 4774 if (Callee->isDeclaration()) { 4775 // Do not delete arguments unless we have a function body. 4776 if (FT->getNumParams() < NumActualArgs && !FT->isVarArg()) 4777 return false; 4778 4779 // If the callee is just a declaration, don't change the varargsness of the 4780 // call. We don't want to introduce a varargs call where one doesn't 4781 // already exist. 4782 PointerType *APTy = cast<PointerType>(Call.getCalledValue()->getType()); 4783 if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg()) 4784 return false; 4785 4786 // If both the callee and the cast type are varargs, we still have to make 4787 // sure the number of fixed parameters are the same or we have the same 4788 // ABI issues as if we introduce a varargs call. 4789 if (FT->isVarArg() && 4790 cast<FunctionType>(APTy->getElementType())->isVarArg() && 4791 FT->getNumParams() != 4792 cast<FunctionType>(APTy->getElementType())->getNumParams()) 4793 return false; 4794 } 4795 4796 if (FT->getNumParams() < NumActualArgs && FT->isVarArg() && 4797 !CallerPAL.isEmpty()) { 4798 // In this case we have more arguments than the new function type, but we 4799 // won't be dropping them. Check that these extra arguments have attributes 4800 // that are compatible with being a vararg call argument. 4801 unsigned SRetIdx; 4802 if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) && 4803 SRetIdx > FT->getNumParams()) 4804 return false; 4805 } 4806 4807 // Okay, we decided that this is a safe thing to do: go ahead and start 4808 // inserting cast instructions as necessary. 
4809 SmallVector<Value *, 8> Args; 4810 SmallVector<AttributeSet, 8> ArgAttrs; 4811 Args.reserve(NumActualArgs); 4812 ArgAttrs.reserve(NumActualArgs); 4813 4814 // Get any return attributes. 4815 AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); 4816 4817 // If the return value is not being used, the type may not be compatible 4818 // with the existing attributes. Wipe out any problematic attributes. 4819 RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy)); 4820 4821 LLVMContext &Ctx = Call.getContext(); 4822 AI = Call.arg_begin(); 4823 for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) { 4824 Type *ParamTy = FT->getParamType(i); 4825 4826 Value *NewArg = *AI; 4827 if ((*AI)->getType() != ParamTy) 4828 NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy); 4829 Args.push_back(NewArg); 4830 4831 // Add any parameter attributes. 4832 if (CallerPAL.hasParamAttribute(i, Attribute::ByVal)) { 4833 AttrBuilder AB(CallerPAL.getParamAttributes(i)); 4834 AB.addByValAttr(NewArg->getType()->getPointerElementType()); 4835 ArgAttrs.push_back(AttributeSet::get(Ctx, AB)); 4836 } else 4837 ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); 4838 } 4839 4840 // If the function takes more arguments than the call was taking, add them 4841 // now. 4842 for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) { 4843 Args.push_back(Constant::getNullValue(FT->getParamType(i))); 4844 ArgAttrs.push_back(AttributeSet()); 4845 } 4846 4847 // If we are removing arguments to the function, emit an obnoxious warning. 4848 if (FT->getNumParams() < NumActualArgs) { 4849 // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722 4850 if (FT->isVarArg()) { 4851 // Add all of the arguments in their promoted form to the arg list. 4852 for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) { 4853 Type *PTy = getPromotedType((*AI)->getType()); 4854 Value *NewArg = *AI; 4855 if (PTy != (*AI)->getType()) { 4856 // Must promote to pass through va_arg area! 
4857 Instruction::CastOps opcode = 4858 CastInst::getCastOpcode(*AI, false, PTy, false); 4859 NewArg = Builder.CreateCast(opcode, *AI, PTy); 4860 } 4861 Args.push_back(NewArg); 4862 4863 // Add any parameter attributes. 4864 ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); 4865 } 4866 } 4867 } 4868 4869 AttributeSet FnAttrs = CallerPAL.getFnAttributes(); 4870 4871 if (NewRetTy->isVoidTy()) 4872 Caller->setName(""); // Void type should not have a name. 4873 4874 assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) && 4875 "missing argument attributes"); 4876 AttributeList NewCallerPAL = AttributeList::get( 4877 Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs); 4878 4879 SmallVector<OperandBundleDef, 1> OpBundles; 4880 Call.getOperandBundlesAsDefs(OpBundles); 4881 4882 CallBase *NewCall; 4883 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { 4884 NewCall = Builder.CreateInvoke(Callee, II->getNormalDest(), 4885 II->getUnwindDest(), Args, OpBundles); 4886 } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) { 4887 NewCall = Builder.CreateCallBr(Callee, CBI->getDefaultDest(), 4888 CBI->getIndirectDests(), Args, OpBundles); 4889 } else { 4890 NewCall = Builder.CreateCall(Callee, Args, OpBundles); 4891 cast<CallInst>(NewCall)->setTailCallKind( 4892 cast<CallInst>(Caller)->getTailCallKind()); 4893 } 4894 NewCall->takeName(Caller); 4895 NewCall->setCallingConv(Call.getCallingConv()); 4896 NewCall->setAttributes(NewCallerPAL); 4897 4898 // Preserve the weight metadata for the new call instruction. The metadata 4899 // is used by SamplePGO to check callsite's hotness. 4900 uint64_t W; 4901 if (Caller->extractProfTotalWeight(W)) 4902 NewCall->setProfWeight(W); 4903 4904 // Insert a cast of the return type as necessary. 
4905 Instruction *NC = NewCall; 4906 Value *NV = NC; 4907 if (OldRetTy != NV->getType() && !Caller->use_empty()) { 4908 if (!NV->getType()->isVoidTy()) { 4909 NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy); 4910 NC->setDebugLoc(Caller->getDebugLoc()); 4911 4912 // If this is an invoke/callbr instruction, we should insert it after the 4913 // first non-phi instruction in the normal successor block. 4914 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { 4915 BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt(); 4916 InsertNewInstBefore(NC, *I); 4917 } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) { 4918 BasicBlock::iterator I = CBI->getDefaultDest()->getFirstInsertionPt(); 4919 InsertNewInstBefore(NC, *I); 4920 } else { 4921 // Otherwise, it's a call, just insert cast right after the call. 4922 InsertNewInstBefore(NC, *Caller); 4923 } 4924 Worklist.pushUsersToWorkList(*Caller); 4925 } else { 4926 NV = UndefValue::get(Caller->getType()); 4927 } 4928 } 4929 4930 if (!Caller->use_empty()) 4931 replaceInstUsesWith(*Caller, NV); 4932 else if (Caller->hasValueHandle()) { 4933 if (OldRetTy == NV->getType()) 4934 ValueHandleBase::ValueIsRAUWd(Caller, NV); 4935 else 4936 // We cannot call ValueIsRAUWd with a different type, and the 4937 // actual tracked value will disappear. 4938 ValueHandleBase::ValueIsDeleted(Caller); 4939 } 4940 4941 eraseInstFromFunction(*Caller); 4942 return true; 4943 } 4944 4945 /// Turn a call to a function created by init_trampoline / adjust_trampoline 4946 /// intrinsic pair into a direct call to the underlying function. 
4947 Instruction * 4948 InstCombiner::transformCallThroughTrampoline(CallBase &Call, 4949 IntrinsicInst &Tramp) { 4950 Value *Callee = Call.getCalledValue(); 4951 Type *CalleeTy = Callee->getType(); 4952 FunctionType *FTy = Call.getFunctionType(); 4953 AttributeList Attrs = Call.getAttributes(); 4954 4955 // If the call already has the 'nest' attribute somewhere then give up - 4956 // otherwise 'nest' would occur twice after splicing in the chain. 4957 if (Attrs.hasAttrSomewhere(Attribute::Nest)) 4958 return nullptr; 4959 4960 Function *NestF = cast<Function>(Tramp.getArgOperand(1)->stripPointerCasts()); 4961 FunctionType *NestFTy = NestF->getFunctionType(); 4962 4963 AttributeList NestAttrs = NestF->getAttributes(); 4964 if (!NestAttrs.isEmpty()) { 4965 unsigned NestArgNo = 0; 4966 Type *NestTy = nullptr; 4967 AttributeSet NestAttr; 4968 4969 // Look for a parameter marked with the 'nest' attribute. 4970 for (FunctionType::param_iterator I = NestFTy->param_begin(), 4971 E = NestFTy->param_end(); 4972 I != E; ++NestArgNo, ++I) { 4973 AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo); 4974 if (AS.hasAttribute(Attribute::Nest)) { 4975 // Record the parameter type and any other attributes. 4976 NestTy = *I; 4977 NestAttr = AS; 4978 break; 4979 } 4980 } 4981 4982 if (NestTy) { 4983 std::vector<Value*> NewArgs; 4984 std::vector<AttributeSet> NewArgAttrs; 4985 NewArgs.reserve(Call.arg_size() + 1); 4986 NewArgAttrs.reserve(Call.arg_size()); 4987 4988 // Insert the nest argument into the call argument list, which may 4989 // mean appending it. Likewise for attributes. 4990 4991 { 4992 unsigned ArgNo = 0; 4993 auto I = Call.arg_begin(), E = Call.arg_end(); 4994 do { 4995 if (ArgNo == NestArgNo) { 4996 // Add the chain argument and attributes. 
4997 Value *NestVal = Tramp.getArgOperand(2); 4998 if (NestVal->getType() != NestTy) 4999 NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest"); 5000 NewArgs.push_back(NestVal); 5001 NewArgAttrs.push_back(NestAttr); 5002 } 5003 5004 if (I == E) 5005 break; 5006 5007 // Add the original argument and attributes. 5008 NewArgs.push_back(*I); 5009 NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo)); 5010 5011 ++ArgNo; 5012 ++I; 5013 } while (true); 5014 } 5015 5016 // The trampoline may have been bitcast to a bogus type (FTy). 5017 // Handle this by synthesizing a new function type, equal to FTy 5018 // with the chain parameter inserted. 5019 5020 std::vector<Type*> NewTypes; 5021 NewTypes.reserve(FTy->getNumParams()+1); 5022 5023 // Insert the chain's type into the list of parameter types, which may 5024 // mean appending it. 5025 { 5026 unsigned ArgNo = 0; 5027 FunctionType::param_iterator I = FTy->param_begin(), 5028 E = FTy->param_end(); 5029 5030 do { 5031 if (ArgNo == NestArgNo) 5032 // Add the chain's type. 5033 NewTypes.push_back(NestTy); 5034 5035 if (I == E) 5036 break; 5037 5038 // Add the original type. 5039 NewTypes.push_back(*I); 5040 5041 ++ArgNo; 5042 ++I; 5043 } while (true); 5044 } 5045 5046 // Replace the trampoline call with a direct call. Let the generic 5047 // code sort out any function type mismatches. 5048 FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes, 5049 FTy->isVarArg()); 5050 Constant *NewCallee = 5051 NestF->getType() == PointerType::getUnqual(NewFTy) ? 
5052 NestF : ConstantExpr::getBitCast(NestF, 5053 PointerType::getUnqual(NewFTy)); 5054 AttributeList NewPAL = 5055 AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(), 5056 Attrs.getRetAttributes(), NewArgAttrs); 5057 5058 SmallVector<OperandBundleDef, 1> OpBundles; 5059 Call.getOperandBundlesAsDefs(OpBundles); 5060 5061 Instruction *NewCaller; 5062 if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) { 5063 NewCaller = InvokeInst::Create(NewFTy, NewCallee, 5064 II->getNormalDest(), II->getUnwindDest(), 5065 NewArgs, OpBundles); 5066 cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv()); 5067 cast<InvokeInst>(NewCaller)->setAttributes(NewPAL); 5068 } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(&Call)) { 5069 NewCaller = 5070 CallBrInst::Create(NewFTy, NewCallee, CBI->getDefaultDest(), 5071 CBI->getIndirectDests(), NewArgs, OpBundles); 5072 cast<CallBrInst>(NewCaller)->setCallingConv(CBI->getCallingConv()); 5073 cast<CallBrInst>(NewCaller)->setAttributes(NewPAL); 5074 } else { 5075 NewCaller = CallInst::Create(NewFTy, NewCallee, NewArgs, OpBundles); 5076 cast<CallInst>(NewCaller)->setTailCallKind( 5077 cast<CallInst>(Call).getTailCallKind()); 5078 cast<CallInst>(NewCaller)->setCallingConv( 5079 cast<CallInst>(Call).getCallingConv()); 5080 cast<CallInst>(NewCaller)->setAttributes(NewPAL); 5081 } 5082 NewCaller->setDebugLoc(Call.getDebugLoc()); 5083 5084 return NewCaller; 5085 } 5086 } 5087 5088 // Replace the trampoline call with a direct call. Since there is no 'nest' 5089 // parameter, there is no need to adjust the argument list. Let the generic 5090 // code sort out any function type mismatches. 5091 Constant *NewCallee = ConstantExpr::getBitCast(NestF, CalleeTy); 5092 Call.setCalledFunction(FTy, NewCallee); 5093 return &Call; 5094 } 5095