1 //===- InstCombineCalls.cpp -----------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the visitCall, visitInvoke, and visitCallBr functions. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "InstCombineInternal.h" 14 #include "llvm/ADT/APFloat.h" 15 #include "llvm/ADT/APInt.h" 16 #include "llvm/ADT/APSInt.h" 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/FloatingPointMode.h" 19 #include "llvm/ADT/None.h" 20 #include "llvm/ADT/Optional.h" 21 #include "llvm/ADT/STLExtras.h" 22 #include "llvm/ADT/SmallVector.h" 23 #include "llvm/ADT/Statistic.h" 24 #include "llvm/ADT/Twine.h" 25 #include "llvm/Analysis/AssumptionCache.h" 26 #include "llvm/Analysis/InstructionSimplify.h" 27 #include "llvm/Analysis/Loads.h" 28 #include "llvm/Analysis/MemoryBuiltins.h" 29 #include "llvm/Analysis/ValueTracking.h" 30 #include "llvm/Analysis/VectorUtils.h" 31 #include "llvm/IR/Attributes.h" 32 #include "llvm/IR/BasicBlock.h" 33 #include "llvm/IR/Constant.h" 34 #include "llvm/IR/Constants.h" 35 #include "llvm/IR/DataLayout.h" 36 #include "llvm/IR/DerivedTypes.h" 37 #include "llvm/IR/Function.h" 38 #include "llvm/IR/GlobalVariable.h" 39 #include "llvm/IR/InstrTypes.h" 40 #include "llvm/IR/Instruction.h" 41 #include "llvm/IR/Instructions.h" 42 #include "llvm/IR/IntrinsicInst.h" 43 #include "llvm/IR/Intrinsics.h" 44 #include "llvm/IR/IntrinsicsX86.h" 45 #include "llvm/IR/IntrinsicsARM.h" 46 #include "llvm/IR/IntrinsicsAArch64.h" 47 #include "llvm/IR/IntrinsicsHexagon.h" 48 #include "llvm/IR/IntrinsicsNVPTX.h" 49 #include "llvm/IR/IntrinsicsAMDGPU.h" 50 #include "llvm/IR/IntrinsicsPowerPC.h" 51 #include 
"llvm/IR/LLVMContext.h" 52 #include "llvm/IR/Metadata.h" 53 #include "llvm/IR/PatternMatch.h" 54 #include "llvm/IR/Statepoint.h" 55 #include "llvm/IR/Type.h" 56 #include "llvm/IR/User.h" 57 #include "llvm/IR/Value.h" 58 #include "llvm/IR/ValueHandle.h" 59 #include "llvm/Support/AtomicOrdering.h" 60 #include "llvm/Support/Casting.h" 61 #include "llvm/Support/CommandLine.h" 62 #include "llvm/Support/Compiler.h" 63 #include "llvm/Support/Debug.h" 64 #include "llvm/Support/ErrorHandling.h" 65 #include "llvm/Support/KnownBits.h" 66 #include "llvm/Support/MathExtras.h" 67 #include "llvm/Support/raw_ostream.h" 68 #include "llvm/Transforms/InstCombine/InstCombineWorklist.h" 69 #include "llvm/Transforms/Utils/Local.h" 70 #include "llvm/Transforms/Utils/SimplifyLibCalls.h" 71 #include <algorithm> 72 #include <cassert> 73 #include <cstdint> 74 #include <cstring> 75 #include <utility> 76 #include <vector> 77 78 using namespace llvm; 79 using namespace PatternMatch; 80 81 #define DEBUG_TYPE "instcombine" 82 83 STATISTIC(NumSimplified, "Number of library calls simplified"); 84 85 static cl::opt<unsigned> GuardWideningWindow( 86 "instcombine-guard-widening-window", 87 cl::init(3), 88 cl::desc("How wide an instruction window to bypass looking for " 89 "another guard")); 90 91 /// Return the specified type promoted as it would be to pass though a va_arg 92 /// area. 93 static Type *getPromotedType(Type *Ty) { 94 if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) { 95 if (ITy->getBitWidth() < 32) 96 return Type::getInt32Ty(Ty->getContext()); 97 } 98 return Ty; 99 } 100 101 /// Return a constant boolean vector that has true elements in all positions 102 /// where the input constant data vector has an element with the sign bit set. 
103 static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) { 104 SmallVector<Constant *, 32> BoolVec; 105 IntegerType *BoolTy = Type::getInt1Ty(V->getContext()); 106 for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) { 107 Constant *Elt = V->getElementAsConstant(I); 108 assert((isa<ConstantInt>(Elt) || isa<ConstantFP>(Elt)) && 109 "Unexpected constant data vector element type"); 110 bool Sign = V->getElementType()->isIntegerTy() 111 ? cast<ConstantInt>(Elt)->isNegative() 112 : cast<ConstantFP>(Elt)->isNegative(); 113 BoolVec.push_back(ConstantInt::get(BoolTy, Sign)); 114 } 115 return ConstantVector::get(BoolVec); 116 } 117 118 Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { 119 unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT); 120 unsigned CopyDstAlign = MI->getDestAlignment(); 121 if (CopyDstAlign < DstAlign){ 122 MI->setDestAlignment(DstAlign); 123 return MI; 124 } 125 126 unsigned SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT); 127 unsigned CopySrcAlign = MI->getSourceAlignment(); 128 if (CopySrcAlign < SrcAlign) { 129 MI->setSourceAlignment(SrcAlign); 130 return MI; 131 } 132 133 // If we have a store to a location which is known constant, we can conclude 134 // that the store must be storing the constant value (else the memory 135 // wouldn't be constant), and this must be a noop. 136 if (AA->pointsToConstantMemory(MI->getDest())) { 137 // Set the size of the copy to 0, it will be deleted on the next iteration. 138 MI->setLength(Constant::getNullValue(MI->getLength()->getType())); 139 return MI; 140 } 141 142 // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with 143 // load/store. 144 ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength()); 145 if (!MemOpLength) return nullptr; 146 147 // Source and destination pointer types are always "i8*" for intrinsic. See 148 // if the size is something we can handle with a single primitive load/store. 
149 // A single load+store correctly handles overlapping memory in the memmove 150 // case. 151 uint64_t Size = MemOpLength->getLimitedValue(); 152 assert(Size && "0-sized memory transferring should be removed already."); 153 154 if (Size > 8 || (Size&(Size-1))) 155 return nullptr; // If not 1/2/4/8 bytes, exit. 156 157 // If it is an atomic and alignment is less than the size then we will 158 // introduce the unaligned memory access which will be later transformed 159 // into libcall in CodeGen. This is not evident performance gain so disable 160 // it now. 161 if (isa<AtomicMemTransferInst>(MI)) 162 if (CopyDstAlign < Size || CopySrcAlign < Size) 163 return nullptr; 164 165 // Use an integer load+store unless we can find something better. 166 unsigned SrcAddrSp = 167 cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace(); 168 unsigned DstAddrSp = 169 cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace(); 170 171 IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3); 172 Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp); 173 Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp); 174 175 // If the memcpy has metadata describing the members, see if we can get the 176 // TBAA tag describing our copy. 
177 MDNode *CopyMD = nullptr; 178 if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) { 179 CopyMD = M; 180 } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) { 181 if (M->getNumOperands() == 3 && M->getOperand(0) && 182 mdconst::hasa<ConstantInt>(M->getOperand(0)) && 183 mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() && 184 M->getOperand(1) && 185 mdconst::hasa<ConstantInt>(M->getOperand(1)) && 186 mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() == 187 Size && 188 M->getOperand(2) && isa<MDNode>(M->getOperand(2))) 189 CopyMD = cast<MDNode>(M->getOperand(2)); 190 } 191 192 Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy); 193 Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy); 194 LoadInst *L = Builder.CreateLoad(IntType, Src); 195 // Alignment from the mem intrinsic will be better, so use it. 196 L->setAlignment( 197 MaybeAlign(CopySrcAlign)); // FIXME: Check if we can use Align instead. 198 if (CopyMD) 199 L->setMetadata(LLVMContext::MD_tbaa, CopyMD); 200 MDNode *LoopMemParallelMD = 201 MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access); 202 if (LoopMemParallelMD) 203 L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); 204 MDNode *AccessGroupMD = MI->getMetadata(LLVMContext::MD_access_group); 205 if (AccessGroupMD) 206 L->setMetadata(LLVMContext::MD_access_group, AccessGroupMD); 207 208 StoreInst *S = Builder.CreateStore(L, Dest); 209 // Alignment from the mem intrinsic will be better, so use it. 210 S->setAlignment( 211 MaybeAlign(CopyDstAlign)); // FIXME: Check if we can use Align instead. 
212 if (CopyMD) 213 S->setMetadata(LLVMContext::MD_tbaa, CopyMD); 214 if (LoopMemParallelMD) 215 S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); 216 if (AccessGroupMD) 217 S->setMetadata(LLVMContext::MD_access_group, AccessGroupMD); 218 219 if (auto *MT = dyn_cast<MemTransferInst>(MI)) { 220 // non-atomics can be volatile 221 L->setVolatile(MT->isVolatile()); 222 S->setVolatile(MT->isVolatile()); 223 } 224 if (isa<AtomicMemTransferInst>(MI)) { 225 // atomics have to be unordered 226 L->setOrdering(AtomicOrdering::Unordered); 227 S->setOrdering(AtomicOrdering::Unordered); 228 } 229 230 // Set the size of the copy to 0, it will be deleted on the next iteration. 231 MI->setLength(Constant::getNullValue(MemOpLength->getType())); 232 return MI; 233 } 234 235 Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { 236 const unsigned KnownAlignment = 237 getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); 238 if (MI->getDestAlignment() < KnownAlignment) { 239 MI->setDestAlignment(KnownAlignment); 240 return MI; 241 } 242 243 // If we have a store to a location which is known constant, we can conclude 244 // that the store must be storing the constant value (else the memory 245 // wouldn't be constant), and this must be a noop. 246 if (AA->pointsToConstantMemory(MI->getDest())) { 247 // Set the size of the copy to 0, it will be deleted on the next iteration. 248 MI->setLength(Constant::getNullValue(MI->getLength()->getType())); 249 return MI; 250 } 251 252 // Extract the length and alignment and fill if they are constant. 
253 ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength()); 254 ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue()); 255 if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8)) 256 return nullptr; 257 const uint64_t Len = LenC->getLimitedValue(); 258 assert(Len && "0-sized memory setting should be removed already."); 259 const Align Alignment = assumeAligned(MI->getDestAlignment()); 260 261 // If it is an atomic and alignment is less than the size then we will 262 // introduce the unaligned memory access which will be later transformed 263 // into libcall in CodeGen. This is not evident performance gain so disable 264 // it now. 265 if (isa<AtomicMemSetInst>(MI)) 266 if (Alignment < Len) 267 return nullptr; 268 269 // memset(s,c,n) -> store s, c (for n=1,2,4,8) 270 if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) { 271 Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8. 272 273 Value *Dest = MI->getDest(); 274 unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace(); 275 Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp); 276 Dest = Builder.CreateBitCast(Dest, NewDstPtrTy); 277 278 // Extract the fill value and store. 279 uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL; 280 StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest, 281 MI->isVolatile()); 282 S->setAlignment(Alignment); 283 if (isa<AtomicMemSetInst>(MI)) 284 S->setOrdering(AtomicOrdering::Unordered); 285 286 // Set the size of the copy to 0, it will be deleted on the next iteration. 
287 MI->setLength(Constant::getNullValue(LenC->getType())); 288 return MI; 289 } 290 291 return nullptr; 292 } 293 294 static Value *simplifyX86immShift(const IntrinsicInst &II, 295 InstCombiner::BuilderTy &Builder) { 296 bool LogicalShift = false; 297 bool ShiftLeft = false; 298 299 switch (II.getIntrinsicID()) { 300 default: llvm_unreachable("Unexpected intrinsic!"); 301 case Intrinsic::x86_sse2_psra_d: 302 case Intrinsic::x86_sse2_psra_w: 303 case Intrinsic::x86_sse2_psrai_d: 304 case Intrinsic::x86_sse2_psrai_w: 305 case Intrinsic::x86_avx2_psra_d: 306 case Intrinsic::x86_avx2_psra_w: 307 case Intrinsic::x86_avx2_psrai_d: 308 case Intrinsic::x86_avx2_psrai_w: 309 case Intrinsic::x86_avx512_psra_q_128: 310 case Intrinsic::x86_avx512_psrai_q_128: 311 case Intrinsic::x86_avx512_psra_q_256: 312 case Intrinsic::x86_avx512_psrai_q_256: 313 case Intrinsic::x86_avx512_psra_d_512: 314 case Intrinsic::x86_avx512_psra_q_512: 315 case Intrinsic::x86_avx512_psra_w_512: 316 case Intrinsic::x86_avx512_psrai_d_512: 317 case Intrinsic::x86_avx512_psrai_q_512: 318 case Intrinsic::x86_avx512_psrai_w_512: 319 LogicalShift = false; ShiftLeft = false; 320 break; 321 case Intrinsic::x86_sse2_psrl_d: 322 case Intrinsic::x86_sse2_psrl_q: 323 case Intrinsic::x86_sse2_psrl_w: 324 case Intrinsic::x86_sse2_psrli_d: 325 case Intrinsic::x86_sse2_psrli_q: 326 case Intrinsic::x86_sse2_psrli_w: 327 case Intrinsic::x86_avx2_psrl_d: 328 case Intrinsic::x86_avx2_psrl_q: 329 case Intrinsic::x86_avx2_psrl_w: 330 case Intrinsic::x86_avx2_psrli_d: 331 case Intrinsic::x86_avx2_psrli_q: 332 case Intrinsic::x86_avx2_psrli_w: 333 case Intrinsic::x86_avx512_psrl_d_512: 334 case Intrinsic::x86_avx512_psrl_q_512: 335 case Intrinsic::x86_avx512_psrl_w_512: 336 case Intrinsic::x86_avx512_psrli_d_512: 337 case Intrinsic::x86_avx512_psrli_q_512: 338 case Intrinsic::x86_avx512_psrli_w_512: 339 LogicalShift = true; ShiftLeft = false; 340 break; 341 case Intrinsic::x86_sse2_psll_d: 342 case 
Intrinsic::x86_sse2_psll_q: 343 case Intrinsic::x86_sse2_psll_w: 344 case Intrinsic::x86_sse2_pslli_d: 345 case Intrinsic::x86_sse2_pslli_q: 346 case Intrinsic::x86_sse2_pslli_w: 347 case Intrinsic::x86_avx2_psll_d: 348 case Intrinsic::x86_avx2_psll_q: 349 case Intrinsic::x86_avx2_psll_w: 350 case Intrinsic::x86_avx2_pslli_d: 351 case Intrinsic::x86_avx2_pslli_q: 352 case Intrinsic::x86_avx2_pslli_w: 353 case Intrinsic::x86_avx512_psll_d_512: 354 case Intrinsic::x86_avx512_psll_q_512: 355 case Intrinsic::x86_avx512_psll_w_512: 356 case Intrinsic::x86_avx512_pslli_d_512: 357 case Intrinsic::x86_avx512_pslli_q_512: 358 case Intrinsic::x86_avx512_pslli_w_512: 359 LogicalShift = true; ShiftLeft = true; 360 break; 361 } 362 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 363 364 // Simplify if count is constant. 365 auto Arg1 = II.getArgOperand(1); 366 auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1); 367 auto CDV = dyn_cast<ConstantDataVector>(Arg1); 368 auto CInt = dyn_cast<ConstantInt>(Arg1); 369 if (!CAZ && !CDV && !CInt) 370 return nullptr; 371 372 APInt Count(64, 0); 373 if (CDV) { 374 // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector 375 // operand to compute the shift amount. 376 auto VT = cast<VectorType>(CDV->getType()); 377 unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits(); 378 assert((64 % BitWidth) == 0 && "Unexpected packed shift size"); 379 unsigned NumSubElts = 64 / BitWidth; 380 381 // Concatenate the sub-elements to create the 64-bit value. 
382 for (unsigned i = 0; i != NumSubElts; ++i) { 383 unsigned SubEltIdx = (NumSubElts - 1) - i; 384 auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); 385 Count <<= BitWidth; 386 Count |= SubElt->getValue().zextOrTrunc(64); 387 } 388 } 389 else if (CInt) 390 Count = CInt->getValue(); 391 392 auto Vec = II.getArgOperand(0); 393 auto VT = cast<VectorType>(Vec->getType()); 394 auto SVT = VT->getElementType(); 395 unsigned VWidth = VT->getNumElements(); 396 unsigned BitWidth = SVT->getPrimitiveSizeInBits(); 397 398 // If shift-by-zero then just return the original value. 399 if (Count.isNullValue()) 400 return Vec; 401 402 // Handle cases when Shift >= BitWidth. 403 if (Count.uge(BitWidth)) { 404 // If LogicalShift - just return zero. 405 if (LogicalShift) 406 return ConstantAggregateZero::get(VT); 407 408 // If ArithmeticShift - clamp Shift to (BitWidth - 1). 409 Count = APInt(64, BitWidth - 1); 410 } 411 412 // Get a constant vector of the same type as the first operand. 413 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); 414 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); 415 416 if (ShiftLeft) 417 return Builder.CreateShl(Vec, ShiftVec); 418 419 if (LogicalShift) 420 return Builder.CreateLShr(Vec, ShiftVec); 421 422 return Builder.CreateAShr(Vec, ShiftVec); 423 } 424 425 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. 426 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out 427 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit). 
428 static Value *simplifyX86varShift(const IntrinsicInst &II, 429 InstCombiner::BuilderTy &Builder) { 430 bool LogicalShift = false; 431 bool ShiftLeft = false; 432 433 switch (II.getIntrinsicID()) { 434 default: llvm_unreachable("Unexpected intrinsic!"); 435 case Intrinsic::x86_avx2_psrav_d: 436 case Intrinsic::x86_avx2_psrav_d_256: 437 case Intrinsic::x86_avx512_psrav_q_128: 438 case Intrinsic::x86_avx512_psrav_q_256: 439 case Intrinsic::x86_avx512_psrav_d_512: 440 case Intrinsic::x86_avx512_psrav_q_512: 441 case Intrinsic::x86_avx512_psrav_w_128: 442 case Intrinsic::x86_avx512_psrav_w_256: 443 case Intrinsic::x86_avx512_psrav_w_512: 444 LogicalShift = false; 445 ShiftLeft = false; 446 break; 447 case Intrinsic::x86_avx2_psrlv_d: 448 case Intrinsic::x86_avx2_psrlv_d_256: 449 case Intrinsic::x86_avx2_psrlv_q: 450 case Intrinsic::x86_avx2_psrlv_q_256: 451 case Intrinsic::x86_avx512_psrlv_d_512: 452 case Intrinsic::x86_avx512_psrlv_q_512: 453 case Intrinsic::x86_avx512_psrlv_w_128: 454 case Intrinsic::x86_avx512_psrlv_w_256: 455 case Intrinsic::x86_avx512_psrlv_w_512: 456 LogicalShift = true; 457 ShiftLeft = false; 458 break; 459 case Intrinsic::x86_avx2_psllv_d: 460 case Intrinsic::x86_avx2_psllv_d_256: 461 case Intrinsic::x86_avx2_psllv_q: 462 case Intrinsic::x86_avx2_psllv_q_256: 463 case Intrinsic::x86_avx512_psllv_d_512: 464 case Intrinsic::x86_avx512_psllv_q_512: 465 case Intrinsic::x86_avx512_psllv_w_128: 466 case Intrinsic::x86_avx512_psllv_w_256: 467 case Intrinsic::x86_avx512_psllv_w_512: 468 LogicalShift = true; 469 ShiftLeft = true; 470 break; 471 } 472 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 473 474 // Simplify if all shift amounts are constant/undef. 
475 auto *CShift = dyn_cast<Constant>(II.getArgOperand(1)); 476 if (!CShift) 477 return nullptr; 478 479 auto Vec = II.getArgOperand(0); 480 auto VT = cast<VectorType>(II.getType()); 481 auto SVT = VT->getVectorElementType(); 482 int NumElts = VT->getNumElements(); 483 int BitWidth = SVT->getIntegerBitWidth(); 484 485 // Collect each element's shift amount. 486 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. 487 bool AnyOutOfRange = false; 488 SmallVector<int, 8> ShiftAmts; 489 for (int I = 0; I < NumElts; ++I) { 490 auto *CElt = CShift->getAggregateElement(I); 491 if (CElt && isa<UndefValue>(CElt)) { 492 ShiftAmts.push_back(-1); 493 continue; 494 } 495 496 auto *COp = dyn_cast_or_null<ConstantInt>(CElt); 497 if (!COp) 498 return nullptr; 499 500 // Handle out of range shifts. 501 // If LogicalShift - set to BitWidth (special case). 502 // If ArithmeticShift - set to (BitWidth - 1) (sign splat). 503 APInt ShiftVal = COp->getValue(); 504 if (ShiftVal.uge(BitWidth)) { 505 AnyOutOfRange = LogicalShift; 506 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); 507 continue; 508 } 509 510 ShiftAmts.push_back((int)ShiftVal.getZExtValue()); 511 } 512 513 // If all elements out of range or UNDEF, return vector of zeros/undefs. 514 // ArithmeticShift should only hit this if they are all UNDEF. 515 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; 516 if (llvm::all_of(ShiftAmts, OutOfRange)) { 517 SmallVector<Constant *, 8> ConstantVec; 518 for (int Idx : ShiftAmts) { 519 if (Idx < 0) { 520 ConstantVec.push_back(UndefValue::get(SVT)); 521 } else { 522 assert(LogicalShift && "Logical shift expected"); 523 ConstantVec.push_back(ConstantInt::getNullValue(SVT)); 524 } 525 } 526 return ConstantVector::get(ConstantVec); 527 } 528 529 // We can't handle only some out of range values with generic logical shifts. 530 if (AnyOutOfRange) 531 return nullptr; 532 533 // Build the shift amount constant vector. 
534 SmallVector<Constant *, 8> ShiftVecAmts; 535 for (int Idx : ShiftAmts) { 536 if (Idx < 0) 537 ShiftVecAmts.push_back(UndefValue::get(SVT)); 538 else 539 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); 540 } 541 auto ShiftVec = ConstantVector::get(ShiftVecAmts); 542 543 if (ShiftLeft) 544 return Builder.CreateShl(Vec, ShiftVec); 545 546 if (LogicalShift) 547 return Builder.CreateLShr(Vec, ShiftVec); 548 549 return Builder.CreateAShr(Vec, ShiftVec); 550 } 551 552 static Value *simplifyX86pack(IntrinsicInst &II, 553 InstCombiner::BuilderTy &Builder, bool IsSigned) { 554 Value *Arg0 = II.getArgOperand(0); 555 Value *Arg1 = II.getArgOperand(1); 556 Type *ResTy = II.getType(); 557 558 // Fast all undef handling. 559 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1)) 560 return UndefValue::get(ResTy); 561 562 Type *ArgTy = Arg0->getType(); 563 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; 564 unsigned NumSrcElts = ArgTy->getVectorNumElements(); 565 assert(ResTy->getVectorNumElements() == (2 * NumSrcElts) && 566 "Unexpected packing types"); 567 568 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; 569 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); 570 unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); 571 assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && 572 "Unexpected packing types"); 573 574 // Constant folding. 575 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) 576 return nullptr; 577 578 // Clamp Values - signed/unsigned both use signed clamp values, but they 579 // differ on the min/max values. 580 APInt MinValue, MaxValue; 581 if (IsSigned) { 582 // PACKSS: Truncate signed value with signed saturation. 583 // Source values less than dst minint are saturated to minint. 584 // Source values greater than dst maxint are saturated to maxint. 
585 MinValue = 586 APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 587 MaxValue = 588 APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 589 } else { 590 // PACKUS: Truncate signed value with unsigned saturation. 591 // Source values less than zero are saturated to zero. 592 // Source values greater than dst maxuint are saturated to maxuint. 593 MinValue = APInt::getNullValue(SrcScalarSizeInBits); 594 MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); 595 } 596 597 auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); 598 auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); 599 Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); 600 Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); 601 Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); 602 Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); 603 604 // Shuffle clamped args together at the lane level. 605 SmallVector<unsigned, 32> PackMask; 606 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 607 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 608 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); 609 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 610 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); 611 } 612 auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); 613 614 // Truncate to dst size. 615 return Builder.CreateTrunc(Shuffle, ResTy); 616 } 617 618 static Value *simplifyX86movmsk(const IntrinsicInst &II, 619 InstCombiner::BuilderTy &Builder) { 620 Value *Arg = II.getArgOperand(0); 621 Type *ResTy = II.getType(); 622 Type *ArgTy = Arg->getType(); 623 624 // movmsk(undef) -> zero as we must ensure the upper bits are zero. 625 if (isa<UndefValue>(Arg)) 626 return Constant::getNullValue(ResTy); 627 628 // We can't easily peek through x86_mmx types. 
629 if (!ArgTy->isVectorTy()) 630 return nullptr; 631 632 // Expand MOVMSK to compare/bitcast/zext: 633 // e.g. PMOVMSKB(v16i8 x): 634 // %cmp = icmp slt <16 x i8> %x, zeroinitializer 635 // %int = bitcast <16 x i1> %cmp to i16 636 // %res = zext i16 %int to i32 637 unsigned NumElts = ArgTy->getVectorNumElements(); 638 Type *IntegerVecTy = VectorType::getInteger(cast<VectorType>(ArgTy)); 639 Type *IntegerTy = Builder.getIntNTy(NumElts); 640 641 Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy); 642 Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy)); 643 Res = Builder.CreateBitCast(Res, IntegerTy); 644 Res = Builder.CreateZExtOrTrunc(Res, ResTy); 645 return Res; 646 } 647 648 static Value *simplifyX86addcarry(const IntrinsicInst &II, 649 InstCombiner::BuilderTy &Builder) { 650 Value *CarryIn = II.getArgOperand(0); 651 Value *Op1 = II.getArgOperand(1); 652 Value *Op2 = II.getArgOperand(2); 653 Type *RetTy = II.getType(); 654 Type *OpTy = Op1->getType(); 655 assert(RetTy->getStructElementType(0)->isIntegerTy(8) && 656 RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && 657 "Unexpected types for x86 addcarry"); 658 659 // If carry-in is zero, this is just an unsigned add with overflow. 660 if (match(CarryIn, m_ZeroInt())) { 661 Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, 662 { Op1, Op2 }); 663 // The types have to be adjusted to match the x86 call types. 
664 Value *UAddResult = Builder.CreateExtractValue(UAdd, 0); 665 Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1), 666 Builder.getInt8Ty()); 667 Value *Res = UndefValue::get(RetTy); 668 Res = Builder.CreateInsertValue(Res, UAddOV, 0); 669 return Builder.CreateInsertValue(Res, UAddResult, 1); 670 } 671 672 return nullptr; 673 } 674 675 static Value *simplifyX86insertps(const IntrinsicInst &II, 676 InstCombiner::BuilderTy &Builder) { 677 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); 678 if (!CInt) 679 return nullptr; 680 681 VectorType *VecTy = cast<VectorType>(II.getType()); 682 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); 683 684 // The immediate permute control byte looks like this: 685 // [3:0] - zero mask for each 32-bit lane 686 // [5:4] - select one 32-bit destination lane 687 // [7:6] - select one 32-bit source lane 688 689 uint8_t Imm = CInt->getZExtValue(); 690 uint8_t ZMask = Imm & 0xf; 691 uint8_t DestLane = (Imm >> 4) & 0x3; 692 uint8_t SourceLane = (Imm >> 6) & 0x3; 693 694 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); 695 696 // If all zero mask bits are set, this was just a weird way to 697 // generate a zero vector. 698 if (ZMask == 0xf) 699 return ZeroVector; 700 701 // Initialize by passing all of the first source bits through. 702 uint32_t ShuffleMask[4] = { 0, 1, 2, 3 }; 703 704 // We may replace the second operand with the zero vector. 705 Value *V1 = II.getArgOperand(1); 706 707 if (ZMask) { 708 // If the zero mask is being used with a single input or the zero mask 709 // overrides the destination lane, this is a shuffle with the zero vector. 710 if ((II.getArgOperand(0) == II.getArgOperand(1)) || 711 (ZMask & (1 << DestLane))) { 712 V1 = ZeroVector; 713 // We may still move 32-bits of the first source vector from one lane 714 // to another. 715 ShuffleMask[DestLane] = SourceLane; 716 // The zero mask may override the previous insert operation. 
717 for (unsigned i = 0; i < 4; ++i) 718 if ((ZMask >> i) & 0x1) 719 ShuffleMask[i] = i + 4; 720 } else { 721 // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? 722 return nullptr; 723 } 724 } else { 725 // Replace the selected destination lane with the selected source lane. 726 ShuffleMask[DestLane] = SourceLane + 4; 727 } 728 729 return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); 730 } 731 732 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding 733 /// or conversion to a shuffle vector. 734 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, 735 ConstantInt *CILength, ConstantInt *CIIndex, 736 InstCombiner::BuilderTy &Builder) { 737 auto LowConstantHighUndef = [&](uint64_t Val) { 738 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 739 Constant *Args[] = {ConstantInt::get(IntTy64, Val), 740 UndefValue::get(IntTy64)}; 741 return ConstantVector::get(Args); 742 }; 743 744 // See if we're dealing with constant values. 745 Constant *C0 = dyn_cast<Constant>(Op0); 746 ConstantInt *CI0 = 747 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 748 : nullptr; 749 750 // Attempt to constant fold. 751 if (CILength && CIIndex) { 752 // From AMD documentation: "The bit index and field length are each six 753 // bits in length other bits of the field are ignored." 754 APInt APIndex = CIIndex->getValue().zextOrTrunc(6); 755 APInt APLength = CILength->getValue().zextOrTrunc(6); 756 757 unsigned Index = APIndex.getZExtValue(); 758 759 // From AMD documentation: "a value of zero in the field length is 760 // defined as length of 64". 761 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); 762 763 // From AMD documentation: "If the sum of the bit index + length field 764 // is greater than 64, the results are undefined". 765 unsigned End = Index + Length; 766 767 // Note that both field index and field length are 8-bit quantities. 
768 // Since variables 'Index' and 'Length' are unsigned values 769 // obtained from zero-extending field index and field length 770 // respectively, their sum should never wrap around. 771 if (End > 64) 772 return UndefValue::get(II.getType()); 773 774 // If we are inserting whole bytes, we can convert this to a shuffle. 775 // Lowering can recognize EXTRQI shuffle masks. 776 if ((Length % 8) == 0 && (Index % 8) == 0) { 777 // Convert bit indices to byte indices. 778 Length /= 8; 779 Index /= 8; 780 781 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 782 Type *IntTy32 = Type::getInt32Ty(II.getContext()); 783 VectorType *ShufTy = VectorType::get(IntTy8, 16); 784 785 SmallVector<Constant *, 16> ShuffleMask; 786 for (int i = 0; i != (int)Length; ++i) 787 ShuffleMask.push_back( 788 Constant::getIntegerValue(IntTy32, APInt(32, i + Index))); 789 for (int i = Length; i != 8; ++i) 790 ShuffleMask.push_back( 791 Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); 792 for (int i = 8; i != 16; ++i) 793 ShuffleMask.push_back(UndefValue::get(IntTy32)); 794 795 Value *SV = Builder.CreateShuffleVector( 796 Builder.CreateBitCast(Op0, ShufTy), 797 ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask)); 798 return Builder.CreateBitCast(SV, II.getType()); 799 } 800 801 // Constant Fold - shift Index'th bit to lowest position and mask off 802 // Length bits. 803 if (CI0) { 804 APInt Elt = CI0->getValue(); 805 Elt.lshrInPlace(Index); 806 Elt = Elt.zextOrTrunc(Length); 807 return LowConstantHighUndef(Elt.getZExtValue()); 808 } 809 810 // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. 811 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { 812 Value *Args[] = {Op0, CILength, CIIndex}; 813 Module *M = II.getModule(); 814 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); 815 return Builder.CreateCall(F, Args); 816 } 817 } 818 819 // Constant Fold - extraction from zero is always {zero, undef}. 
820 if (CI0 && CI0->isZero()) 821 return LowConstantHighUndef(0); 822 823 return nullptr; 824 } 825 826 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant 827 /// folding or conversion to a shuffle vector. 828 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, 829 APInt APLength, APInt APIndex, 830 InstCombiner::BuilderTy &Builder) { 831 // From AMD documentation: "The bit index and field length are each six bits 832 // in length other bits of the field are ignored." 833 APIndex = APIndex.zextOrTrunc(6); 834 APLength = APLength.zextOrTrunc(6); 835 836 // Attempt to constant fold. 837 unsigned Index = APIndex.getZExtValue(); 838 839 // From AMD documentation: "a value of zero in the field length is 840 // defined as length of 64". 841 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); 842 843 // From AMD documentation: "If the sum of the bit index + length field 844 // is greater than 64, the results are undefined". 845 unsigned End = Index + Length; 846 847 // Note that both field index and field length are 8-bit quantities. 848 // Since variables 'Index' and 'Length' are unsigned values 849 // obtained from zero-extending field index and field length 850 // respectively, their sum should never wrap around. 851 if (End > 64) 852 return UndefValue::get(II.getType()); 853 854 // If we are inserting whole bytes, we can convert this to a shuffle. 855 // Lowering can recognize INSERTQI shuffle masks. 856 if ((Length % 8) == 0 && (Index % 8) == 0) { 857 // Convert bit indices to byte indices. 
858 Length /= 8; 859 Index /= 8; 860 861 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 862 Type *IntTy32 = Type::getInt32Ty(II.getContext()); 863 VectorType *ShufTy = VectorType::get(IntTy8, 16); 864 865 SmallVector<Constant *, 16> ShuffleMask; 866 for (int i = 0; i != (int)Index; ++i) 867 ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); 868 for (int i = 0; i != (int)Length; ++i) 869 ShuffleMask.push_back( 870 Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); 871 for (int i = Index + Length; i != 8; ++i) 872 ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); 873 for (int i = 8; i != 16; ++i) 874 ShuffleMask.push_back(UndefValue::get(IntTy32)); 875 876 Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), 877 Builder.CreateBitCast(Op1, ShufTy), 878 ConstantVector::get(ShuffleMask)); 879 return Builder.CreateBitCast(SV, II.getType()); 880 } 881 882 // See if we're dealing with constant values. 883 Constant *C0 = dyn_cast<Constant>(Op0); 884 Constant *C1 = dyn_cast<Constant>(Op1); 885 ConstantInt *CI00 = 886 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 887 : nullptr; 888 ConstantInt *CI10 = 889 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 890 : nullptr; 891 892 // Constant Fold - insert bottom Length bits starting at the Index'th bit. 893 if (CI00 && CI10) { 894 APInt V00 = CI00->getValue(); 895 APInt V10 = CI10->getValue(); 896 APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); 897 V00 = V00 & ~Mask; 898 V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); 899 APInt Val = V00 | V10; 900 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 901 Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), 902 UndefValue::get(IntTy64)}; 903 return ConstantVector::get(Args); 904 } 905 906 // If we were an INSERTQ call, we'll save demanded elements if we convert to 907 // INSERTQI. 
908 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { 909 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 910 Constant *CILength = ConstantInt::get(IntTy8, Length, false); 911 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); 912 913 Value *Args[] = {Op0, Op1, CILength, CIIndex}; 914 Module *M = II.getModule(); 915 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); 916 return Builder.CreateCall(F, Args); 917 } 918 919 return nullptr; 920 } 921 922 /// Attempt to convert pshufb* to shufflevector if the mask is constant. 923 static Value *simplifyX86pshufb(const IntrinsicInst &II, 924 InstCombiner::BuilderTy &Builder) { 925 Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); 926 if (!V) 927 return nullptr; 928 929 auto *VecTy = cast<VectorType>(II.getType()); 930 auto *MaskEltTy = Type::getInt32Ty(II.getContext()); 931 unsigned NumElts = VecTy->getNumElements(); 932 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && 933 "Unexpected number of elements in shuffle mask!"); 934 935 // Construct a shuffle mask from constant integers or UNDEFs. 936 Constant *Indexes[64] = {nullptr}; 937 938 // Each byte in the shuffle control mask forms an index to permute the 939 // corresponding byte in the destination operand. 940 for (unsigned I = 0; I < NumElts; ++I) { 941 Constant *COp = V->getAggregateElement(I); 942 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 943 return nullptr; 944 945 if (isa<UndefValue>(COp)) { 946 Indexes[I] = UndefValue::get(MaskEltTy); 947 continue; 948 } 949 950 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue(); 951 952 // If the most significant bit (bit[7]) of each byte of the shuffle 953 // control mask is set, then zero is written in the result byte. 954 // The zero vector is in the right-hand side of the resulting 955 // shufflevector. 
956 957 // The value of each index for the high 128-bit lane is the least 958 // significant 4 bits of the respective shuffle control byte. 959 Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0); 960 Indexes[I] = ConstantInt::get(MaskEltTy, Index); 961 } 962 963 auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts)); 964 auto V1 = II.getArgOperand(0); 965 auto V2 = Constant::getNullValue(VecTy); 966 return Builder.CreateShuffleVector(V1, V2, ShuffleMask); 967 } 968 969 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant. 970 static Value *simplifyX86vpermilvar(const IntrinsicInst &II, 971 InstCombiner::BuilderTy &Builder) { 972 Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); 973 if (!V) 974 return nullptr; 975 976 auto *VecTy = cast<VectorType>(II.getType()); 977 auto *MaskEltTy = Type::getInt32Ty(II.getContext()); 978 unsigned NumElts = VecTy->getVectorNumElements(); 979 bool IsPD = VecTy->getScalarType()->isDoubleTy(); 980 unsigned NumLaneElts = IsPD ? 2 : 4; 981 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); 982 983 // Construct a shuffle mask from constant integers or UNDEFs. 984 Constant *Indexes[16] = {nullptr}; 985 986 // The intrinsics only read one or two bits, clear the rest. 987 for (unsigned I = 0; I < NumElts; ++I) { 988 Constant *COp = V->getAggregateElement(I); 989 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 990 return nullptr; 991 992 if (isa<UndefValue>(COp)) { 993 Indexes[I] = UndefValue::get(MaskEltTy); 994 continue; 995 } 996 997 APInt Index = cast<ConstantInt>(COp)->getValue(); 998 Index = Index.zextOrTrunc(32).getLoBits(2); 999 1000 // The PD variants uses bit 1 to select per-lane element index, so 1001 // shift down to convert to generic shuffle mask index. 1002 if (IsPD) 1003 Index.lshrInPlace(1); 1004 1005 // The _256 variants are a bit trickier since the mask bits always index 1006 // into the corresponding 128 half. 
In order to convert to a generic 1007 // shuffle, we have to make that explicit. 1008 Index += APInt(32, (I / NumLaneElts) * NumLaneElts); 1009 1010 Indexes[I] = ConstantInt::get(MaskEltTy, Index); 1011 } 1012 1013 auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts)); 1014 auto V1 = II.getArgOperand(0); 1015 auto V2 = UndefValue::get(V1->getType()); 1016 return Builder.CreateShuffleVector(V1, V2, ShuffleMask); 1017 } 1018 1019 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. 1020 static Value *simplifyX86vpermv(const IntrinsicInst &II, 1021 InstCombiner::BuilderTy &Builder) { 1022 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 1023 if (!V) 1024 return nullptr; 1025 1026 auto *VecTy = cast<VectorType>(II.getType()); 1027 auto *MaskEltTy = Type::getInt32Ty(II.getContext()); 1028 unsigned Size = VecTy->getNumElements(); 1029 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && 1030 "Unexpected shuffle mask size"); 1031 1032 // Construct a shuffle mask from constant integers or UNDEFs. 
1033 Constant *Indexes[64] = {nullptr}; 1034 1035 for (unsigned I = 0; I < Size; ++I) { 1036 Constant *COp = V->getAggregateElement(I); 1037 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 1038 return nullptr; 1039 1040 if (isa<UndefValue>(COp)) { 1041 Indexes[I] = UndefValue::get(MaskEltTy); 1042 continue; 1043 } 1044 1045 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); 1046 Index &= Size - 1; 1047 Indexes[I] = ConstantInt::get(MaskEltTy, Index); 1048 } 1049 1050 auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, Size)); 1051 auto V1 = II.getArgOperand(0); 1052 auto V2 = UndefValue::get(VecTy); 1053 return Builder.CreateShuffleVector(V1, V2, ShuffleMask); 1054 } 1055 1056 // TODO, Obvious Missing Transforms: 1057 // * Narrow width by halfs excluding zero/undef lanes 1058 Value *InstCombiner::simplifyMaskedLoad(IntrinsicInst &II) { 1059 Value *LoadPtr = II.getArgOperand(0); 1060 const Align Alignment = 1061 cast<ConstantInt>(II.getArgOperand(1))->getAlignValue(); 1062 1063 // If the mask is all ones or undefs, this is a plain vector load of the 1st 1064 // argument. 1065 if (maskIsAllOneOrUndef(II.getArgOperand(2))) 1066 return Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment, 1067 "unmaskedload"); 1068 1069 // If we can unconditionally load from this address, replace with a 1070 // load/select idiom. 
TODO: use DT for context sensitive query 1071 if (isDereferenceableAndAlignedPointer(LoadPtr, II.getType(), Alignment, 1072 II.getModule()->getDataLayout(), &II, 1073 nullptr)) { 1074 Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment, 1075 "unmaskedload"); 1076 return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3)); 1077 } 1078 1079 return nullptr; 1080 } 1081 1082 // TODO, Obvious Missing Transforms: 1083 // * Single constant active lane -> store 1084 // * Narrow width by halfs excluding zero/undef lanes 1085 Instruction *InstCombiner::simplifyMaskedStore(IntrinsicInst &II) { 1086 auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3)); 1087 if (!ConstMask) 1088 return nullptr; 1089 1090 // If the mask is all zeros, this instruction does nothing. 1091 if (ConstMask->isNullValue()) 1092 return eraseInstFromFunction(II); 1093 1094 // If the mask is all ones, this is a plain vector store of the 1st argument. 1095 if (ConstMask->isAllOnesValue()) { 1096 Value *StorePtr = II.getArgOperand(1); 1097 MaybeAlign Alignment( 1098 cast<ConstantInt>(II.getArgOperand(2))->getZExtValue()); 1099 return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment); 1100 } 1101 1102 // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts 1103 APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask); 1104 APInt UndefElts(DemandedElts.getBitWidth(), 0); 1105 if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0), 1106 DemandedElts, UndefElts)) { 1107 II.setOperand(0, V); 1108 return &II; 1109 } 1110 1111 return nullptr; 1112 } 1113 1114 // TODO, Obvious Missing Transforms: 1115 // * Single constant active lane load -> load 1116 // * Dereferenceable address & few lanes -> scalarize speculative load/selects 1117 // * Adjacent vector addresses -> masked.load 1118 // * Narrow width by halfs excluding zero/undef lanes 1119 // * Vector splat address w/known mask -> scalar load 1120 // * Vector incrementing address -> 
vector masked load 1121 Instruction *InstCombiner::simplifyMaskedGather(IntrinsicInst &II) { 1122 return nullptr; 1123 } 1124 1125 // TODO, Obvious Missing Transforms: 1126 // * Single constant active lane -> store 1127 // * Adjacent vector addresses -> masked.store 1128 // * Narrow store width by halfs excluding zero/undef lanes 1129 // * Vector splat address w/known mask -> scalar store 1130 // * Vector incrementing address -> vector masked store 1131 Instruction *InstCombiner::simplifyMaskedScatter(IntrinsicInst &II) { 1132 auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3)); 1133 if (!ConstMask) 1134 return nullptr; 1135 1136 // If the mask is all zeros, a scatter does nothing. 1137 if (ConstMask->isNullValue()) 1138 return eraseInstFromFunction(II); 1139 1140 // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts 1141 APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask); 1142 APInt UndefElts(DemandedElts.getBitWidth(), 0); 1143 if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0), 1144 DemandedElts, UndefElts)) { 1145 II.setOperand(0, V); 1146 return &II; 1147 } 1148 if (Value *V = SimplifyDemandedVectorElts(II.getOperand(1), 1149 DemandedElts, UndefElts)) { 1150 II.setOperand(1, V); 1151 return &II; 1152 } 1153 1154 return nullptr; 1155 } 1156 1157 /// This function transforms launder.invariant.group and strip.invariant.group 1158 /// like: 1159 /// launder(launder(%x)) -> launder(%x) (the result is not the argument) 1160 /// launder(strip(%x)) -> launder(%x) 1161 /// strip(strip(%x)) -> strip(%x) (the result is not the argument) 1162 /// strip(launder(%x)) -> strip(%x) 1163 /// This is legal because it preserves the most recent information about 1164 /// the presence or absence of invariant.group. 
1165 static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II, 1166 InstCombiner &IC) { 1167 auto *Arg = II.getArgOperand(0); 1168 auto *StrippedArg = Arg->stripPointerCasts(); 1169 auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups(); 1170 if (StrippedArg == StrippedInvariantGroupsArg) 1171 return nullptr; // No launders/strips to remove. 1172 1173 Value *Result = nullptr; 1174 1175 if (II.getIntrinsicID() == Intrinsic::launder_invariant_group) 1176 Result = IC.Builder.CreateLaunderInvariantGroup(StrippedInvariantGroupsArg); 1177 else if (II.getIntrinsicID() == Intrinsic::strip_invariant_group) 1178 Result = IC.Builder.CreateStripInvariantGroup(StrippedInvariantGroupsArg); 1179 else 1180 llvm_unreachable( 1181 "simplifyInvariantGroupIntrinsic only handles launder and strip"); 1182 if (Result->getType()->getPointerAddressSpace() != 1183 II.getType()->getPointerAddressSpace()) 1184 Result = IC.Builder.CreateAddrSpaceCast(Result, II.getType()); 1185 if (Result->getType() != II.getType()) 1186 Result = IC.Builder.CreateBitCast(Result, II.getType()); 1187 1188 return cast<Instruction>(Result); 1189 } 1190 1191 static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { 1192 assert((II.getIntrinsicID() == Intrinsic::cttz || 1193 II.getIntrinsicID() == Intrinsic::ctlz) && 1194 "Expected cttz or ctlz intrinsic"); 1195 bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz; 1196 Value *Op0 = II.getArgOperand(0); 1197 Value *X; 1198 // ctlz(bitreverse(x)) -> cttz(x) 1199 // cttz(bitreverse(x)) -> ctlz(x) 1200 if (match(Op0, m_BitReverse(m_Value(X)))) { 1201 Intrinsic::ID ID = IsTZ ? 
Intrinsic::ctlz : Intrinsic::cttz; 1202 Function *F = Intrinsic::getDeclaration(II.getModule(), ID, II.getType()); 1203 return CallInst::Create(F, {X, II.getArgOperand(1)}); 1204 } 1205 1206 if (IsTZ) { 1207 // cttz(-x) -> cttz(x) 1208 if (match(Op0, m_Neg(m_Value(X)))) 1209 return IC.replaceOperand(II, 0, X); 1210 1211 // cttz(abs(x)) -> cttz(x) 1212 // cttz(nabs(x)) -> cttz(x) 1213 Value *Y; 1214 SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor; 1215 if (SPF == SPF_ABS || SPF == SPF_NABS) 1216 return IC.replaceOperand(II, 0, X); 1217 } 1218 1219 KnownBits Known = IC.computeKnownBits(Op0, 0, &II); 1220 1221 // Create a mask for bits above (ctlz) or below (cttz) the first known one. 1222 unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros() 1223 : Known.countMaxLeadingZeros(); 1224 unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros() 1225 : Known.countMinLeadingZeros(); 1226 1227 // If all bits above (ctlz) or below (cttz) the first known one are known 1228 // zero, this value is constant. 1229 // FIXME: This should be in InstSimplify because we're replacing an 1230 // instruction with a constant. 1231 if (PossibleZeros == DefiniteZeros) { 1232 auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros); 1233 return IC.replaceInstUsesWith(II, C); 1234 } 1235 1236 // If the input to cttz/ctlz is known to be non-zero, 1237 // then change the 'ZeroIsUndef' parameter to 'true' 1238 // because we know the zero behavior can't affect the result. 1239 if (!Known.One.isNullValue() || 1240 isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II, 1241 &IC.getDominatorTree())) { 1242 if (!match(II.getArgOperand(1), m_One())) 1243 return IC.replaceOperand(II, 1, IC.Builder.getTrue()); 1244 } 1245 1246 // Add range metadata since known bits can't completely reflect what we know. 1247 // TODO: Handle splat vectors. 
1248 auto *IT = dyn_cast<IntegerType>(Op0->getType()); 1249 if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) { 1250 Metadata *LowAndHigh[] = { 1251 ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)), 1252 ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))}; 1253 II.setMetadata(LLVMContext::MD_range, 1254 MDNode::get(II.getContext(), LowAndHigh)); 1255 return &II; 1256 } 1257 1258 return nullptr; 1259 } 1260 1261 static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) { 1262 assert(II.getIntrinsicID() == Intrinsic::ctpop && 1263 "Expected ctpop intrinsic"); 1264 Value *Op0 = II.getArgOperand(0); 1265 Value *X; 1266 // ctpop(bitreverse(x)) -> ctpop(x) 1267 // ctpop(bswap(x)) -> ctpop(x) 1268 if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X)))) 1269 return IC.replaceOperand(II, 0, X); 1270 1271 // FIXME: Try to simplify vectors of integers. 1272 auto *IT = dyn_cast<IntegerType>(Op0->getType()); 1273 if (!IT) 1274 return nullptr; 1275 1276 unsigned BitWidth = IT->getBitWidth(); 1277 KnownBits Known(BitWidth); 1278 IC.computeKnownBits(Op0, Known, 0, &II); 1279 1280 unsigned MinCount = Known.countMinPopulation(); 1281 unsigned MaxCount = Known.countMaxPopulation(); 1282 1283 // Add range metadata since known bits can't completely reflect what we know. 1284 if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) { 1285 Metadata *LowAndHigh[] = { 1286 ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)), 1287 ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))}; 1288 II.setMetadata(LLVMContext::MD_range, 1289 MDNode::get(II.getContext(), LowAndHigh)); 1290 return &II; 1291 } 1292 1293 return nullptr; 1294 } 1295 1296 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 1297 // XMM register mask efficiently, we could transform all x86 masked intrinsics 1298 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 
1299 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { 1300 Value *Ptr = II.getOperand(0); 1301 Value *Mask = II.getOperand(1); 1302 Constant *ZeroVec = Constant::getNullValue(II.getType()); 1303 1304 // Special case a zero mask since that's not a ConstantDataVector. 1305 // This masked load instruction creates a zero vector. 1306 if (isa<ConstantAggregateZero>(Mask)) 1307 return IC.replaceInstUsesWith(II, ZeroVec); 1308 1309 auto *ConstMask = dyn_cast<ConstantDataVector>(Mask); 1310 if (!ConstMask) 1311 return nullptr; 1312 1313 // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic 1314 // to allow target-independent optimizations. 1315 1316 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match 1317 // the LLVM intrinsic definition for the pointer argument. 1318 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 1319 PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); 1320 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); 1321 1322 // Second, convert the x86 XMM integer vector mask to a vector of bools based 1323 // on each element's most significant bit (the sign bit). 1324 Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); 1325 1326 // The pass-through vector for an x86 masked load is a zero vector. 1327 CallInst *NewMaskedLoad = 1328 IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec); 1329 return IC.replaceInstUsesWith(II, NewMaskedLoad); 1330 } 1331 1332 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 1333 // XMM register mask efficiently, we could transform all x86 masked intrinsics 1334 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 
1335 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { 1336 Value *Ptr = II.getOperand(0); 1337 Value *Mask = II.getOperand(1); 1338 Value *Vec = II.getOperand(2); 1339 1340 // Special case a zero mask since that's not a ConstantDataVector: 1341 // this masked store instruction does nothing. 1342 if (isa<ConstantAggregateZero>(Mask)) { 1343 IC.eraseInstFromFunction(II); 1344 return true; 1345 } 1346 1347 // The SSE2 version is too weird (eg, unaligned but non-temporal) to do 1348 // anything else at this level. 1349 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) 1350 return false; 1351 1352 auto *ConstMask = dyn_cast<ConstantDataVector>(Mask); 1353 if (!ConstMask) 1354 return false; 1355 1356 // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic 1357 // to allow target-independent optimizations. 1358 1359 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match 1360 // the LLVM intrinsic definition for the pointer argument. 1361 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 1362 PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); 1363 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); 1364 1365 // Second, convert the x86 XMM integer vector mask to a vector of bools based 1366 // on each element's most significant bit (the sign bit). 1367 Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); 1368 1369 IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); 1370 1371 // 'Replace uses' doesn't work for stores. Erase the original masked store. 1372 IC.eraseInstFromFunction(II); 1373 return true; 1374 } 1375 1376 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs. 1377 // 1378 // A single NaN input is folded to minnum, so we rely on that folding for 1379 // handling NaNs. 
1380 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, 1381 const APFloat &Src2) { 1382 APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2); 1383 1384 APFloat::cmpResult Cmp0 = Max3.compare(Src0); 1385 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately"); 1386 if (Cmp0 == APFloat::cmpEqual) 1387 return maxnum(Src1, Src2); 1388 1389 APFloat::cmpResult Cmp1 = Max3.compare(Src1); 1390 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately"); 1391 if (Cmp1 == APFloat::cmpEqual) 1392 return maxnum(Src0, Src2); 1393 1394 return maxnum(Src0, Src1); 1395 } 1396 1397 /// Convert a table lookup to shufflevector if the mask is constant. 1398 /// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in 1399 /// which case we could lower the shufflevector with rev64 instructions 1400 /// as it's actually a byte reverse. 1401 static Value *simplifyNeonTbl1(const IntrinsicInst &II, 1402 InstCombiner::BuilderTy &Builder) { 1403 // Bail out if the mask is not a constant. 1404 auto *C = dyn_cast<Constant>(II.getArgOperand(1)); 1405 if (!C) 1406 return nullptr; 1407 1408 auto *VecTy = cast<VectorType>(II.getType()); 1409 unsigned NumElts = VecTy->getNumElements(); 1410 1411 // Only perform this transformation for <8 x i8> vector types. 1412 if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8) 1413 return nullptr; 1414 1415 uint32_t Indexes[8]; 1416 1417 for (unsigned I = 0; I < NumElts; ++I) { 1418 Constant *COp = C->getAggregateElement(I); 1419 1420 if (!COp || !isa<ConstantInt>(COp)) 1421 return nullptr; 1422 1423 Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue(); 1424 1425 // Make sure the mask indices are in range. 
1426 if (Indexes[I] >= NumElts) 1427 return nullptr; 1428 } 1429 1430 auto *ShuffleMask = ConstantDataVector::get(II.getContext(), 1431 makeArrayRef(Indexes)); 1432 auto *V1 = II.getArgOperand(0); 1433 auto *V2 = Constant::getNullValue(V1->getType()); 1434 return Builder.CreateShuffleVector(V1, V2, ShuffleMask); 1435 } 1436 1437 /// Convert a vector load intrinsic into a simple llvm load instruction. 1438 /// This is beneficial when the underlying object being addressed comes 1439 /// from a constant, since we get constant-folding for free. 1440 static Value *simplifyNeonVld1(const IntrinsicInst &II, 1441 unsigned MemAlign, 1442 InstCombiner::BuilderTy &Builder) { 1443 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1)); 1444 1445 if (!IntrAlign) 1446 return nullptr; 1447 1448 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign ? 1449 MemAlign : IntrAlign->getLimitedValue(); 1450 1451 if (!isPowerOf2_32(Alignment)) 1452 return nullptr; 1453 1454 auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0), 1455 PointerType::get(II.getType(), 0)); 1456 return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment)); 1457 } 1458 1459 // Returns true iff the 2 intrinsics have the same operands, limiting the 1460 // comparison to the first NumOperands. 1461 static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E, 1462 unsigned NumOperands) { 1463 assert(I.getNumArgOperands() >= NumOperands && "Not enough operands"); 1464 assert(E.getNumArgOperands() >= NumOperands && "Not enough operands"); 1465 for (unsigned i = 0; i < NumOperands; i++) 1466 if (I.getArgOperand(i) != E.getArgOperand(i)) 1467 return false; 1468 return true; 1469 } 1470 1471 // Remove trivially empty start/end intrinsic ranges, i.e. a start 1472 // immediately followed by an end (ignoring debuginfo or other 1473 // start/end intrinsics in between). 
// As this handles only the most trivial
// cases, tracking the nesting level is not needed:
//
//   call @llvm.foo.start(i1 0)
//   call @llvm.foo.start(i1 0) ; This one won't be skipped: it will be removed
//   call @llvm.foo.end(i1 0)
//   call @llvm.foo.end(i1 0) ; &I
//
// EndI is the end intrinsic to pair up, IC is used to erase instructions, and
// IsStart tells whether a given intrinsic is a matching kind of start.
// Returns true iff a start/end pair was erased.
static bool removeTriviallyEmptyRange(
    IntrinsicInst &EndI, InstCombiner &IC,
    std::function<bool(const IntrinsicInst &)> IsStart) {
  // We start from the end intrinsic and scan backwards, so that InstCombine
  // has already processed (and potentially removed) all the instructions
  // before the end intrinsic.
  BasicBlock::reverse_iterator BI(EndI), BE(EndI.getParent()->rend());
  for (; BI != BE; ++BI) {
    if (auto *I = dyn_cast<IntrinsicInst>(&*BI)) {
      // Debug info intrinsics and intrinsics with the same ID as EndI are
      // stepped over.
      if (isa<DbgInfoIntrinsic>(I) ||
          I->getIntrinsicID() == EndI.getIntrinsicID())
        continue;
      if (IsStart(*I)) {
        // A start pairs with EndI only if it agrees on all of EndI's
        // argument operands.
        if (haveSameOperands(EndI, *I, EndI.getNumArgOperands())) {
          IC.eraseInstFromFunction(*I);
          IC.eraseInstFromFunction(EndI);
          return true;
        }
        // Skip start intrinsics that don't pair with this end intrinsic.
        continue;
      }
    }
    // Anything else ends the scan: the range is not trivially empty.
    break;
  }

  return false;
}

// Convert NVVM intrinsics to target-generic LLVM code where possible.
//
// Returns a newly created (not yet inserted) replacement instruction, or
// nullptr if II is not one of the handled intrinsics or its ftz requirement
// is not met by the enclosing function.
static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
  // Each NVVM intrinsic we can simplify can be replaced with one of:
  //
  //  * an LLVM intrinsic,
  //  * an LLVM cast operation,
  //  * an LLVM binary operation, or
  //  * ad-hoc LLVM IR for the particular operation.

  // Some transformations are only valid when the flush-denormals-to-zero
  // (ftz) setting is true/false, whereas other transformations are valid
  // regardless of the ftz setting.
  enum FtzRequirementTy {
    FTZ_Any,       // Any ftz setting is ok.
    FTZ_MustBeOn,  // Transformation is valid only if ftz is on.
    FTZ_MustBeOff, // Transformation is valid only if ftz is off.
  };
  // Classes of NVVM intrinsics that can't be replaced one-to-one with a
  // target-generic intrinsic, cast op, or binary op but that we can nonetheless
  // simplify.
  enum SpecialCase {
    SPC_Reciprocal,
  };

  // SimplifyAction is a poor-man's variant (plus an additional flag) that
  // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
  struct SimplifyAction {
    // Invariant: At most one of these Optionals has a value.
    Optional<Intrinsic::ID> IID;
    Optional<Instruction::CastOps> CastOp;
    Optional<Instruction::BinaryOps> BinaryOp;
    Optional<SpecialCase> Special;

    FtzRequirementTy FtzRequirement = FTZ_Any;

    // A default-constructed action has no Optional set, meaning "no
    // simplification".
    SimplifyAction() = default;

    SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq)
        : IID(IID), FtzRequirement(FtzReq) {}

    // Cast operations don't have anything to do with FTZ, so we skip that
    // argument.
    SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}

    SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
        : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}

    SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
        : Special(Special), FtzRequirement(FtzReq) {}
  };

  // Try to generate a SimplifyAction describing how to replace our
  // IntrinsicInst with target-generic LLVM IR.
  const SimplifyAction Action = [II]() -> SimplifyAction {
    switch (II->getIntrinsicID()) {
    // NVVM intrinsics that map directly to LLVM intrinsics.
    case Intrinsic::nvvm_ceil_d:
      return {Intrinsic::ceil, FTZ_Any};
    case Intrinsic::nvvm_ceil_f:
      return {Intrinsic::ceil, FTZ_MustBeOff};
    case Intrinsic::nvvm_ceil_ftz_f:
      return {Intrinsic::ceil, FTZ_MustBeOn};
    case Intrinsic::nvvm_fabs_d:
      return {Intrinsic::fabs, FTZ_Any};
    case Intrinsic::nvvm_fabs_f:
      return {Intrinsic::fabs, FTZ_MustBeOff};
    case Intrinsic::nvvm_fabs_ftz_f:
      return {Intrinsic::fabs, FTZ_MustBeOn};
    case Intrinsic::nvvm_floor_d:
      return {Intrinsic::floor, FTZ_Any};
    case Intrinsic::nvvm_floor_f:
      return {Intrinsic::floor, FTZ_MustBeOff};
    case Intrinsic::nvvm_floor_ftz_f:
      return {Intrinsic::floor, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_d:
      return {Intrinsic::fma, FTZ_Any};
    case Intrinsic::nvvm_fma_rn_f:
      return {Intrinsic::fma, FTZ_MustBeOff};
    case Intrinsic::nvvm_fma_rn_ftz_f:
      return {Intrinsic::fma, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_d:
      return {Intrinsic::maxnum, FTZ_Any};
    case Intrinsic::nvvm_fmax_f:
      return {Intrinsic::maxnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_f:
      return {Intrinsic::maxnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_d:
      return {Intrinsic::minnum, FTZ_Any};
    case Intrinsic::nvvm_fmin_f:
      return {Intrinsic::minnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_f:
      return {Intrinsic::minnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_round_d:
      return {Intrinsic::round, FTZ_Any};
    case Intrinsic::nvvm_round_f:
      return {Intrinsic::round, FTZ_MustBeOff};
    case Intrinsic::nvvm_round_ftz_f:
      return {Intrinsic::round, FTZ_MustBeOn};
    case Intrinsic::nvvm_sqrt_rn_d:
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_f:
      // nvvm_sqrt_f is a special case.  For  most intrinsics, foo_ftz_f is the
      // ftz version, and foo_f is the non-ftz version.  But nvvm_sqrt_f adopts
      // the ftz-ness of the surrounding code.  sqrt_rn_f and sqrt_rn_ftz_f are
      // the versions with explicit ftz-ness.
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_rn_f:
      return {Intrinsic::sqrt, FTZ_MustBeOff};
    case Intrinsic::nvvm_sqrt_rn_ftz_f:
      return {Intrinsic::sqrt, FTZ_MustBeOn};
    case Intrinsic::nvvm_trunc_d:
      return {Intrinsic::trunc, FTZ_Any};
    case Intrinsic::nvvm_trunc_f:
      return {Intrinsic::trunc, FTZ_MustBeOff};
    case Intrinsic::nvvm_trunc_ftz_f:
      return {Intrinsic::trunc, FTZ_MustBeOn};

    // NVVM intrinsics that map to LLVM cast operations.
    //
    // Note that llvm's target-generic conversion operators correspond to the rz
    // (round to zero) versions of the nvvm conversion intrinsics, even though
    // most everything else here uses the rn (round to nearest even) nvvm ops.
    case Intrinsic::nvvm_d2i_rz:
    case Intrinsic::nvvm_f2i_rz:
    case Intrinsic::nvvm_d2ll_rz:
    case Intrinsic::nvvm_f2ll_rz:
      return {Instruction::FPToSI};
    case Intrinsic::nvvm_d2ui_rz:
    case Intrinsic::nvvm_f2ui_rz:
    case Intrinsic::nvvm_d2ull_rz:
    case Intrinsic::nvvm_f2ull_rz:
      return {Instruction::FPToUI};
    case Intrinsic::nvvm_i2d_rz:
    case Intrinsic::nvvm_i2f_rz:
    case Intrinsic::nvvm_ll2d_rz:
    case Intrinsic::nvvm_ll2f_rz:
      return {Instruction::SIToFP};
    case Intrinsic::nvvm_ui2d_rz:
    case Intrinsic::nvvm_ui2f_rz:
    case Intrinsic::nvvm_ull2d_rz:
    case Intrinsic::nvvm_ull2f_rz:
      return {Instruction::UIToFP};

    // NVVM intrinsics that map to LLVM binary ops.
    case Intrinsic::nvvm_add_rn_d:
      return {Instruction::FAdd, FTZ_Any};
    case Intrinsic::nvvm_add_rn_f:
      return {Instruction::FAdd, FTZ_MustBeOff};
    case Intrinsic::nvvm_add_rn_ftz_f:
      return {Instruction::FAdd, FTZ_MustBeOn};
    case Intrinsic::nvvm_mul_rn_d:
      return {Instruction::FMul, FTZ_Any};
    case Intrinsic::nvvm_mul_rn_f:
      return {Instruction::FMul, FTZ_MustBeOff};
    case Intrinsic::nvvm_mul_rn_ftz_f:
      return {Instruction::FMul, FTZ_MustBeOn};
    case Intrinsic::nvvm_div_rn_d:
      return {Instruction::FDiv, FTZ_Any};
    case Intrinsic::nvvm_div_rn_f:
      return {Instruction::FDiv, FTZ_MustBeOff};
    case Intrinsic::nvvm_div_rn_ftz_f:
      return {Instruction::FDiv, FTZ_MustBeOn};

    // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
    // need special handling.
    //
    // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
    // as well.
    case Intrinsic::nvvm_rcp_rn_d:
      return {SPC_Reciprocal, FTZ_Any};
    case Intrinsic::nvvm_rcp_rn_f:
      return {SPC_Reciprocal, FTZ_MustBeOff};
    case Intrinsic::nvvm_rcp_rn_ftz_f:
      return {SPC_Reciprocal, FTZ_MustBeOn};

    // We do not currently simplify intrinsics that give an approximate answer.
    // These include:
    //
    //   - nvvm_cos_approx_{f,ftz_f}
    //   - nvvm_ex2_approx_{d,f,ftz_f}
    //   - nvvm_lg2_approx_{d,f,ftz_f}
    //   - nvvm_sin_approx_{f,ftz_f}
    //   - nvvm_sqrt_approx_{f,ftz_f}
    //   - nvvm_rsqrt_approx_{d,f,ftz_f}
    //   - nvvm_div_approx_{ftz_d,ftz_f,f}
    //   - nvvm_rcp_approx_ftz_d
    //
    // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
    // means that fastmath is enabled in the intrinsic.  Unfortunately only
    // binary operators (currently) have a fastmath bit in SelectionDAG, so this
    // information gets lost and we can't select on it.
    //
    // TODO: div and rcp are lowered to a binary op, so these we could in theory
    // lower them to "fast fdiv".

    default:
      // Not an intrinsic we know how to simplify: empty action.
      return {};
    }
  }();

  // If Action.FtzRequirement is not satisfied by the function's ftz state, we
  // can bail out now.  (Notice that in the case that II is not an intrinsic
  // handled above, we don't have to look up the function attribute, as
  // FtzRequirement will be FTZ_Any.)
  if (Action.FtzRequirement != FTZ_Any) {
    StringRef Attr = II->getFunction()
                         ->getFnAttribute("denormal-fp-math-f32")
                         .getValueAsString();
    DenormalMode Mode = parseDenormalFPAttribute(Attr);
    // Any output mode other than IEEE is treated as ftz being enabled.
    bool FtzEnabled = Mode.Output != DenormalMode::IEEE;

    if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
      return nullptr;
  }

  // Simplify to target-generic intrinsic.
  if (Action.IID) {
    SmallVector<Value *, 4> Args(II->arg_operands());
    // All the target-generic intrinsics currently of interest to us have one
    // type argument, equal to that of the nvvm intrinsic's argument.
    Type *Tys[] = {II->getArgOperand(0)->getType()};
    return CallInst::Create(
        Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
  }

  // Simplify to target-generic binary op.
  if (Action.BinaryOp)
    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
                                  II->getArgOperand(1), II->getName());

  // Simplify to target-generic cast op.
  if (Action.CastOp)
    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
                            II->getName());

  // All that's left are the special cases.
  if (!Action.Special)
    return nullptr;

  switch (*Action.Special) {
  case SPC_Reciprocal:
    // Simplify reciprocal: rcp(x) becomes 1 / x.
    return BinaryOperator::Create(
        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
        II->getArgOperand(0), II->getName());
  }
  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
}

/// Visit a va_end call: try to erase it together with a trivially matching
/// preceding va_start/va_copy. Always returns nullptr; the result of
/// removeTriviallyEmptyRange is not inspected here.
Instruction *InstCombiner::visitVAEndInst(VAEndInst &I) {
  removeTriviallyEmptyRange(I, *this, [](const IntrinsicInst &I) {
    return I.getIntrinsicID() == Intrinsic::vastart ||
           I.getIntrinsicID() == Intrinsic::vacopy;
  });
  return nullptr;
}

/// If argument 0 of Call is a constant and argument 1 is not, swap the two
/// arguments in place. Returns &Call if it swapped, nullptr otherwise. The
/// function does not check that the callee is commutative; that is presumably
/// the caller's responsibility.
static Instruction *canonicalizeConstantArg0ToArg1(CallInst &Call) {
  assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap");
  Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1);
  if (isa<Constant>(Arg0) && !isa<Constant>(Arg1)) {
    Call.setArgOperand(0, Arg1);
    Call.setArgOperand(1, Arg0);
    return &Call;
  }
  return nullptr;
}

/// Common fold for the *.with.overflow intrinsics: if OptimizeOverflowCheck
/// succeeds, return the tuple built by CreateOverflowTuple from the operation
/// result and overflow flag it produced; otherwise return nullptr. II must be
/// a WithOverflowInst (enforced by the cast).
Instruction *InstCombiner::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
  WithOverflowInst *WO = cast<WithOverflowInst>(II);
  Value *OperationResult = nullptr;
  Constant *OverflowResult = nullptr;
  if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
                            WO->getRHS(), *WO, OperationResult, OverflowResult))
    return CreateOverflowTuple(WO, OperationResult, OverflowResult);
  return nullptr;
}

/// CallInst simplification. This mostly only handles folding of intrinsic
/// instructions. For normal calls, it allows visitCallBase to do the heavy
/// lifting.
Instruction *InstCombiner::visitCallInst(CallInst &CI) {
  // Don't try to simplify calls without uses. It will not do anything useful,
  // but will result in the following folds being skipped.
1792 if (!CI.use_empty()) 1793 if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI))) 1794 return replaceInstUsesWith(CI, V); 1795 1796 if (isFreeCall(&CI, &TLI)) 1797 return visitFree(CI); 1798 1799 // If the caller function is nounwind, mark the call as nounwind, even if the 1800 // callee isn't. 1801 if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) { 1802 CI.setDoesNotThrow(); 1803 return &CI; 1804 } 1805 1806 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI); 1807 if (!II) return visitCallBase(CI); 1808 1809 // For atomic unordered mem intrinsics if len is not a positive or 1810 // not a multiple of element size then behavior is undefined. 1811 if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(II)) 1812 if (ConstantInt *NumBytes = dyn_cast<ConstantInt>(AMI->getLength())) 1813 if (NumBytes->getSExtValue() < 0 || 1814 (NumBytes->getZExtValue() % AMI->getElementSizeInBytes() != 0)) { 1815 CreateNonTerminatorUnreachable(AMI); 1816 assert(AMI->getType()->isVoidTy() && 1817 "non void atomic unordered mem intrinsic"); 1818 return eraseInstFromFunction(*AMI); 1819 } 1820 1821 // Intrinsics cannot occur in an invoke or a callbr, so handle them here 1822 // instead of in visitCallBase. 1823 if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) { 1824 bool Changed = false; 1825 1826 // memmove/cpy/set of zero bytes is a noop. 1827 if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) { 1828 if (NumBytes->isNullValue()) 1829 return eraseInstFromFunction(CI); 1830 1831 if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) 1832 if (CI->getZExtValue() == 1) { 1833 // Replace the instruction with just byte operations. We would 1834 // transform other cases to loads/stores, but we don't know if 1835 // alignment is sufficient. 1836 } 1837 } 1838 1839 // No other transformations apply to volatile transfers. 
1840 if (auto *M = dyn_cast<MemIntrinsic>(MI)) 1841 if (M->isVolatile()) 1842 return nullptr; 1843 1844 // If we have a memmove and the source operation is a constant global, 1845 // then the source and dest pointers can't alias, so we can change this 1846 // into a call to memcpy. 1847 if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) { 1848 if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource())) 1849 if (GVSrc->isConstant()) { 1850 Module *M = CI.getModule(); 1851 Intrinsic::ID MemCpyID = 1852 isa<AtomicMemMoveInst>(MMI) 1853 ? Intrinsic::memcpy_element_unordered_atomic 1854 : Intrinsic::memcpy; 1855 Type *Tys[3] = { CI.getArgOperand(0)->getType(), 1856 CI.getArgOperand(1)->getType(), 1857 CI.getArgOperand(2)->getType() }; 1858 CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys)); 1859 Changed = true; 1860 } 1861 } 1862 1863 if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) { 1864 // memmove(x,x,size) -> noop. 1865 if (MTI->getSource() == MTI->getDest()) 1866 return eraseInstFromFunction(CI); 1867 } 1868 1869 // If we can determine a pointer alignment that is bigger than currently 1870 // set, update the alignment. 1871 if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) { 1872 if (Instruction *I = SimplifyAnyMemTransfer(MTI)) 1873 return I; 1874 } else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) { 1875 if (Instruction *I = SimplifyAnyMemSet(MSI)) 1876 return I; 1877 } 1878 1879 if (Changed) return II; 1880 } 1881 1882 // For vector result intrinsics, use the generic demanded vector support. 
1883 if (II->getType()->isVectorTy()) { 1884 auto VWidth = II->getType()->getVectorNumElements(); 1885 APInt UndefElts(VWidth, 0); 1886 APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); 1887 if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) { 1888 if (V != II) 1889 return replaceInstUsesWith(*II, V); 1890 return II; 1891 } 1892 } 1893 1894 if (Instruction *I = SimplifyNVVMIntrinsic(II, *this)) 1895 return I; 1896 1897 auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, 1898 unsigned DemandedWidth) { 1899 APInt UndefElts(Width, 0); 1900 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); 1901 return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); 1902 }; 1903 1904 Intrinsic::ID IID = II->getIntrinsicID(); 1905 switch (IID) { 1906 default: break; 1907 case Intrinsic::objectsize: 1908 if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false)) 1909 return replaceInstUsesWith(CI, V); 1910 return nullptr; 1911 case Intrinsic::bswap: { 1912 Value *IIOperand = II->getArgOperand(0); 1913 Value *X = nullptr; 1914 1915 // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) 1916 if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) { 1917 unsigned C = X->getType()->getPrimitiveSizeInBits() - 1918 IIOperand->getType()->getPrimitiveSizeInBits(); 1919 Value *CV = ConstantInt::get(X->getType(), C); 1920 Value *V = Builder.CreateLShr(X, CV); 1921 return new TruncInst(V, IIOperand->getType()); 1922 } 1923 break; 1924 } 1925 case Intrinsic::masked_load: 1926 if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II)) 1927 return replaceInstUsesWith(CI, SimplifiedMaskedOp); 1928 break; 1929 case Intrinsic::masked_store: 1930 return simplifyMaskedStore(*II); 1931 case Intrinsic::masked_gather: 1932 return simplifyMaskedGather(*II); 1933 case Intrinsic::masked_scatter: 1934 return simplifyMaskedScatter(*II); 1935 case Intrinsic::launder_invariant_group: 1936 case Intrinsic::strip_invariant_group: 1937 if (auto 
*SkippedBarrier = simplifyInvariantGroupIntrinsic(*II, *this)) 1938 return replaceInstUsesWith(*II, SkippedBarrier); 1939 break; 1940 case Intrinsic::powi: 1941 if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 1942 // 0 and 1 are handled in instsimplify 1943 1944 // powi(x, -1) -> 1/x 1945 if (Power->isMinusOne()) 1946 return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0), 1947 II->getArgOperand(0)); 1948 // powi(x, 2) -> x*x 1949 if (Power->equalsInt(2)) 1950 return BinaryOperator::CreateFMul(II->getArgOperand(0), 1951 II->getArgOperand(0)); 1952 } 1953 break; 1954 1955 case Intrinsic::cttz: 1956 case Intrinsic::ctlz: 1957 if (auto *I = foldCttzCtlz(*II, *this)) 1958 return I; 1959 break; 1960 1961 case Intrinsic::ctpop: 1962 if (auto *I = foldCtpop(*II, *this)) 1963 return I; 1964 break; 1965 1966 case Intrinsic::fshl: 1967 case Intrinsic::fshr: { 1968 Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1); 1969 Type *Ty = II->getType(); 1970 unsigned BitWidth = Ty->getScalarSizeInBits(); 1971 Constant *ShAmtC; 1972 if (match(II->getArgOperand(2), m_Constant(ShAmtC)) && 1973 !isa<ConstantExpr>(ShAmtC) && !ShAmtC->containsConstantExpression()) { 1974 // Canonicalize a shift amount constant operand to modulo the bit-width. 1975 Constant *WidthC = ConstantInt::get(Ty, BitWidth); 1976 Constant *ModuloC = ConstantExpr::getURem(ShAmtC, WidthC); 1977 if (ModuloC != ShAmtC) 1978 return replaceOperand(*II, 2, ModuloC); 1979 1980 assert(ConstantExpr::getICmp(ICmpInst::ICMP_UGT, WidthC, ShAmtC) == 1981 ConstantInt::getTrue(CmpInst::makeCmpResultType(Ty)) && 1982 "Shift amount expected to be modulo bitwidth"); 1983 1984 // Canonicalize funnel shift right by constant to funnel shift left. This 1985 // is not entirely arbitrary. For historical reasons, the backend may 1986 // recognize rotate left patterns but miss rotate right patterns. 
1987 if (IID == Intrinsic::fshr) { 1988 // fshr X, Y, C --> fshl X, Y, (BitWidth - C) 1989 Constant *LeftShiftC = ConstantExpr::getSub(WidthC, ShAmtC); 1990 Module *Mod = II->getModule(); 1991 Function *Fshl = Intrinsic::getDeclaration(Mod, Intrinsic::fshl, Ty); 1992 return CallInst::Create(Fshl, { Op0, Op1, LeftShiftC }); 1993 } 1994 assert(IID == Intrinsic::fshl && 1995 "All funnel shifts by simple constants should go left"); 1996 1997 // fshl(X, 0, C) --> shl X, C 1998 // fshl(X, undef, C) --> shl X, C 1999 if (match(Op1, m_ZeroInt()) || match(Op1, m_Undef())) 2000 return BinaryOperator::CreateShl(Op0, ShAmtC); 2001 2002 // fshl(0, X, C) --> lshr X, (BW-C) 2003 // fshl(undef, X, C) --> lshr X, (BW-C) 2004 if (match(Op0, m_ZeroInt()) || match(Op0, m_Undef())) 2005 return BinaryOperator::CreateLShr(Op1, 2006 ConstantExpr::getSub(WidthC, ShAmtC)); 2007 2008 // fshl i16 X, X, 8 --> bswap i16 X (reduce to more-specific form) 2009 if (Op0 == Op1 && BitWidth == 16 && match(ShAmtC, m_SpecificInt(8))) { 2010 Module *Mod = II->getModule(); 2011 Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty); 2012 return CallInst::Create(Bswap, { Op0 }); 2013 } 2014 } 2015 2016 // Left or right might be masked. 2017 if (SimplifyDemandedInstructionBits(*II)) 2018 return &CI; 2019 2020 // The shift amount (operand 2) of a funnel shift is modulo the bitwidth, 2021 // so only the low bits of the shift amount are demanded if the bitwidth is 2022 // a power-of-2. 
2023 if (!isPowerOf2_32(BitWidth)) 2024 break; 2025 APInt Op2Demanded = APInt::getLowBitsSet(BitWidth, Log2_32_Ceil(BitWidth)); 2026 KnownBits Op2Known(BitWidth); 2027 if (SimplifyDemandedBits(II, 2, Op2Demanded, Op2Known)) 2028 return &CI; 2029 break; 2030 } 2031 case Intrinsic::uadd_with_overflow: 2032 case Intrinsic::sadd_with_overflow: { 2033 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2034 return I; 2035 if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) 2036 return I; 2037 2038 // Given 2 constant operands whose sum does not overflow: 2039 // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1 2040 // saddo (X +nsw C0), C1 -> saddo X, C0 + C1 2041 Value *X; 2042 const APInt *C0, *C1; 2043 Value *Arg0 = II->getArgOperand(0); 2044 Value *Arg1 = II->getArgOperand(1); 2045 bool IsSigned = IID == Intrinsic::sadd_with_overflow; 2046 bool HasNWAdd = IsSigned ? match(Arg0, m_NSWAdd(m_Value(X), m_APInt(C0))) 2047 : match(Arg0, m_NUWAdd(m_Value(X), m_APInt(C0))); 2048 if (HasNWAdd && match(Arg1, m_APInt(C1))) { 2049 bool Overflow; 2050 APInt NewC = 2051 IsSigned ? 
C1->sadd_ov(*C0, Overflow) : C1->uadd_ov(*C0, Overflow); 2052 if (!Overflow) 2053 return replaceInstUsesWith( 2054 *II, Builder.CreateBinaryIntrinsic( 2055 IID, X, ConstantInt::get(Arg1->getType(), NewC))); 2056 } 2057 break; 2058 } 2059 2060 case Intrinsic::umul_with_overflow: 2061 case Intrinsic::smul_with_overflow: 2062 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2063 return I; 2064 LLVM_FALLTHROUGH; 2065 2066 case Intrinsic::usub_with_overflow: 2067 if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) 2068 return I; 2069 break; 2070 2071 case Intrinsic::ssub_with_overflow: { 2072 if (Instruction *I = foldIntrinsicWithOverflowCommon(II)) 2073 return I; 2074 2075 Constant *C; 2076 Value *Arg0 = II->getArgOperand(0); 2077 Value *Arg1 = II->getArgOperand(1); 2078 // Given a constant C that is not the minimum signed value 2079 // for an integer of a given bit width: 2080 // 2081 // ssubo X, C -> saddo X, -C 2082 if (match(Arg1, m_Constant(C)) && C->isNotMinSignedValue()) { 2083 Value *NegVal = ConstantExpr::getNeg(C); 2084 // Build a saddo call that is equivalent to the discovered 2085 // ssubo call. 2086 return replaceInstUsesWith( 2087 *II, Builder.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow, 2088 Arg0, NegVal)); 2089 } 2090 2091 break; 2092 } 2093 2094 case Intrinsic::uadd_sat: 2095 case Intrinsic::sadd_sat: 2096 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2097 return I; 2098 LLVM_FALLTHROUGH; 2099 case Intrinsic::usub_sat: 2100 case Intrinsic::ssub_sat: { 2101 SaturatingInst *SI = cast<SaturatingInst>(II); 2102 Type *Ty = SI->getType(); 2103 Value *Arg0 = SI->getLHS(); 2104 Value *Arg1 = SI->getRHS(); 2105 2106 // Make use of known overflow information. 
2107 OverflowResult OR = computeOverflow(SI->getBinaryOp(), SI->isSigned(), 2108 Arg0, Arg1, SI); 2109 switch (OR) { 2110 case OverflowResult::MayOverflow: 2111 break; 2112 case OverflowResult::NeverOverflows: 2113 if (SI->isSigned()) 2114 return BinaryOperator::CreateNSW(SI->getBinaryOp(), Arg0, Arg1); 2115 else 2116 return BinaryOperator::CreateNUW(SI->getBinaryOp(), Arg0, Arg1); 2117 case OverflowResult::AlwaysOverflowsLow: { 2118 unsigned BitWidth = Ty->getScalarSizeInBits(); 2119 APInt Min = APSInt::getMinValue(BitWidth, !SI->isSigned()); 2120 return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Min)); 2121 } 2122 case OverflowResult::AlwaysOverflowsHigh: { 2123 unsigned BitWidth = Ty->getScalarSizeInBits(); 2124 APInt Max = APSInt::getMaxValue(BitWidth, !SI->isSigned()); 2125 return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Max)); 2126 } 2127 } 2128 2129 // ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN 2130 Constant *C; 2131 if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) && 2132 C->isNotMinSignedValue()) { 2133 Value *NegVal = ConstantExpr::getNeg(C); 2134 return replaceInstUsesWith( 2135 *II, Builder.CreateBinaryIntrinsic( 2136 Intrinsic::sadd_sat, Arg0, NegVal)); 2137 } 2138 2139 // sat(sat(X + Val2) + Val) -> sat(X + (Val+Val2)) 2140 // sat(sat(X - Val2) - Val) -> sat(X - (Val+Val2)) 2141 // if Val and Val2 have the same sign 2142 if (auto *Other = dyn_cast<IntrinsicInst>(Arg0)) { 2143 Value *X; 2144 const APInt *Val, *Val2; 2145 APInt NewVal; 2146 bool IsUnsigned = 2147 IID == Intrinsic::uadd_sat || IID == Intrinsic::usub_sat; 2148 if (Other->getIntrinsicID() == IID && 2149 match(Arg1, m_APInt(Val)) && 2150 match(Other->getArgOperand(0), m_Value(X)) && 2151 match(Other->getArgOperand(1), m_APInt(Val2))) { 2152 if (IsUnsigned) 2153 NewVal = Val->uadd_sat(*Val2); 2154 else if (Val->isNonNegative() == Val2->isNonNegative()) { 2155 bool Overflow; 2156 NewVal = Val->sadd_ov(*Val2, Overflow); 2157 if (Overflow) { 2158 // Both adds together may 
add more than SignedMaxValue 2159 // without saturating the final result. 2160 break; 2161 } 2162 } else { 2163 // Cannot fold saturated addition with different signs. 2164 break; 2165 } 2166 2167 return replaceInstUsesWith( 2168 *II, Builder.CreateBinaryIntrinsic( 2169 IID, X, ConstantInt::get(II->getType(), NewVal))); 2170 } 2171 } 2172 break; 2173 } 2174 2175 case Intrinsic::minnum: 2176 case Intrinsic::maxnum: 2177 case Intrinsic::minimum: 2178 case Intrinsic::maximum: { 2179 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2180 return I; 2181 Value *Arg0 = II->getArgOperand(0); 2182 Value *Arg1 = II->getArgOperand(1); 2183 Value *X, *Y; 2184 if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) && 2185 (Arg0->hasOneUse() || Arg1->hasOneUse())) { 2186 // If both operands are negated, invert the call and negate the result: 2187 // min(-X, -Y) --> -(max(X, Y)) 2188 // max(-X, -Y) --> -(min(X, Y)) 2189 Intrinsic::ID NewIID; 2190 switch (IID) { 2191 case Intrinsic::maxnum: 2192 NewIID = Intrinsic::minnum; 2193 break; 2194 case Intrinsic::minnum: 2195 NewIID = Intrinsic::maxnum; 2196 break; 2197 case Intrinsic::maximum: 2198 NewIID = Intrinsic::minimum; 2199 break; 2200 case Intrinsic::minimum: 2201 NewIID = Intrinsic::maximum; 2202 break; 2203 default: 2204 llvm_unreachable("unexpected intrinsic ID"); 2205 } 2206 Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II); 2207 Instruction *FNeg = UnaryOperator::CreateFNeg(NewCall); 2208 FNeg->copyIRFlags(II); 2209 return FNeg; 2210 } 2211 2212 // m(m(X, C2), C1) -> m(X, C) 2213 const APFloat *C1, *C2; 2214 if (auto *M = dyn_cast<IntrinsicInst>(Arg0)) { 2215 if (M->getIntrinsicID() == IID && match(Arg1, m_APFloat(C1)) && 2216 ((match(M->getArgOperand(0), m_Value(X)) && 2217 match(M->getArgOperand(1), m_APFloat(C2))) || 2218 (match(M->getArgOperand(1), m_Value(X)) && 2219 match(M->getArgOperand(0), m_APFloat(C2))))) { 2220 APFloat Res(0.0); 2221 switch (IID) { 2222 case 
Intrinsic::maxnum: 2223 Res = maxnum(*C1, *C2); 2224 break; 2225 case Intrinsic::minnum: 2226 Res = minnum(*C1, *C2); 2227 break; 2228 case Intrinsic::maximum: 2229 Res = maximum(*C1, *C2); 2230 break; 2231 case Intrinsic::minimum: 2232 Res = minimum(*C1, *C2); 2233 break; 2234 default: 2235 llvm_unreachable("unexpected intrinsic ID"); 2236 } 2237 Instruction *NewCall = Builder.CreateBinaryIntrinsic( 2238 IID, X, ConstantFP::get(Arg0->getType(), Res)); 2239 NewCall->copyIRFlags(II); 2240 return replaceInstUsesWith(*II, NewCall); 2241 } 2242 } 2243 2244 break; 2245 } 2246 case Intrinsic::fmuladd: { 2247 // Canonicalize fast fmuladd to the separate fmul + fadd. 2248 if (II->isFast()) { 2249 BuilderTy::FastMathFlagGuard Guard(Builder); 2250 Builder.setFastMathFlags(II->getFastMathFlags()); 2251 Value *Mul = Builder.CreateFMul(II->getArgOperand(0), 2252 II->getArgOperand(1)); 2253 Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2)); 2254 Add->takeName(II); 2255 return replaceInstUsesWith(*II, Add); 2256 } 2257 2258 // Try to simplify the underlying FMul. 
2259 if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1), 2260 II->getFastMathFlags(), 2261 SQ.getWithInstruction(II))) { 2262 auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); 2263 FAdd->copyFastMathFlags(II); 2264 return FAdd; 2265 } 2266 2267 LLVM_FALLTHROUGH; 2268 } 2269 case Intrinsic::fma: { 2270 if (Instruction *I = canonicalizeConstantArg0ToArg1(CI)) 2271 return I; 2272 2273 // fma fneg(x), fneg(y), z -> fma x, y, z 2274 Value *Src0 = II->getArgOperand(0); 2275 Value *Src1 = II->getArgOperand(1); 2276 Value *X, *Y; 2277 if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) { 2278 replaceOperand(*II, 0, X); 2279 replaceOperand(*II, 1, Y); 2280 return II; 2281 } 2282 2283 // fma fabs(x), fabs(x), z -> fma x, x, z 2284 if (match(Src0, m_FAbs(m_Value(X))) && 2285 match(Src1, m_FAbs(m_Specific(X)))) { 2286 replaceOperand(*II, 0, X); 2287 replaceOperand(*II, 1, X); 2288 return II; 2289 } 2290 2291 // Try to simplify the underlying FMul. We can only apply simplifications 2292 // that do not require rounding. 2293 if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1), 2294 II->getFastMathFlags(), 2295 SQ.getWithInstruction(II))) { 2296 auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); 2297 FAdd->copyFastMathFlags(II); 2298 return FAdd; 2299 } 2300 2301 break; 2302 } 2303 case Intrinsic::copysign: { 2304 if (SignBitMustBeZero(II->getArgOperand(1), &TLI)) { 2305 // If we know that the sign argument is positive, reduce to FABS: 2306 // copysign X, Pos --> fabs X 2307 Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, 2308 II->getArgOperand(0), II); 2309 return replaceInstUsesWith(*II, Fabs); 2310 } 2311 // TODO: There should be a ValueTracking sibling like SignBitMustBeOne. 
2312 const APFloat *C; 2313 if (match(II->getArgOperand(1), m_APFloat(C)) && C->isNegative()) { 2314 // If we know that the sign argument is negative, reduce to FNABS: 2315 // copysign X, Neg --> fneg (fabs X) 2316 Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, 2317 II->getArgOperand(0), II); 2318 return replaceInstUsesWith(*II, Builder.CreateFNegFMF(Fabs, II)); 2319 } 2320 2321 // Propagate sign argument through nested calls: 2322 // copysign X, (copysign ?, SignArg) --> copysign X, SignArg 2323 Value *SignArg; 2324 if (match(II->getArgOperand(1), 2325 m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(SignArg)))) 2326 return replaceOperand(*II, 1, SignArg); 2327 2328 break; 2329 } 2330 case Intrinsic::fabs: { 2331 Value *Cond; 2332 Constant *LHS, *RHS; 2333 if (match(II->getArgOperand(0), 2334 m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) { 2335 CallInst *Call0 = Builder.CreateCall(II->getCalledFunction(), {LHS}); 2336 CallInst *Call1 = Builder.CreateCall(II->getCalledFunction(), {RHS}); 2337 return SelectInst::Create(Cond, Call0, Call1); 2338 } 2339 2340 LLVM_FALLTHROUGH; 2341 } 2342 case Intrinsic::ceil: 2343 case Intrinsic::floor: 2344 case Intrinsic::round: 2345 case Intrinsic::nearbyint: 2346 case Intrinsic::rint: 2347 case Intrinsic::trunc: { 2348 Value *ExtSrc; 2349 if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) { 2350 // Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x) 2351 Value *NarrowII = Builder.CreateUnaryIntrinsic(IID, ExtSrc, II); 2352 return new FPExtInst(NarrowII, II->getType()); 2353 } 2354 break; 2355 } 2356 case Intrinsic::cos: 2357 case Intrinsic::amdgcn_cos: { 2358 Value *X; 2359 Value *Src = II->getArgOperand(0); 2360 if (match(Src, m_FNeg(m_Value(X))) || match(Src, m_FAbs(m_Value(X)))) { 2361 // cos(-x) -> cos(x) 2362 // cos(fabs(x)) -> cos(x) 2363 return replaceOperand(*II, 0, X); 2364 } 2365 break; 2366 } 2367 case Intrinsic::sin: { 2368 Value *X; 2369 if 
(match(II->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) { 2370 // sin(-x) --> -sin(x) 2371 Value *NewSin = Builder.CreateUnaryIntrinsic(Intrinsic::sin, X, II); 2372 Instruction *FNeg = UnaryOperator::CreateFNeg(NewSin); 2373 FNeg->copyFastMathFlags(II); 2374 return FNeg; 2375 } 2376 break; 2377 } 2378 case Intrinsic::ppc_altivec_lvx: 2379 case Intrinsic::ppc_altivec_lvxl: 2380 // Turn PPC lvx -> load if the pointer is known aligned. 2381 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, 2382 &DT) >= 16) { 2383 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2384 PointerType::getUnqual(II->getType())); 2385 return new LoadInst(II->getType(), Ptr); 2386 } 2387 break; 2388 case Intrinsic::ppc_vsx_lxvw4x: 2389 case Intrinsic::ppc_vsx_lxvd2x: { 2390 // Turn PPC VSX loads into normal loads. 2391 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2392 PointerType::getUnqual(II->getType())); 2393 return new LoadInst(II->getType(), Ptr, Twine(""), false, Align(1)); 2394 } 2395 case Intrinsic::ppc_altivec_stvx: 2396 case Intrinsic::ppc_altivec_stvxl: 2397 // Turn stvx -> store if the pointer is known aligned. 2398 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, 2399 &DT) >= 16) { 2400 Type *OpPtrTy = 2401 PointerType::getUnqual(II->getArgOperand(0)->getType()); 2402 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2403 return new StoreInst(II->getArgOperand(0), Ptr); 2404 } 2405 break; 2406 case Intrinsic::ppc_vsx_stxvw4x: 2407 case Intrinsic::ppc_vsx_stxvd2x: { 2408 // Turn PPC VSX stores into normal stores. 2409 Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); 2410 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2411 return new StoreInst(II->getArgOperand(0), Ptr, false, Align(1)); 2412 } 2413 case Intrinsic::ppc_qpx_qvlfs: 2414 // Turn PPC QPX qvlfs -> load if the pointer is known aligned. 
2415 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, 2416 &DT) >= 16) { 2417 Type *VTy = VectorType::get(Builder.getFloatTy(), 2418 II->getType()->getVectorNumElements()); 2419 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2420 PointerType::getUnqual(VTy)); 2421 Value *Load = Builder.CreateLoad(VTy, Ptr); 2422 return new FPExtInst(Load, II->getType()); 2423 } 2424 break; 2425 case Intrinsic::ppc_qpx_qvlfd: 2426 // Turn PPC QPX qvlfd -> load if the pointer is known aligned. 2427 if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC, 2428 &DT) >= 32) { 2429 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), 2430 PointerType::getUnqual(II->getType())); 2431 return new LoadInst(II->getType(), Ptr); 2432 } 2433 break; 2434 case Intrinsic::ppc_qpx_qvstfs: 2435 // Turn PPC QPX qvstfs -> store if the pointer is known aligned. 2436 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, 2437 &DT) >= 16) { 2438 Type *VTy = VectorType::get(Builder.getFloatTy(), 2439 II->getArgOperand(0)->getType()->getVectorNumElements()); 2440 Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy); 2441 Type *OpPtrTy = PointerType::getUnqual(VTy); 2442 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2443 return new StoreInst(TOp, Ptr); 2444 } 2445 break; 2446 case Intrinsic::ppc_qpx_qvstfd: 2447 // Turn PPC QPX qvstfd -> store if the pointer is known aligned. 
2448 if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, &AC, 2449 &DT) >= 32) { 2450 Type *OpPtrTy = 2451 PointerType::getUnqual(II->getArgOperand(0)->getType()); 2452 Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); 2453 return new StoreInst(II->getArgOperand(0), Ptr); 2454 } 2455 break; 2456 2457 case Intrinsic::x86_bmi_bextr_32: 2458 case Intrinsic::x86_bmi_bextr_64: 2459 case Intrinsic::x86_tbm_bextri_u32: 2460 case Intrinsic::x86_tbm_bextri_u64: 2461 // If the RHS is a constant we can try some simplifications. 2462 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2463 uint64_t Shift = C->getZExtValue(); 2464 uint64_t Length = (Shift >> 8) & 0xff; 2465 Shift &= 0xff; 2466 unsigned BitWidth = II->getType()->getIntegerBitWidth(); 2467 // If the length is 0 or the shift is out of range, replace with zero. 2468 if (Length == 0 || Shift >= BitWidth) 2469 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2470 // If the LHS is also a constant, we can completely constant fold this. 2471 if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2472 uint64_t Result = InC->getZExtValue() >> Shift; 2473 if (Length > BitWidth) 2474 Length = BitWidth; 2475 Result &= maskTrailingOnes<uint64_t>(Length); 2476 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2477 } 2478 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we 2479 // are only masking bits that a shift already cleared? 2480 } 2481 break; 2482 2483 case Intrinsic::x86_bmi_bzhi_32: 2484 case Intrinsic::x86_bmi_bzhi_64: 2485 // If the RHS is a constant we can try some simplifications. 
2486 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2487 uint64_t Index = C->getZExtValue() & 0xff; 2488 unsigned BitWidth = II->getType()->getIntegerBitWidth(); 2489 if (Index >= BitWidth) 2490 return replaceInstUsesWith(CI, II->getArgOperand(0)); 2491 if (Index == 0) 2492 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2493 // If the LHS is also a constant, we can completely constant fold this. 2494 if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2495 uint64_t Result = InC->getZExtValue(); 2496 Result &= maskTrailingOnes<uint64_t>(Index); 2497 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2498 } 2499 // TODO should we convert this to an AND if the RHS is constant? 2500 } 2501 break; 2502 case Intrinsic::x86_bmi_pext_32: 2503 case Intrinsic::x86_bmi_pext_64: 2504 if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2505 if (MaskC->isNullValue()) 2506 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2507 if (MaskC->isAllOnesValue()) 2508 return replaceInstUsesWith(CI, II->getArgOperand(0)); 2509 2510 if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2511 uint64_t Src = SrcC->getZExtValue(); 2512 uint64_t Mask = MaskC->getZExtValue(); 2513 uint64_t Result = 0; 2514 uint64_t BitToSet = 1; 2515 2516 while (Mask) { 2517 // Isolate lowest set bit. 2518 uint64_t BitToTest = Mask & -Mask; 2519 if (BitToTest & Src) 2520 Result |= BitToSet; 2521 2522 BitToSet <<= 1; 2523 // Clear lowest set bit. 
2524 Mask &= Mask - 1; 2525 } 2526 2527 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2528 } 2529 } 2530 break; 2531 case Intrinsic::x86_bmi_pdep_32: 2532 case Intrinsic::x86_bmi_pdep_64: 2533 if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { 2534 if (MaskC->isNullValue()) 2535 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); 2536 if (MaskC->isAllOnesValue()) 2537 return replaceInstUsesWith(CI, II->getArgOperand(0)); 2538 2539 if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { 2540 uint64_t Src = SrcC->getZExtValue(); 2541 uint64_t Mask = MaskC->getZExtValue(); 2542 uint64_t Result = 0; 2543 uint64_t BitToTest = 1; 2544 2545 while (Mask) { 2546 // Isolate lowest set bit. 2547 uint64_t BitToSet = Mask & -Mask; 2548 if (BitToTest & Src) 2549 Result |= BitToSet; 2550 2551 BitToTest <<= 1; 2552 // Clear lowest set bit; 2553 Mask &= Mask - 1; 2554 } 2555 2556 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); 2557 } 2558 } 2559 break; 2560 2561 case Intrinsic::x86_sse_cvtss2si: 2562 case Intrinsic::x86_sse_cvtss2si64: 2563 case Intrinsic::x86_sse_cvttss2si: 2564 case Intrinsic::x86_sse_cvttss2si64: 2565 case Intrinsic::x86_sse2_cvtsd2si: 2566 case Intrinsic::x86_sse2_cvtsd2si64: 2567 case Intrinsic::x86_sse2_cvttsd2si: 2568 case Intrinsic::x86_sse2_cvttsd2si64: 2569 case Intrinsic::x86_avx512_vcvtss2si32: 2570 case Intrinsic::x86_avx512_vcvtss2si64: 2571 case Intrinsic::x86_avx512_vcvtss2usi32: 2572 case Intrinsic::x86_avx512_vcvtss2usi64: 2573 case Intrinsic::x86_avx512_vcvtsd2si32: 2574 case Intrinsic::x86_avx512_vcvtsd2si64: 2575 case Intrinsic::x86_avx512_vcvtsd2usi32: 2576 case Intrinsic::x86_avx512_vcvtsd2usi64: 2577 case Intrinsic::x86_avx512_cvttss2si: 2578 case Intrinsic::x86_avx512_cvttss2si64: 2579 case Intrinsic::x86_avx512_cvttss2usi: 2580 case Intrinsic::x86_avx512_cvttss2usi64: 2581 case Intrinsic::x86_avx512_cvttsd2si: 2582 case 
Intrinsic::x86_avx512_cvttsd2si64: 2583 case Intrinsic::x86_avx512_cvttsd2usi: 2584 case Intrinsic::x86_avx512_cvttsd2usi64: { 2585 // These intrinsics only demand the 0th element of their input vectors. If 2586 // we can simplify the input based on that, do so now. 2587 Value *Arg = II->getArgOperand(0); 2588 unsigned VWidth = Arg->getType()->getVectorNumElements(); 2589 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { 2590 II->setArgOperand(0, V); 2591 return II; 2592 } 2593 break; 2594 } 2595 2596 case Intrinsic::x86_mmx_pmovmskb: 2597 case Intrinsic::x86_sse_movmsk_ps: 2598 case Intrinsic::x86_sse2_movmsk_pd: 2599 case Intrinsic::x86_sse2_pmovmskb_128: 2600 case Intrinsic::x86_avx_movmsk_pd_256: 2601 case Intrinsic::x86_avx_movmsk_ps_256: 2602 case Intrinsic::x86_avx2_pmovmskb: 2603 if (Value *V = simplifyX86movmsk(*II, Builder)) 2604 return replaceInstUsesWith(*II, V); 2605 break; 2606 2607 case Intrinsic::x86_sse_comieq_ss: 2608 case Intrinsic::x86_sse_comige_ss: 2609 case Intrinsic::x86_sse_comigt_ss: 2610 case Intrinsic::x86_sse_comile_ss: 2611 case Intrinsic::x86_sse_comilt_ss: 2612 case Intrinsic::x86_sse_comineq_ss: 2613 case Intrinsic::x86_sse_ucomieq_ss: 2614 case Intrinsic::x86_sse_ucomige_ss: 2615 case Intrinsic::x86_sse_ucomigt_ss: 2616 case Intrinsic::x86_sse_ucomile_ss: 2617 case Intrinsic::x86_sse_ucomilt_ss: 2618 case Intrinsic::x86_sse_ucomineq_ss: 2619 case Intrinsic::x86_sse2_comieq_sd: 2620 case Intrinsic::x86_sse2_comige_sd: 2621 case Intrinsic::x86_sse2_comigt_sd: 2622 case Intrinsic::x86_sse2_comile_sd: 2623 case Intrinsic::x86_sse2_comilt_sd: 2624 case Intrinsic::x86_sse2_comineq_sd: 2625 case Intrinsic::x86_sse2_ucomieq_sd: 2626 case Intrinsic::x86_sse2_ucomige_sd: 2627 case Intrinsic::x86_sse2_ucomigt_sd: 2628 case Intrinsic::x86_sse2_ucomile_sd: 2629 case Intrinsic::x86_sse2_ucomilt_sd: 2630 case Intrinsic::x86_sse2_ucomineq_sd: 2631 case Intrinsic::x86_avx512_vcomi_ss: 2632 case Intrinsic::x86_avx512_vcomi_sd: 2633 
case Intrinsic::x86_avx512_mask_cmp_ss: 2634 case Intrinsic::x86_avx512_mask_cmp_sd: { 2635 // These intrinsics only demand the 0th element of their input vectors. If 2636 // we can simplify the input based on that, do so now. 2637 bool MadeChange = false; 2638 Value *Arg0 = II->getArgOperand(0); 2639 Value *Arg1 = II->getArgOperand(1); 2640 unsigned VWidth = Arg0->getType()->getVectorNumElements(); 2641 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { 2642 II->setArgOperand(0, V); 2643 MadeChange = true; 2644 } 2645 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { 2646 II->setArgOperand(1, V); 2647 MadeChange = true; 2648 } 2649 if (MadeChange) 2650 return II; 2651 break; 2652 } 2653 case Intrinsic::x86_avx512_cmp_pd_128: 2654 case Intrinsic::x86_avx512_cmp_pd_256: 2655 case Intrinsic::x86_avx512_cmp_pd_512: 2656 case Intrinsic::x86_avx512_cmp_ps_128: 2657 case Intrinsic::x86_avx512_cmp_ps_256: 2658 case Intrinsic::x86_avx512_cmp_ps_512: { 2659 // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a) 2660 Value *Arg0 = II->getArgOperand(0); 2661 Value *Arg1 = II->getArgOperand(1); 2662 bool Arg0IsZero = match(Arg0, m_PosZeroFP()); 2663 if (Arg0IsZero) 2664 std::swap(Arg0, Arg1); 2665 Value *A, *B; 2666 // This fold requires only the NINF(not +/- inf) since inf minus 2667 // inf is nan. 2668 // NSZ(No Signed Zeros) is not needed because zeros of any sign are 2669 // equal for both compares. 2670 // NNAN is not needed because nans compare the same for both compares. 2671 // The compare intrinsic uses the above assumptions and therefore 2672 // doesn't require additional flags. 
2673 if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) && 2674 match(Arg1, m_PosZeroFP()) && isa<Instruction>(Arg0) && 2675 cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) { 2676 if (Arg0IsZero) 2677 std::swap(A, B); 2678 replaceOperand(*II, 0, A); 2679 replaceOperand(*II, 1, B); 2680 return II; 2681 } 2682 break; 2683 } 2684 2685 case Intrinsic::x86_avx512_add_ps_512: 2686 case Intrinsic::x86_avx512_div_ps_512: 2687 case Intrinsic::x86_avx512_mul_ps_512: 2688 case Intrinsic::x86_avx512_sub_ps_512: 2689 case Intrinsic::x86_avx512_add_pd_512: 2690 case Intrinsic::x86_avx512_div_pd_512: 2691 case Intrinsic::x86_avx512_mul_pd_512: 2692 case Intrinsic::x86_avx512_sub_pd_512: 2693 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2694 // IR operations. 2695 if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(2))) { 2696 if (R->getValue() == 4) { 2697 Value *Arg0 = II->getArgOperand(0); 2698 Value *Arg1 = II->getArgOperand(1); 2699 2700 Value *V; 2701 switch (IID) { 2702 default: llvm_unreachable("Case stmts out of sync!"); 2703 case Intrinsic::x86_avx512_add_ps_512: 2704 case Intrinsic::x86_avx512_add_pd_512: 2705 V = Builder.CreateFAdd(Arg0, Arg1); 2706 break; 2707 case Intrinsic::x86_avx512_sub_ps_512: 2708 case Intrinsic::x86_avx512_sub_pd_512: 2709 V = Builder.CreateFSub(Arg0, Arg1); 2710 break; 2711 case Intrinsic::x86_avx512_mul_ps_512: 2712 case Intrinsic::x86_avx512_mul_pd_512: 2713 V = Builder.CreateFMul(Arg0, Arg1); 2714 break; 2715 case Intrinsic::x86_avx512_div_ps_512: 2716 case Intrinsic::x86_avx512_div_pd_512: 2717 V = Builder.CreateFDiv(Arg0, Arg1); 2718 break; 2719 } 2720 2721 return replaceInstUsesWith(*II, V); 2722 } 2723 } 2724 break; 2725 2726 case Intrinsic::x86_avx512_mask_add_ss_round: 2727 case Intrinsic::x86_avx512_mask_div_ss_round: 2728 case Intrinsic::x86_avx512_mask_mul_ss_round: 2729 case Intrinsic::x86_avx512_mask_sub_ss_round: 2730 case Intrinsic::x86_avx512_mask_add_sd_round: 2731 case 
Intrinsic::x86_avx512_mask_div_sd_round: 2732 case Intrinsic::x86_avx512_mask_mul_sd_round: 2733 case Intrinsic::x86_avx512_mask_sub_sd_round: 2734 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2735 // IR operations. 2736 if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) { 2737 if (R->getValue() == 4) { 2738 // Extract the element as scalars. 2739 Value *Arg0 = II->getArgOperand(0); 2740 Value *Arg1 = II->getArgOperand(1); 2741 Value *LHS = Builder.CreateExtractElement(Arg0, (uint64_t)0); 2742 Value *RHS = Builder.CreateExtractElement(Arg1, (uint64_t)0); 2743 2744 Value *V; 2745 switch (IID) { 2746 default: llvm_unreachable("Case stmts out of sync!"); 2747 case Intrinsic::x86_avx512_mask_add_ss_round: 2748 case Intrinsic::x86_avx512_mask_add_sd_round: 2749 V = Builder.CreateFAdd(LHS, RHS); 2750 break; 2751 case Intrinsic::x86_avx512_mask_sub_ss_round: 2752 case Intrinsic::x86_avx512_mask_sub_sd_round: 2753 V = Builder.CreateFSub(LHS, RHS); 2754 break; 2755 case Intrinsic::x86_avx512_mask_mul_ss_round: 2756 case Intrinsic::x86_avx512_mask_mul_sd_round: 2757 V = Builder.CreateFMul(LHS, RHS); 2758 break; 2759 case Intrinsic::x86_avx512_mask_div_ss_round: 2760 case Intrinsic::x86_avx512_mask_div_sd_round: 2761 V = Builder.CreateFDiv(LHS, RHS); 2762 break; 2763 } 2764 2765 // Handle the masking aspect of the intrinsic. 2766 Value *Mask = II->getArgOperand(3); 2767 auto *C = dyn_cast<ConstantInt>(Mask); 2768 // We don't need a select if we know the mask bit is a 1. 2769 if (!C || !C->getValue()[0]) { 2770 // Cast the mask to an i1 vector and then extract the lowest element. 2771 auto *MaskTy = VectorType::get(Builder.getInt1Ty(), 2772 cast<IntegerType>(Mask->getType())->getBitWidth()); 2773 Mask = Builder.CreateBitCast(Mask, MaskTy); 2774 Mask = Builder.CreateExtractElement(Mask, (uint64_t)0); 2775 // Extract the lowest element from the passthru operand. 
2776 Value *Passthru = Builder.CreateExtractElement(II->getArgOperand(2), 2777 (uint64_t)0); 2778 V = Builder.CreateSelect(Mask, V, Passthru); 2779 } 2780 2781 // Insert the result back into the original argument 0. 2782 V = Builder.CreateInsertElement(Arg0, V, (uint64_t)0); 2783 2784 return replaceInstUsesWith(*II, V); 2785 } 2786 } 2787 break; 2788 2789 // Constant fold ashr( <A x Bi>, Ci ). 2790 // Constant fold lshr( <A x Bi>, Ci ). 2791 // Constant fold shl( <A x Bi>, Ci ). 2792 case Intrinsic::x86_sse2_psrai_d: 2793 case Intrinsic::x86_sse2_psrai_w: 2794 case Intrinsic::x86_avx2_psrai_d: 2795 case Intrinsic::x86_avx2_psrai_w: 2796 case Intrinsic::x86_avx512_psrai_q_128: 2797 case Intrinsic::x86_avx512_psrai_q_256: 2798 case Intrinsic::x86_avx512_psrai_d_512: 2799 case Intrinsic::x86_avx512_psrai_q_512: 2800 case Intrinsic::x86_avx512_psrai_w_512: 2801 case Intrinsic::x86_sse2_psrli_d: 2802 case Intrinsic::x86_sse2_psrli_q: 2803 case Intrinsic::x86_sse2_psrli_w: 2804 case Intrinsic::x86_avx2_psrli_d: 2805 case Intrinsic::x86_avx2_psrli_q: 2806 case Intrinsic::x86_avx2_psrli_w: 2807 case Intrinsic::x86_avx512_psrli_d_512: 2808 case Intrinsic::x86_avx512_psrli_q_512: 2809 case Intrinsic::x86_avx512_psrli_w_512: 2810 case Intrinsic::x86_sse2_pslli_d: 2811 case Intrinsic::x86_sse2_pslli_q: 2812 case Intrinsic::x86_sse2_pslli_w: 2813 case Intrinsic::x86_avx2_pslli_d: 2814 case Intrinsic::x86_avx2_pslli_q: 2815 case Intrinsic::x86_avx2_pslli_w: 2816 case Intrinsic::x86_avx512_pslli_d_512: 2817 case Intrinsic::x86_avx512_pslli_q_512: 2818 case Intrinsic::x86_avx512_pslli_w_512: 2819 if (Value *V = simplifyX86immShift(*II, Builder)) 2820 return replaceInstUsesWith(*II, V); 2821 break; 2822 2823 case Intrinsic::x86_sse2_psra_d: 2824 case Intrinsic::x86_sse2_psra_w: 2825 case Intrinsic::x86_avx2_psra_d: 2826 case Intrinsic::x86_avx2_psra_w: 2827 case Intrinsic::x86_avx512_psra_q_128: 2828 case Intrinsic::x86_avx512_psra_q_256: 2829 case Intrinsic::x86_avx512_psra_d_512: 
2830 case Intrinsic::x86_avx512_psra_q_512: 2831 case Intrinsic::x86_avx512_psra_w_512: 2832 case Intrinsic::x86_sse2_psrl_d: 2833 case Intrinsic::x86_sse2_psrl_q: 2834 case Intrinsic::x86_sse2_psrl_w: 2835 case Intrinsic::x86_avx2_psrl_d: 2836 case Intrinsic::x86_avx2_psrl_q: 2837 case Intrinsic::x86_avx2_psrl_w: 2838 case Intrinsic::x86_avx512_psrl_d_512: 2839 case Intrinsic::x86_avx512_psrl_q_512: 2840 case Intrinsic::x86_avx512_psrl_w_512: 2841 case Intrinsic::x86_sse2_psll_d: 2842 case Intrinsic::x86_sse2_psll_q: 2843 case Intrinsic::x86_sse2_psll_w: 2844 case Intrinsic::x86_avx2_psll_d: 2845 case Intrinsic::x86_avx2_psll_q: 2846 case Intrinsic::x86_avx2_psll_w: 2847 case Intrinsic::x86_avx512_psll_d_512: 2848 case Intrinsic::x86_avx512_psll_q_512: 2849 case Intrinsic::x86_avx512_psll_w_512: { 2850 if (Value *V = simplifyX86immShift(*II, Builder)) 2851 return replaceInstUsesWith(*II, V); 2852 2853 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector 2854 // operand to compute the shift amount. 
2855 Value *Arg1 = II->getArgOperand(1); 2856 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && 2857 "Unexpected packed shift size"); 2858 unsigned VWidth = Arg1->getType()->getVectorNumElements(); 2859 2860 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { 2861 II->setArgOperand(1, V); 2862 return II; 2863 } 2864 break; 2865 } 2866 2867 case Intrinsic::x86_avx2_psllv_d: 2868 case Intrinsic::x86_avx2_psllv_d_256: 2869 case Intrinsic::x86_avx2_psllv_q: 2870 case Intrinsic::x86_avx2_psllv_q_256: 2871 case Intrinsic::x86_avx512_psllv_d_512: 2872 case Intrinsic::x86_avx512_psllv_q_512: 2873 case Intrinsic::x86_avx512_psllv_w_128: 2874 case Intrinsic::x86_avx512_psllv_w_256: 2875 case Intrinsic::x86_avx512_psllv_w_512: 2876 case Intrinsic::x86_avx2_psrav_d: 2877 case Intrinsic::x86_avx2_psrav_d_256: 2878 case Intrinsic::x86_avx512_psrav_q_128: 2879 case Intrinsic::x86_avx512_psrav_q_256: 2880 case Intrinsic::x86_avx512_psrav_d_512: 2881 case Intrinsic::x86_avx512_psrav_q_512: 2882 case Intrinsic::x86_avx512_psrav_w_128: 2883 case Intrinsic::x86_avx512_psrav_w_256: 2884 case Intrinsic::x86_avx512_psrav_w_512: 2885 case Intrinsic::x86_avx2_psrlv_d: 2886 case Intrinsic::x86_avx2_psrlv_d_256: 2887 case Intrinsic::x86_avx2_psrlv_q: 2888 case Intrinsic::x86_avx2_psrlv_q_256: 2889 case Intrinsic::x86_avx512_psrlv_d_512: 2890 case Intrinsic::x86_avx512_psrlv_q_512: 2891 case Intrinsic::x86_avx512_psrlv_w_128: 2892 case Intrinsic::x86_avx512_psrlv_w_256: 2893 case Intrinsic::x86_avx512_psrlv_w_512: 2894 if (Value *V = simplifyX86varShift(*II, Builder)) 2895 return replaceInstUsesWith(*II, V); 2896 break; 2897 2898 case Intrinsic::x86_sse2_packssdw_128: 2899 case Intrinsic::x86_sse2_packsswb_128: 2900 case Intrinsic::x86_avx2_packssdw: 2901 case Intrinsic::x86_avx2_packsswb: 2902 case Intrinsic::x86_avx512_packssdw_512: 2903 case Intrinsic::x86_avx512_packsswb_512: 2904 if (Value *V = simplifyX86pack(*II, Builder, true)) 2905 return 
replaceInstUsesWith(*II, V); 2906 break; 2907 2908 case Intrinsic::x86_sse2_packuswb_128: 2909 case Intrinsic::x86_sse41_packusdw: 2910 case Intrinsic::x86_avx2_packusdw: 2911 case Intrinsic::x86_avx2_packuswb: 2912 case Intrinsic::x86_avx512_packusdw_512: 2913 case Intrinsic::x86_avx512_packuswb_512: 2914 if (Value *V = simplifyX86pack(*II, Builder, false)) 2915 return replaceInstUsesWith(*II, V); 2916 break; 2917 2918 case Intrinsic::x86_pclmulqdq: 2919 case Intrinsic::x86_pclmulqdq_256: 2920 case Intrinsic::x86_pclmulqdq_512: { 2921 if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) { 2922 unsigned Imm = C->getZExtValue(); 2923 2924 bool MadeChange = false; 2925 Value *Arg0 = II->getArgOperand(0); 2926 Value *Arg1 = II->getArgOperand(1); 2927 unsigned VWidth = Arg0->getType()->getVectorNumElements(); 2928 2929 APInt UndefElts1(VWidth, 0); 2930 APInt DemandedElts1 = APInt::getSplat(VWidth, 2931 APInt(2, (Imm & 0x01) ? 2 : 1)); 2932 if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts1, 2933 UndefElts1)) { 2934 II->setArgOperand(0, V); 2935 MadeChange = true; 2936 } 2937 2938 APInt UndefElts2(VWidth, 0); 2939 APInt DemandedElts2 = APInt::getSplat(VWidth, 2940 APInt(2, (Imm & 0x10) ? 2 : 1)); 2941 if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts2, 2942 UndefElts2)) { 2943 II->setArgOperand(1, V); 2944 MadeChange = true; 2945 } 2946 2947 // If either input elements are undef, the result is zero. 
2948 if (DemandedElts1.isSubsetOf(UndefElts1) || 2949 DemandedElts2.isSubsetOf(UndefElts2)) 2950 return replaceInstUsesWith(*II, 2951 ConstantAggregateZero::get(II->getType())); 2952 2953 if (MadeChange) 2954 return II; 2955 } 2956 break; 2957 } 2958 2959 case Intrinsic::x86_sse41_insertps: 2960 if (Value *V = simplifyX86insertps(*II, Builder)) 2961 return replaceInstUsesWith(*II, V); 2962 break; 2963 2964 case Intrinsic::x86_sse4a_extrq: { 2965 Value *Op0 = II->getArgOperand(0); 2966 Value *Op1 = II->getArgOperand(1); 2967 unsigned VWidth0 = Op0->getType()->getVectorNumElements(); 2968 unsigned VWidth1 = Op1->getType()->getVectorNumElements(); 2969 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2970 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2971 VWidth1 == 16 && "Unexpected operand sizes"); 2972 2973 // See if we're dealing with constant values. 2974 Constant *C1 = dyn_cast<Constant>(Op1); 2975 ConstantInt *CILength = 2976 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 2977 : nullptr; 2978 ConstantInt *CIIndex = 2979 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2980 : nullptr; 2981 2982 // Attempt to simplify to a constant, shuffle vector or EXTRQI call. 2983 if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) 2984 return replaceInstUsesWith(*II, V); 2985 2986 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector 2987 // operands and the lowest 16-bits of the second. 2988 bool MadeChange = false; 2989 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2990 II->setArgOperand(0, V); 2991 MadeChange = true; 2992 } 2993 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { 2994 II->setArgOperand(1, V); 2995 MadeChange = true; 2996 } 2997 if (MadeChange) 2998 return II; 2999 break; 3000 } 3001 3002 case Intrinsic::x86_sse4a_extrqi: { 3003 // EXTRQI: Extract Length bits starting from Index. 
Zero pad the remaining 3004 // bits of the lower 64-bits. The upper 64-bits are undefined. 3005 Value *Op0 = II->getArgOperand(0); 3006 unsigned VWidth = Op0->getType()->getVectorNumElements(); 3007 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 3008 "Unexpected operand size"); 3009 3010 // See if we're dealing with constant values. 3011 ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1)); 3012 ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3013 3014 // Attempt to simplify to a constant or shuffle vector. 3015 if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) 3016 return replaceInstUsesWith(*II, V); 3017 3018 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector 3019 // operand. 3020 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 3021 II->setArgOperand(0, V); 3022 return II; 3023 } 3024 break; 3025 } 3026 3027 case Intrinsic::x86_sse4a_insertq: { 3028 Value *Op0 = II->getArgOperand(0); 3029 Value *Op1 = II->getArgOperand(1); 3030 unsigned VWidth = Op0->getType()->getVectorNumElements(); 3031 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 3032 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 3033 Op1->getType()->getVectorNumElements() == 2 && 3034 "Unexpected operand size"); 3035 3036 // See if we're dealing with constant values. 3037 Constant *C1 = dyn_cast<Constant>(Op1); 3038 ConstantInt *CI11 = 3039 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 3040 : nullptr; 3041 3042 // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 3043 if (CI11) { 3044 const APInt &V11 = CI11->getValue(); 3045 APInt Len = V11.zextOrTrunc(6); 3046 APInt Idx = V11.lshr(8).zextOrTrunc(6); 3047 if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) 3048 return replaceInstUsesWith(*II, V); 3049 } 3050 3051 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector 3052 // operand. 
3053 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 3054 II->setArgOperand(0, V); 3055 return II; 3056 } 3057 break; 3058 } 3059 3060 case Intrinsic::x86_sse4a_insertqi: { 3061 // INSERTQI: Extract lowest Length bits from lower half of second source and 3062 // insert over first source starting at Index bit. The upper 64-bits are 3063 // undefined. 3064 Value *Op0 = II->getArgOperand(0); 3065 Value *Op1 = II->getArgOperand(1); 3066 unsigned VWidth0 = Op0->getType()->getVectorNumElements(); 3067 unsigned VWidth1 = Op1->getType()->getVectorNumElements(); 3068 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 3069 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 3070 VWidth1 == 2 && "Unexpected operand sizes"); 3071 3072 // See if we're dealing with constant values. 3073 ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3074 ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3)); 3075 3076 // Attempt to simplify to a constant or shuffle vector. 3077 if (CILength && CIIndex) { 3078 APInt Len = CILength->getValue().zextOrTrunc(6); 3079 APInt Idx = CIIndex->getValue().zextOrTrunc(6); 3080 if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) 3081 return replaceInstUsesWith(*II, V); 3082 } 3083 3084 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector 3085 // operands. 
3086 bool MadeChange = false; 3087 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 3088 II->setArgOperand(0, V); 3089 MadeChange = true; 3090 } 3091 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { 3092 II->setArgOperand(1, V); 3093 MadeChange = true; 3094 } 3095 if (MadeChange) 3096 return II; 3097 break; 3098 } 3099 3100 case Intrinsic::x86_sse41_pblendvb: 3101 case Intrinsic::x86_sse41_blendvps: 3102 case Intrinsic::x86_sse41_blendvpd: 3103 case Intrinsic::x86_avx_blendv_ps_256: 3104 case Intrinsic::x86_avx_blendv_pd_256: 3105 case Intrinsic::x86_avx2_pblendvb: { 3106 // fold (blend A, A, Mask) -> A 3107 Value *Op0 = II->getArgOperand(0); 3108 Value *Op1 = II->getArgOperand(1); 3109 Value *Mask = II->getArgOperand(2); 3110 if (Op0 == Op1) 3111 return replaceInstUsesWith(CI, Op0); 3112 3113 // Zero Mask - select 1st argument. 3114 if (isa<ConstantAggregateZero>(Mask)) 3115 return replaceInstUsesWith(CI, Op0); 3116 3117 // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 3118 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { 3119 Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); 3120 return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); 3121 } 3122 3123 // Convert to a vector select if we can bypass casts and find a boolean 3124 // vector condition value. 
3125 Value *BoolVec; 3126 Mask = peekThroughBitcast(Mask); 3127 if (match(Mask, m_SExt(m_Value(BoolVec))) && 3128 BoolVec->getType()->isVectorTy() && 3129 BoolVec->getType()->getScalarSizeInBits() == 1) { 3130 assert(Mask->getType()->getPrimitiveSizeInBits() == 3131 II->getType()->getPrimitiveSizeInBits() && 3132 "Not expecting mask and operands with different sizes"); 3133 3134 unsigned NumMaskElts = Mask->getType()->getVectorNumElements(); 3135 unsigned NumOperandElts = II->getType()->getVectorNumElements(); 3136 if (NumMaskElts == NumOperandElts) 3137 return SelectInst::Create(BoolVec, Op1, Op0); 3138 3139 // If the mask has less elements than the operands, each mask bit maps to 3140 // multiple elements of the operands. Bitcast back and forth. 3141 if (NumMaskElts < NumOperandElts) { 3142 Value *CastOp0 = Builder.CreateBitCast(Op0, Mask->getType()); 3143 Value *CastOp1 = Builder.CreateBitCast(Op1, Mask->getType()); 3144 Value *Sel = Builder.CreateSelect(BoolVec, CastOp1, CastOp0); 3145 return new BitCastInst(Sel, II->getType()); 3146 } 3147 } 3148 3149 break; 3150 } 3151 3152 case Intrinsic::x86_ssse3_pshuf_b_128: 3153 case Intrinsic::x86_avx2_pshuf_b: 3154 case Intrinsic::x86_avx512_pshuf_b_512: 3155 if (Value *V = simplifyX86pshufb(*II, Builder)) 3156 return replaceInstUsesWith(*II, V); 3157 break; 3158 3159 case Intrinsic::x86_avx_vpermilvar_ps: 3160 case Intrinsic::x86_avx_vpermilvar_ps_256: 3161 case Intrinsic::x86_avx512_vpermilvar_ps_512: 3162 case Intrinsic::x86_avx_vpermilvar_pd: 3163 case Intrinsic::x86_avx_vpermilvar_pd_256: 3164 case Intrinsic::x86_avx512_vpermilvar_pd_512: 3165 if (Value *V = simplifyX86vpermilvar(*II, Builder)) 3166 return replaceInstUsesWith(*II, V); 3167 break; 3168 3169 case Intrinsic::x86_avx2_permd: 3170 case Intrinsic::x86_avx2_permps: 3171 case Intrinsic::x86_avx512_permvar_df_256: 3172 case Intrinsic::x86_avx512_permvar_df_512: 3173 case Intrinsic::x86_avx512_permvar_di_256: 3174 case Intrinsic::x86_avx512_permvar_di_512: 
3175 case Intrinsic::x86_avx512_permvar_hi_128: 3176 case Intrinsic::x86_avx512_permvar_hi_256: 3177 case Intrinsic::x86_avx512_permvar_hi_512: 3178 case Intrinsic::x86_avx512_permvar_qi_128: 3179 case Intrinsic::x86_avx512_permvar_qi_256: 3180 case Intrinsic::x86_avx512_permvar_qi_512: 3181 case Intrinsic::x86_avx512_permvar_sf_512: 3182 case Intrinsic::x86_avx512_permvar_si_512: 3183 if (Value *V = simplifyX86vpermv(*II, Builder)) 3184 return replaceInstUsesWith(*II, V); 3185 break; 3186 3187 case Intrinsic::x86_avx_maskload_ps: 3188 case Intrinsic::x86_avx_maskload_pd: 3189 case Intrinsic::x86_avx_maskload_ps_256: 3190 case Intrinsic::x86_avx_maskload_pd_256: 3191 case Intrinsic::x86_avx2_maskload_d: 3192 case Intrinsic::x86_avx2_maskload_q: 3193 case Intrinsic::x86_avx2_maskload_d_256: 3194 case Intrinsic::x86_avx2_maskload_q_256: 3195 if (Instruction *I = simplifyX86MaskedLoad(*II, *this)) 3196 return I; 3197 break; 3198 3199 case Intrinsic::x86_sse2_maskmov_dqu: 3200 case Intrinsic::x86_avx_maskstore_ps: 3201 case Intrinsic::x86_avx_maskstore_pd: 3202 case Intrinsic::x86_avx_maskstore_ps_256: 3203 case Intrinsic::x86_avx_maskstore_pd_256: 3204 case Intrinsic::x86_avx2_maskstore_d: 3205 case Intrinsic::x86_avx2_maskstore_q: 3206 case Intrinsic::x86_avx2_maskstore_d_256: 3207 case Intrinsic::x86_avx2_maskstore_q_256: 3208 if (simplifyX86MaskedStore(*II, *this)) 3209 return nullptr; 3210 break; 3211 3212 case Intrinsic::x86_addcarry_32: 3213 case Intrinsic::x86_addcarry_64: 3214 if (Value *V = simplifyX86addcarry(*II, Builder)) 3215 return replaceInstUsesWith(*II, V); 3216 break; 3217 3218 case Intrinsic::ppc_altivec_vperm: 3219 // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. 3220 // Note that ppc_altivec_vperm has a big-endian bias, so when creating 3221 // a vectorshuffle for little endian, we must undo the transformation 3222 // performed on vec_perm in altivec.h. 
That is, we must complement 3223 // the permutation mask with respect to 31 and reverse the order of 3224 // V1 and V2. 3225 if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) { 3226 assert(Mask->getType()->getVectorNumElements() == 16 && 3227 "Bad type for intrinsic!"); 3228 3229 // Check that all of the elements are integer constants or undefs. 3230 bool AllEltsOk = true; 3231 for (unsigned i = 0; i != 16; ++i) { 3232 Constant *Elt = Mask->getAggregateElement(i); 3233 if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) { 3234 AllEltsOk = false; 3235 break; 3236 } 3237 } 3238 3239 if (AllEltsOk) { 3240 // Cast the input vectors to byte vectors. 3241 Value *Op0 = Builder.CreateBitCast(II->getArgOperand(0), 3242 Mask->getType()); 3243 Value *Op1 = Builder.CreateBitCast(II->getArgOperand(1), 3244 Mask->getType()); 3245 Value *Result = UndefValue::get(Op0->getType()); 3246 3247 // Only extract each element once. 3248 Value *ExtractedElts[32]; 3249 memset(ExtractedElts, 0, sizeof(ExtractedElts)); 3250 3251 for (unsigned i = 0; i != 16; ++i) { 3252 if (isa<UndefValue>(Mask->getAggregateElement(i))) 3253 continue; 3254 unsigned Idx = 3255 cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue(); 3256 Idx &= 31; // Match the hardware behavior. 3257 if (DL.isLittleEndian()) 3258 Idx = 31 - Idx; 3259 3260 if (!ExtractedElts[Idx]) { 3261 Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0; 3262 Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1; 3263 ExtractedElts[Idx] = 3264 Builder.CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse, 3265 Builder.getInt32(Idx&15)); 3266 } 3267 3268 // Insert this value into the result vector. 
3269 Result = Builder.CreateInsertElement(Result, ExtractedElts[Idx], 3270 Builder.getInt32(i)); 3271 } 3272 return CastInst::Create(Instruction::BitCast, Result, CI.getType()); 3273 } 3274 } 3275 break; 3276 3277 case Intrinsic::arm_neon_vld1: { 3278 unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), 3279 DL, II, &AC, &DT); 3280 if (Value *V = simplifyNeonVld1(*II, MemAlign, Builder)) 3281 return replaceInstUsesWith(*II, V); 3282 break; 3283 } 3284 3285 case Intrinsic::arm_neon_vld2: 3286 case Intrinsic::arm_neon_vld3: 3287 case Intrinsic::arm_neon_vld4: 3288 case Intrinsic::arm_neon_vld2lane: 3289 case Intrinsic::arm_neon_vld3lane: 3290 case Intrinsic::arm_neon_vld4lane: 3291 case Intrinsic::arm_neon_vst1: 3292 case Intrinsic::arm_neon_vst2: 3293 case Intrinsic::arm_neon_vst3: 3294 case Intrinsic::arm_neon_vst4: 3295 case Intrinsic::arm_neon_vst2lane: 3296 case Intrinsic::arm_neon_vst3lane: 3297 case Intrinsic::arm_neon_vst4lane: { 3298 unsigned MemAlign = 3299 getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT); 3300 unsigned AlignArg = II->getNumArgOperands() - 1; 3301 ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg)); 3302 if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) 3303 return replaceOperand(*II, AlignArg, 3304 ConstantInt::get(Type::getInt32Ty(II->getContext()), 3305 MemAlign, false)); 3306 break; 3307 } 3308 3309 case Intrinsic::arm_neon_vtbl1: 3310 case Intrinsic::aarch64_neon_tbl1: 3311 if (Value *V = simplifyNeonTbl1(*II, Builder)) 3312 return replaceInstUsesWith(*II, V); 3313 break; 3314 3315 case Intrinsic::arm_neon_vmulls: 3316 case Intrinsic::arm_neon_vmullu: 3317 case Intrinsic::aarch64_neon_smull: 3318 case Intrinsic::aarch64_neon_umull: { 3319 Value *Arg0 = II->getArgOperand(0); 3320 Value *Arg1 = II->getArgOperand(1); 3321 3322 // Handle mul by zero first: 3323 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) { 3324 return replaceInstUsesWith(CI, 
ConstantAggregateZero::get(II->getType())); 3325 } 3326 3327 // Check for constant LHS & RHS - in this case we just simplify. 3328 bool Zext = (IID == Intrinsic::arm_neon_vmullu || 3329 IID == Intrinsic::aarch64_neon_umull); 3330 VectorType *NewVT = cast<VectorType>(II->getType()); 3331 if (Constant *CV0 = dyn_cast<Constant>(Arg0)) { 3332 if (Constant *CV1 = dyn_cast<Constant>(Arg1)) { 3333 CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext); 3334 CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext); 3335 3336 return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1)); 3337 } 3338 3339 // Couldn't simplify - canonicalize constant to the RHS. 3340 std::swap(Arg0, Arg1); 3341 } 3342 3343 // Handle mul by one: 3344 if (Constant *CV1 = dyn_cast<Constant>(Arg1)) 3345 if (ConstantInt *Splat = 3346 dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) 3347 if (Splat->isOne()) 3348 return CastInst::CreateIntegerCast(Arg0, II->getType(), 3349 /*isSigned=*/!Zext); 3350 3351 break; 3352 } 3353 case Intrinsic::arm_neon_aesd: 3354 case Intrinsic::arm_neon_aese: 3355 case Intrinsic::aarch64_crypto_aesd: 3356 case Intrinsic::aarch64_crypto_aese: { 3357 Value *DataArg = II->getArgOperand(0); 3358 Value *KeyArg = II->getArgOperand(1); 3359 3360 // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR 3361 Value *Data, *Key; 3362 if (match(KeyArg, m_ZeroInt()) && 3363 match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) { 3364 replaceOperand(*II, 0, Data); 3365 replaceOperand(*II, 1, Key); 3366 return II; 3367 } 3368 break; 3369 } 3370 case Intrinsic::arm_mve_pred_i2v: { 3371 Value *Arg = II->getArgOperand(0); 3372 Value *ArgArg; 3373 if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg))) && 3374 II->getType() == ArgArg->getType()) 3375 return replaceInstUsesWith(*II, ArgArg); 3376 Constant *XorMask; 3377 if (match(Arg, 3378 m_Xor(m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg)), 3379 m_Constant(XorMask))) && 
3380 II->getType() == ArgArg->getType()) { 3381 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) { 3382 if (CI->getValue().trunc(16).isAllOnesValue()) { 3383 auto TrueVector = Builder.CreateVectorSplat( 3384 II->getType()->getVectorNumElements(), Builder.getTrue()); 3385 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); 3386 } 3387 } 3388 } 3389 KnownBits ScalarKnown(32); 3390 if (SimplifyDemandedBits(II, 0, APInt::getLowBitsSet(32, 16), 3391 ScalarKnown, 0)) 3392 return II; 3393 break; 3394 } 3395 case Intrinsic::arm_mve_pred_v2i: { 3396 Value *Arg = II->getArgOperand(0); 3397 Value *ArgArg; 3398 if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(m_Value(ArgArg)))) 3399 return replaceInstUsesWith(*II, ArgArg); 3400 if (!II->getMetadata(LLVMContext::MD_range)) { 3401 Type *IntTy32 = Type::getInt32Ty(II->getContext()); 3402 Metadata *M[] = { 3403 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), 3404 ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF)) 3405 }; 3406 II->setMetadata(LLVMContext::MD_range, MDNode::get(II->getContext(), M)); 3407 return II; 3408 } 3409 break; 3410 } 3411 case Intrinsic::arm_mve_vadc: 3412 case Intrinsic::arm_mve_vadc_predicated: { 3413 unsigned CarryOp = 3414 (II->getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; 3415 assert(II->getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && 3416 "Bad type for intrinsic!"); 3417 3418 KnownBits CarryKnown(32); 3419 if (SimplifyDemandedBits(II, CarryOp, APInt::getOneBitSet(32, 29), 3420 CarryKnown)) 3421 return II; 3422 break; 3423 } 3424 case Intrinsic::amdgcn_rcp: { 3425 Value *Src = II->getArgOperand(0); 3426 3427 // TODO: Move to ConstantFolding/InstSimplify? 
3428 if (isa<UndefValue>(Src)) 3429 return replaceInstUsesWith(CI, Src); 3430 3431 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { 3432 const APFloat &ArgVal = C->getValueAPF(); 3433 APFloat Val(ArgVal.getSemantics(), 1); 3434 APFloat::opStatus Status = Val.divide(ArgVal, 3435 APFloat::rmNearestTiesToEven); 3436 // Only do this if it was exact and therefore not dependent on the 3437 // rounding mode. 3438 if (Status == APFloat::opOK) 3439 return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val)); 3440 } 3441 3442 break; 3443 } 3444 case Intrinsic::amdgcn_rsq: { 3445 Value *Src = II->getArgOperand(0); 3446 3447 // TODO: Move to ConstantFolding/InstSimplify? 3448 if (isa<UndefValue>(Src)) 3449 return replaceInstUsesWith(CI, Src); 3450 break; 3451 } 3452 case Intrinsic::amdgcn_frexp_mant: 3453 case Intrinsic::amdgcn_frexp_exp: { 3454 Value *Src = II->getArgOperand(0); 3455 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { 3456 int Exp; 3457 APFloat Significand = frexp(C->getValueAPF(), Exp, 3458 APFloat::rmNearestTiesToEven); 3459 3460 if (IID == Intrinsic::amdgcn_frexp_mant) { 3461 return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), 3462 Significand)); 3463 } 3464 3465 // Match instruction special case behavior. 
3466 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf) 3467 Exp = 0; 3468 3469 return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp)); 3470 } 3471 3472 if (isa<UndefValue>(Src)) 3473 return replaceInstUsesWith(CI, UndefValue::get(II->getType())); 3474 3475 break; 3476 } 3477 case Intrinsic::amdgcn_class: { 3478 enum { 3479 S_NAN = 1 << 0, // Signaling NaN 3480 Q_NAN = 1 << 1, // Quiet NaN 3481 N_INFINITY = 1 << 2, // Negative infinity 3482 N_NORMAL = 1 << 3, // Negative normal 3483 N_SUBNORMAL = 1 << 4, // Negative subnormal 3484 N_ZERO = 1 << 5, // Negative zero 3485 P_ZERO = 1 << 6, // Positive zero 3486 P_SUBNORMAL = 1 << 7, // Positive subnormal 3487 P_NORMAL = 1 << 8, // Positive normal 3488 P_INFINITY = 1 << 9 // Positive infinity 3489 }; 3490 3491 const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL | 3492 N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY; 3493 3494 Value *Src0 = II->getArgOperand(0); 3495 Value *Src1 = II->getArgOperand(1); 3496 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1); 3497 if (!CMask) { 3498 if (isa<UndefValue>(Src0)) 3499 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3500 3501 if (isa<UndefValue>(Src1)) 3502 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); 3503 break; 3504 } 3505 3506 uint32_t Mask = CMask->getZExtValue(); 3507 3508 // If all tests are made, it doesn't matter what the value is. 3509 if ((Mask & FullMask) == FullMask) 3510 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true)); 3511 3512 if ((Mask & FullMask) == 0) 3513 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); 3514 3515 if (Mask == (S_NAN | Q_NAN)) { 3516 // Equivalent of isnan. Replace with standard fcmp. 3517 Value *FCmp = Builder.CreateFCmpUNO(Src0, Src0); 3518 FCmp->takeName(II); 3519 return replaceInstUsesWith(*II, FCmp); 3520 } 3521 3522 if (Mask == (N_ZERO | P_ZERO)) { 3523 // Equivalent of == 0. 
3524 Value *FCmp = Builder.CreateFCmpOEQ( 3525 Src0, ConstantFP::get(Src0->getType(), 0.0)); 3526 3527 FCmp->takeName(II); 3528 return replaceInstUsesWith(*II, FCmp); 3529 } 3530 3531 // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other 3532 if (((Mask & S_NAN) || (Mask & Q_NAN)) && isKnownNeverNaN(Src0, &TLI)) 3533 return replaceOperand(*II, 1, ConstantInt::get(Src1->getType(), 3534 Mask & ~(S_NAN | Q_NAN))); 3535 3536 const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0); 3537 if (!CVal) { 3538 if (isa<UndefValue>(Src0)) 3539 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3540 3541 // Clamp mask to used bits 3542 if ((Mask & FullMask) != Mask) { 3543 CallInst *NewCall = Builder.CreateCall(II->getCalledFunction(), 3544 { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) } 3545 ); 3546 3547 NewCall->takeName(II); 3548 return replaceInstUsesWith(*II, NewCall); 3549 } 3550 3551 break; 3552 } 3553 3554 const APFloat &Val = CVal->getValueAPF(); 3555 3556 bool Result = 3557 ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) || 3558 ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) || 3559 ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) || 3560 ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) || 3561 ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) || 3562 ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) || 3563 ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) || 3564 ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) || 3565 ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) || 3566 ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative()); 3567 3568 return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result)); 3569 } 3570 case Intrinsic::amdgcn_cvt_pkrtz: { 3571 Value *Src0 = II->getArgOperand(0); 3572 Value *Src1 = II->getArgOperand(1); 3573 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { 3574 if (const ConstantFP *C1 = 
dyn_cast<ConstantFP>(Src1)) { 3575 const fltSemantics &HalfSem 3576 = II->getType()->getScalarType()->getFltSemantics(); 3577 bool LosesInfo; 3578 APFloat Val0 = C0->getValueAPF(); 3579 APFloat Val1 = C1->getValueAPF(); 3580 Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); 3581 Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); 3582 3583 Constant *Folded = ConstantVector::get({ 3584 ConstantFP::get(II->getContext(), Val0), 3585 ConstantFP::get(II->getContext(), Val1) }); 3586 return replaceInstUsesWith(*II, Folded); 3587 } 3588 } 3589 3590 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) 3591 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3592 3593 break; 3594 } 3595 case Intrinsic::amdgcn_cvt_pknorm_i16: 3596 case Intrinsic::amdgcn_cvt_pknorm_u16: 3597 case Intrinsic::amdgcn_cvt_pk_i16: 3598 case Intrinsic::amdgcn_cvt_pk_u16: { 3599 Value *Src0 = II->getArgOperand(0); 3600 Value *Src1 = II->getArgOperand(1); 3601 3602 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) 3603 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 3604 3605 break; 3606 } 3607 case Intrinsic::amdgcn_ubfe: 3608 case Intrinsic::amdgcn_sbfe: { 3609 // Decompose simple cases into standard shifts. 3610 Value *Src = II->getArgOperand(0); 3611 if (isa<UndefValue>(Src)) 3612 return replaceInstUsesWith(*II, Src); 3613 3614 unsigned Width; 3615 Type *Ty = II->getType(); 3616 unsigned IntSize = Ty->getIntegerBitWidth(); 3617 3618 ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2)); 3619 if (CWidth) { 3620 Width = CWidth->getZExtValue(); 3621 if ((Width & (IntSize - 1)) == 0) 3622 return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty)); 3623 3624 // Hardware ignores high bits, so remove those. 
3625 if (Width >= IntSize) 3626 return replaceOperand(*II, 2, ConstantInt::get(CWidth->getType(), 3627 Width & (IntSize - 1))); 3628 } 3629 3630 unsigned Offset; 3631 ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1)); 3632 if (COffset) { 3633 Offset = COffset->getZExtValue(); 3634 if (Offset >= IntSize) 3635 return replaceOperand(*II, 1, ConstantInt::get(COffset->getType(), 3636 Offset & (IntSize - 1))); 3637 } 3638 3639 bool Signed = IID == Intrinsic::amdgcn_sbfe; 3640 3641 if (!CWidth || !COffset) 3642 break; 3643 3644 // The case of Width == 0 is handled above, which makes this tranformation 3645 // safe. If Width == 0, then the ashr and lshr instructions become poison 3646 // value since the shift amount would be equal to the bit size. 3647 assert(Width != 0); 3648 3649 // TODO: This allows folding to undef when the hardware has specific 3650 // behavior? 3651 if (Offset + Width < IntSize) { 3652 Value *Shl = Builder.CreateShl(Src, IntSize - Offset - Width); 3653 Value *RightShift = Signed ? Builder.CreateAShr(Shl, IntSize - Width) 3654 : Builder.CreateLShr(Shl, IntSize - Width); 3655 RightShift->takeName(II); 3656 return replaceInstUsesWith(*II, RightShift); 3657 } 3658 3659 Value *RightShift = Signed ? Builder.CreateAShr(Src, Offset) 3660 : Builder.CreateLShr(Src, Offset); 3661 3662 RightShift->takeName(II); 3663 return replaceInstUsesWith(*II, RightShift); 3664 } 3665 case Intrinsic::amdgcn_exp: 3666 case Intrinsic::amdgcn_exp_compr: { 3667 ConstantInt *En = cast<ConstantInt>(II->getArgOperand(1)); 3668 unsigned EnBits = En->getZExtValue(); 3669 if (EnBits == 0xf) 3670 break; // All inputs enabled. 3671 3672 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr; 3673 bool Changed = false; 3674 for (int I = 0; I < (IsCompr ? 
2 : 4); ++I) { 3675 if ((!IsCompr && (EnBits & (1 << I)) == 0) || 3676 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) { 3677 Value *Src = II->getArgOperand(I + 2); 3678 if (!isa<UndefValue>(Src)) { 3679 replaceOperand(*II, I + 2, UndefValue::get(Src->getType())); 3680 Changed = true; 3681 } 3682 } 3683 } 3684 3685 if (Changed) 3686 return II; 3687 3688 break; 3689 } 3690 case Intrinsic::amdgcn_fmed3: { 3691 // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled 3692 // for the shader. 3693 3694 Value *Src0 = II->getArgOperand(0); 3695 Value *Src1 = II->getArgOperand(1); 3696 Value *Src2 = II->getArgOperand(2); 3697 3698 // Checking for NaN before canonicalization provides better fidelity when 3699 // mapping other operations onto fmed3 since the order of operands is 3700 // unchanged. 3701 CallInst *NewCall = nullptr; 3702 if (match(Src0, m_NaN()) || isa<UndefValue>(Src0)) { 3703 NewCall = Builder.CreateMinNum(Src1, Src2); 3704 } else if (match(Src1, m_NaN()) || isa<UndefValue>(Src1)) { 3705 NewCall = Builder.CreateMinNum(Src0, Src2); 3706 } else if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) { 3707 NewCall = Builder.CreateMaxNum(Src0, Src1); 3708 } 3709 3710 if (NewCall) { 3711 NewCall->copyFastMathFlags(II); 3712 NewCall->takeName(II); 3713 return replaceInstUsesWith(*II, NewCall); 3714 } 3715 3716 bool Swap = false; 3717 // Canonicalize constants to RHS operands. 
3718 // 3719 // fmed3(c0, x, c1) -> fmed3(x, c0, c1) 3720 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 3721 std::swap(Src0, Src1); 3722 Swap = true; 3723 } 3724 3725 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) { 3726 std::swap(Src1, Src2); 3727 Swap = true; 3728 } 3729 3730 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { 3731 std::swap(Src0, Src1); 3732 Swap = true; 3733 } 3734 3735 if (Swap) { 3736 II->setArgOperand(0, Src0); 3737 II->setArgOperand(1, Src1); 3738 II->setArgOperand(2, Src2); 3739 return II; 3740 } 3741 3742 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) { 3743 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) { 3744 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) { 3745 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(), 3746 C2->getValueAPF()); 3747 return replaceInstUsesWith(*II, 3748 ConstantFP::get(Builder.getContext(), Result)); 3749 } 3750 } 3751 } 3752 3753 break; 3754 } 3755 case Intrinsic::amdgcn_icmp: 3756 case Intrinsic::amdgcn_fcmp: { 3757 const ConstantInt *CC = cast<ConstantInt>(II->getArgOperand(2)); 3758 // Guard against invalid arguments. 
3759 int64_t CCVal = CC->getZExtValue(); 3760 bool IsInteger = IID == Intrinsic::amdgcn_icmp; 3761 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE || 3762 CCVal > CmpInst::LAST_ICMP_PREDICATE)) || 3763 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE || 3764 CCVal > CmpInst::LAST_FCMP_PREDICATE))) 3765 break; 3766 3767 Value *Src0 = II->getArgOperand(0); 3768 Value *Src1 = II->getArgOperand(1); 3769 3770 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) { 3771 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) { 3772 Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1); 3773 if (CCmp->isNullValue()) { 3774 return replaceInstUsesWith( 3775 *II, ConstantExpr::getSExt(CCmp, II->getType())); 3776 } 3777 3778 // The result of V_ICMP/V_FCMP assembly instructions (which this 3779 // intrinsic exposes) is one bit per thread, masked with the EXEC 3780 // register (which contains the bitmask of live threads). So a 3781 // comparison that always returns true is the same as a read of the 3782 // EXEC register. 3783 Function *NewF = Intrinsic::getDeclaration( 3784 II->getModule(), Intrinsic::read_register, II->getType()); 3785 Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")}; 3786 MDNode *MD = MDNode::get(II->getContext(), MDArgs); 3787 Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)}; 3788 CallInst *NewCall = Builder.CreateCall(NewF, Args); 3789 NewCall->addAttribute(AttributeList::FunctionIndex, 3790 Attribute::Convergent); 3791 NewCall->takeName(II); 3792 return replaceInstUsesWith(*II, NewCall); 3793 } 3794 3795 // Canonicalize constants to RHS. 
3796 CmpInst::Predicate SwapPred 3797 = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal)); 3798 II->setArgOperand(0, Src1); 3799 II->setArgOperand(1, Src0); 3800 II->setArgOperand(2, ConstantInt::get(CC->getType(), 3801 static_cast<int>(SwapPred))); 3802 return II; 3803 } 3804 3805 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE) 3806 break; 3807 3808 // Canonicalize compare eq with true value to compare != 0 3809 // llvm.amdgcn.icmp(zext (i1 x), 1, eq) 3810 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne) 3811 // llvm.amdgcn.icmp(sext (i1 x), -1, eq) 3812 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne) 3813 Value *ExtSrc; 3814 if (CCVal == CmpInst::ICMP_EQ && 3815 ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) || 3816 (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) && 3817 ExtSrc->getType()->isIntegerTy(1)) { 3818 replaceOperand(*II, 1, ConstantInt::getNullValue(Src1->getType())); 3819 replaceOperand(*II, 2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE)); 3820 return II; 3821 } 3822 3823 CmpInst::Predicate SrcPred; 3824 Value *SrcLHS; 3825 Value *SrcRHS; 3826 3827 // Fold compare eq/ne with 0 from a compare result as the predicate to the 3828 // intrinsic. The typical use is a wave vote function in the library, which 3829 // will be fed from a user code condition compared with 0. Fold in the 3830 // redundant compare. 3831 3832 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne) 3833 // -> llvm.amdgcn.[if]cmp(a, b, pred) 3834 // 3835 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq) 3836 // -> llvm.amdgcn.[if]cmp(a, b, inv pred) 3837 if (match(Src1, m_Zero()) && 3838 match(Src0, 3839 m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) { 3840 if (CCVal == CmpInst::ICMP_EQ) 3841 SrcPred = CmpInst::getInversePredicate(SrcPred); 3842 3843 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ? 
3844 Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp; 3845 3846 Type *Ty = SrcLHS->getType(); 3847 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) { 3848 // Promote to next legal integer type. 3849 unsigned Width = CmpType->getBitWidth(); 3850 unsigned NewWidth = Width; 3851 3852 // Don't do anything for i1 comparisons. 3853 if (Width == 1) 3854 break; 3855 3856 if (Width <= 16) 3857 NewWidth = 16; 3858 else if (Width <= 32) 3859 NewWidth = 32; 3860 else if (Width <= 64) 3861 NewWidth = 64; 3862 else if (Width > 64) 3863 break; // Can't handle this. 3864 3865 if (Width != NewWidth) { 3866 IntegerType *CmpTy = Builder.getIntNTy(NewWidth); 3867 if (CmpInst::isSigned(SrcPred)) { 3868 SrcLHS = Builder.CreateSExt(SrcLHS, CmpTy); 3869 SrcRHS = Builder.CreateSExt(SrcRHS, CmpTy); 3870 } else { 3871 SrcLHS = Builder.CreateZExt(SrcLHS, CmpTy); 3872 SrcRHS = Builder.CreateZExt(SrcRHS, CmpTy); 3873 } 3874 } 3875 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy()) 3876 break; 3877 3878 Function *NewF = 3879 Intrinsic::getDeclaration(II->getModule(), NewIID, 3880 { II->getType(), 3881 SrcLHS->getType() }); 3882 Value *Args[] = { SrcLHS, SrcRHS, 3883 ConstantInt::get(CC->getType(), SrcPred) }; 3884 CallInst *NewCall = Builder.CreateCall(NewF, Args); 3885 NewCall->takeName(II); 3886 return replaceInstUsesWith(*II, NewCall); 3887 } 3888 3889 break; 3890 } 3891 case Intrinsic::amdgcn_wqm_vote: { 3892 // wqm_vote is identity when the argument is constant. 
3893 if (!isa<Constant>(II->getArgOperand(0))) 3894 break; 3895 3896 return replaceInstUsesWith(*II, II->getArgOperand(0)); 3897 } 3898 case Intrinsic::amdgcn_kill: { 3899 const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0)); 3900 if (!C || !C->getZExtValue()) 3901 break; 3902 3903 // amdgcn.kill(i1 1) is a no-op 3904 return eraseInstFromFunction(CI); 3905 } 3906 case Intrinsic::amdgcn_update_dpp: { 3907 Value *Old = II->getArgOperand(0); 3908 3909 auto BC = cast<ConstantInt>(II->getArgOperand(5)); 3910 auto RM = cast<ConstantInt>(II->getArgOperand(3)); 3911 auto BM = cast<ConstantInt>(II->getArgOperand(4)); 3912 if (BC->isZeroValue() || 3913 RM->getZExtValue() != 0xF || 3914 BM->getZExtValue() != 0xF || 3915 isa<UndefValue>(Old)) 3916 break; 3917 3918 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value. 3919 return replaceOperand(*II, 0, UndefValue::get(Old->getType())); 3920 } 3921 case Intrinsic::amdgcn_permlane16: 3922 case Intrinsic::amdgcn_permlanex16: { 3923 // Discard vdst_in if it's not going to be read. 3924 Value *VDstIn = II->getArgOperand(0); 3925 if (isa<UndefValue>(VDstIn)) 3926 break; 3927 3928 ConstantInt *FetchInvalid = cast<ConstantInt>(II->getArgOperand(4)); 3929 ConstantInt *BoundCtrl = cast<ConstantInt>(II->getArgOperand(5)); 3930 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue()) 3931 break; 3932 3933 return replaceOperand(*II, 0, UndefValue::get(VDstIn->getType())); 3934 } 3935 case Intrinsic::amdgcn_readfirstlane: 3936 case Intrinsic::amdgcn_readlane: { 3937 // A constant value is trivially uniform. 3938 if (Constant *C = dyn_cast<Constant>(II->getArgOperand(0))) 3939 return replaceInstUsesWith(*II, C); 3940 3941 // The rest of these may not be safe if the exec may not be the same between 3942 // the def and use. 
3943 Value *Src = II->getArgOperand(0); 3944 Instruction *SrcInst = dyn_cast<Instruction>(Src); 3945 if (SrcInst && SrcInst->getParent() != II->getParent()) 3946 break; 3947 3948 // readfirstlane (readfirstlane x) -> readfirstlane x 3949 // readlane (readfirstlane x), y -> readfirstlane x 3950 if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) 3951 return replaceInstUsesWith(*II, Src); 3952 3953 if (IID == Intrinsic::amdgcn_readfirstlane) { 3954 // readfirstlane (readlane x, y) -> readlane x, y 3955 if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>())) 3956 return replaceInstUsesWith(*II, Src); 3957 } else { 3958 // readlane (readlane x, y), y -> readlane x, y 3959 if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>( 3960 m_Value(), m_Specific(II->getArgOperand(1))))) 3961 return replaceInstUsesWith(*II, Src); 3962 } 3963 3964 break; 3965 } 3966 case Intrinsic::hexagon_V6_vandvrt: 3967 case Intrinsic::hexagon_V6_vandvrt_128B: { 3968 // Simplify Q -> V -> Q conversion. 3969 if (auto Op0 = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { 3970 Intrinsic::ID ID0 = Op0->getIntrinsicID(); 3971 if (ID0 != Intrinsic::hexagon_V6_vandqrt && 3972 ID0 != Intrinsic::hexagon_V6_vandqrt_128B) 3973 break; 3974 Value *Bytes = Op0->getArgOperand(1), *Mask = II->getArgOperand(1); 3975 uint64_t Bytes1 = computeKnownBits(Bytes, 0, Op0).One.getZExtValue(); 3976 uint64_t Mask1 = computeKnownBits(Mask, 0, II).One.getZExtValue(); 3977 // Check if every byte has common bits in Bytes and Mask. 3978 uint64_t C = Bytes1 & Mask1; 3979 if ((C & 0xFF) && (C & 0xFF00) && (C & 0xFF0000) && (C & 0xFF000000)) 3980 return replaceInstUsesWith(*II, Op0->getArgOperand(0)); 3981 } 3982 break; 3983 } 3984 case Intrinsic::stackrestore: { 3985 // If the save is right next to the restore, remove the restore. This can 3986 // happen when variable allocas are DCE'd. 
3987 if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { 3988 if (SS->getIntrinsicID() == Intrinsic::stacksave) { 3989 // Skip over debug info. 3990 if (SS->getNextNonDebugInstruction() == II) { 3991 return eraseInstFromFunction(CI); 3992 } 3993 } 3994 } 3995 3996 // Scan down this block to see if there is another stack restore in the 3997 // same block without an intervening call/alloca. 3998 BasicBlock::iterator BI(II); 3999 Instruction *TI = II->getParent()->getTerminator(); 4000 bool CannotRemove = false; 4001 for (++BI; &*BI != TI; ++BI) { 4002 if (isa<AllocaInst>(BI)) { 4003 CannotRemove = true; 4004 break; 4005 } 4006 if (CallInst *BCI = dyn_cast<CallInst>(BI)) { 4007 if (auto *II2 = dyn_cast<IntrinsicInst>(BCI)) { 4008 // If there is a stackrestore below this one, remove this one. 4009 if (II2->getIntrinsicID() == Intrinsic::stackrestore) 4010 return eraseInstFromFunction(CI); 4011 4012 // Bail if we cross over an intrinsic with side effects, such as 4013 // llvm.stacksave, or llvm.read_register. 4014 if (II2->mayHaveSideEffects()) { 4015 CannotRemove = true; 4016 break; 4017 } 4018 } else { 4019 // If we found a non-intrinsic call, we can't remove the stack 4020 // restore. 4021 CannotRemove = true; 4022 break; 4023 } 4024 } 4025 } 4026 4027 // If the stack restore is in a return, resume, or unwind block and if there 4028 // are no allocas or calls between the restore and the return, nuke the 4029 // restore. 4030 if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI))) 4031 return eraseInstFromFunction(CI); 4032 break; 4033 } 4034 case Intrinsic::lifetime_end: 4035 // Asan needs to poison memory to detect invalid access which is possible 4036 // even for empty lifetime range. 
4037 if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) || 4038 II->getFunction()->hasFnAttribute(Attribute::SanitizeMemory) || 4039 II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress)) 4040 break; 4041 4042 if (removeTriviallyEmptyRange(*II, *this, [](const IntrinsicInst &I) { 4043 return I.getIntrinsicID() == Intrinsic::lifetime_start; 4044 })) 4045 return nullptr; 4046 break; 4047 case Intrinsic::assume: { 4048 Value *IIOperand = II->getArgOperand(0); 4049 // Remove an assume if it is followed by an identical assume. 4050 // TODO: Do we need this? Unless there are conflicting assumptions, the 4051 // computeKnownBits(IIOperand) below here eliminates redundant assumes. 4052 Instruction *Next = II->getNextNonDebugInstruction(); 4053 if (match(Next, m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand)))) 4054 return eraseInstFromFunction(CI); 4055 4056 // Canonicalize assume(a && b) -> assume(a); assume(b); 4057 // Note: New assumption intrinsics created here are registered by 4058 // the InstCombineIRInserter object. 
4059 FunctionType *AssumeIntrinsicTy = II->getFunctionType(); 4060 Value *AssumeIntrinsic = II->getCalledValue(); 4061 Value *A, *B; 4062 if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) { 4063 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, II->getName()); 4064 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, B, II->getName()); 4065 return eraseInstFromFunction(*II); 4066 } 4067 // assume(!(a || b)) -> assume(!a); assume(!b); 4068 if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) { 4069 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, 4070 Builder.CreateNot(A), II->getName()); 4071 Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, 4072 Builder.CreateNot(B), II->getName()); 4073 return eraseInstFromFunction(*II); 4074 } 4075 4076 // assume( (load addr) != null ) -> add 'nonnull' metadata to load 4077 // (if assume is valid at the load) 4078 CmpInst::Predicate Pred; 4079 Instruction *LHS; 4080 if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) && 4081 Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load && 4082 LHS->getType()->isPointerTy() && 4083 isValidAssumeForContext(II, LHS, &DT)) { 4084 MDNode *MD = MDNode::get(II->getContext(), None); 4085 LHS->setMetadata(LLVMContext::MD_nonnull, MD); 4086 return eraseInstFromFunction(*II); 4087 4088 // TODO: apply nonnull return attributes to calls and invokes 4089 // TODO: apply range metadata for range check patterns? 4090 } 4091 4092 // If there is a dominating assume with the same condition as this one, 4093 // then this one is redundant, and should be removed. 4094 KnownBits Known(1); 4095 computeKnownBits(IIOperand, Known, 0, II); 4096 if (Known.isAllOnes()) 4097 return eraseInstFromFunction(*II); 4098 4099 // Update the cache of affected values for this assumption (we might be 4100 // here because we just simplified the condition). 
4101 AC.updateAffectedValues(II); 4102 break; 4103 } 4104 case Intrinsic::experimental_gc_relocate: { 4105 auto &GCR = *cast<GCRelocateInst>(II); 4106 4107 // If we have two copies of the same pointer in the statepoint argument 4108 // list, canonicalize to one. This may let us common gc.relocates. 4109 if (GCR.getBasePtr() == GCR.getDerivedPtr() && 4110 GCR.getBasePtrIndex() != GCR.getDerivedPtrIndex()) { 4111 auto *OpIntTy = GCR.getOperand(2)->getType(); 4112 return replaceOperand(*II, 2, 4113 ConstantInt::get(OpIntTy, GCR.getBasePtrIndex())); 4114 } 4115 4116 // Translate facts known about a pointer before relocating into 4117 // facts about the relocate value, while being careful to 4118 // preserve relocation semantics. 4119 Value *DerivedPtr = GCR.getDerivedPtr(); 4120 4121 // Remove the relocation if unused, note that this check is required 4122 // to prevent the cases below from looping forever. 4123 if (II->use_empty()) 4124 return eraseInstFromFunction(*II); 4125 4126 // Undef is undef, even after relocation. 4127 // TODO: provide a hook for this in GCStrategy. This is clearly legal for 4128 // most practical collectors, but there was discussion in the review thread 4129 // about whether it was legal for all possible collectors. 4130 if (isa<UndefValue>(DerivedPtr)) 4131 // Use undef of gc_relocate's type to replace it. 4132 return replaceInstUsesWith(*II, UndefValue::get(II->getType())); 4133 4134 if (auto *PT = dyn_cast<PointerType>(II->getType())) { 4135 // The relocation of null will be null for most any collector. 4136 // TODO: provide a hook for this in GCStrategy. There might be some 4137 // weird collector this property does not hold for. 4138 if (isa<ConstantPointerNull>(DerivedPtr)) 4139 // Use null-pointer of gc_relocate's type to replace it. 
4140 return replaceInstUsesWith(*II, ConstantPointerNull::get(PT)); 4141 4142 // isKnownNonNull -> nonnull attribute 4143 if (!II->hasRetAttr(Attribute::NonNull) && 4144 isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT)) { 4145 II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); 4146 return II; 4147 } 4148 } 4149 4150 // TODO: bitcast(relocate(p)) -> relocate(bitcast(p)) 4151 // Canonicalize on the type from the uses to the defs 4152 4153 // TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...) 4154 break; 4155 } 4156 4157 case Intrinsic::experimental_guard: { 4158 // Is this guard followed by another guard? We scan forward over a small 4159 // fixed window of instructions to handle common cases with conditions 4160 // computed between guards. 4161 Instruction *NextInst = II->getNextNonDebugInstruction(); 4162 for (unsigned i = 0; i < GuardWideningWindow; i++) { 4163 // Note: Using context-free form to avoid compile time blow up 4164 if (!isSafeToSpeculativelyExecute(NextInst)) 4165 break; 4166 NextInst = NextInst->getNextNonDebugInstruction(); 4167 } 4168 Value *NextCond = nullptr; 4169 if (match(NextInst, 4170 m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) { 4171 Value *CurrCond = II->getArgOperand(0); 4172 4173 // Remove a guard that it is immediately preceded by an identical guard. 4174 // Otherwise canonicalize guard(a); guard(b) -> guard(a & b). 
4175 if (CurrCond != NextCond) { 4176 Instruction *MoveI = II->getNextNonDebugInstruction(); 4177 while (MoveI != NextInst) { 4178 auto *Temp = MoveI; 4179 MoveI = MoveI->getNextNonDebugInstruction(); 4180 Temp->moveBefore(II); 4181 } 4182 replaceOperand(*II, 0, Builder.CreateAnd(CurrCond, NextCond)); 4183 } 4184 eraseInstFromFunction(*NextInst); 4185 return II; 4186 } 4187 break; 4188 } 4189 } 4190 return visitCallBase(*II); 4191 } 4192 4193 // Fence instruction simplification 4194 Instruction *InstCombiner::visitFenceInst(FenceInst &FI) { 4195 // Remove identical consecutive fences. 4196 Instruction *Next = FI.getNextNonDebugInstruction(); 4197 if (auto *NFI = dyn_cast<FenceInst>(Next)) 4198 if (FI.isIdenticalTo(NFI)) 4199 return eraseInstFromFunction(FI); 4200 return nullptr; 4201 } 4202 4203 // InvokeInst simplification 4204 Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) { 4205 return visitCallBase(II); 4206 } 4207 4208 // CallBrInst simplification 4209 Instruction *InstCombiner::visitCallBrInst(CallBrInst &CBI) { 4210 return visitCallBase(CBI); 4211 } 4212 4213 /// If this cast does not affect the value passed through the varargs area, we 4214 /// can eliminate the use of the cast. 4215 static bool isSafeToEliminateVarargsCast(const CallBase &Call, 4216 const DataLayout &DL, 4217 const CastInst *const CI, 4218 const int ix) { 4219 if (!CI->isLosslessCast()) 4220 return false; 4221 4222 // If this is a GC intrinsic, avoid munging types. We need types for 4223 // statepoint reconstruction in SelectionDAG. 4224 // TODO: This is probably something which should be expanded to all 4225 // intrinsics since the entire point of intrinsics is that 4226 // they are understandable by the optimizer. 4227 if (isStatepoint(&Call) || isGCRelocate(&Call) || isGCResult(&Call)) 4228 return false; 4229 4230 // The size of ByVal or InAlloca arguments is derived from the type, so we 4231 // can't change to a type with a different size. 
If the size were 4232 // passed explicitly we could avoid this check. 4233 if (!Call.isByValOrInAllocaArgument(ix)) 4234 return true; 4235 4236 Type* SrcTy = 4237 cast<PointerType>(CI->getOperand(0)->getType())->getElementType(); 4238 Type *DstTy = Call.isByValArgument(ix) 4239 ? Call.getParamByValType(ix) 4240 : cast<PointerType>(CI->getType())->getElementType(); 4241 if (!SrcTy->isSized() || !DstTy->isSized()) 4242 return false; 4243 if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy)) 4244 return false; 4245 return true; 4246 } 4247 4248 Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) { 4249 if (!CI->getCalledFunction()) return nullptr; 4250 4251 auto InstCombineRAUW = [this](Instruction *From, Value *With) { 4252 replaceInstUsesWith(*From, With); 4253 }; 4254 auto InstCombineErase = [this](Instruction *I) { 4255 eraseInstFromFunction(*I); 4256 }; 4257 LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW, 4258 InstCombineErase); 4259 if (Value *With = Simplifier.optimizeCall(CI, Builder)) { 4260 ++NumSimplified; 4261 return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With); 4262 } 4263 4264 return nullptr; 4265 } 4266 4267 static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) { 4268 // Strip off at most one level of pointer casts, looking for an alloca. This 4269 // is good enough in practice and simpler than handling any number of casts. 4270 Value *Underlying = TrampMem->stripPointerCasts(); 4271 if (Underlying != TrampMem && 4272 (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem)) 4273 return nullptr; 4274 if (!isa<AllocaInst>(Underlying)) 4275 return nullptr; 4276 4277 IntrinsicInst *InitTrampoline = nullptr; 4278 for (User *U : TrampMem->users()) { 4279 IntrinsicInst *II = dyn_cast<IntrinsicInst>(U); 4280 if (!II) 4281 return nullptr; 4282 if (II->getIntrinsicID() == Intrinsic::init_trampoline) { 4283 if (InitTrampoline) 4284 // More than one init_trampoline writes to this value. Give up. 
4285 return nullptr; 4286 InitTrampoline = II; 4287 continue; 4288 } 4289 if (II->getIntrinsicID() == Intrinsic::adjust_trampoline) 4290 // Allow any number of calls to adjust.trampoline. 4291 continue; 4292 return nullptr; 4293 } 4294 4295 // No call to init.trampoline found. 4296 if (!InitTrampoline) 4297 return nullptr; 4298 4299 // Check that the alloca is being used in the expected way. 4300 if (InitTrampoline->getOperand(0) != TrampMem) 4301 return nullptr; 4302 4303 return InitTrampoline; 4304 } 4305 4306 static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp, 4307 Value *TrampMem) { 4308 // Visit all the previous instructions in the basic block, and try to find a 4309 // init.trampoline which has a direct path to the adjust.trampoline. 4310 for (BasicBlock::iterator I = AdjustTramp->getIterator(), 4311 E = AdjustTramp->getParent()->begin(); 4312 I != E;) { 4313 Instruction *Inst = &*--I; 4314 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) 4315 if (II->getIntrinsicID() == Intrinsic::init_trampoline && 4316 II->getOperand(0) == TrampMem) 4317 return II; 4318 if (Inst->mayWriteToMemory()) 4319 return nullptr; 4320 } 4321 return nullptr; 4322 } 4323 4324 // Given a call to llvm.adjust.trampoline, find and return the corresponding 4325 // call to llvm.init.trampoline if the call to the trampoline can be optimized 4326 // to a direct call to a function. Otherwise return NULL. 
4327 static IntrinsicInst *findInitTrampoline(Value *Callee) { 4328 Callee = Callee->stripPointerCasts(); 4329 IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee); 4330 if (!AdjustTramp || 4331 AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline) 4332 return nullptr; 4333 4334 Value *TrampMem = AdjustTramp->getOperand(0); 4335 4336 if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem)) 4337 return IT; 4338 if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem)) 4339 return IT; 4340 return nullptr; 4341 } 4342 4343 static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) { 4344 unsigned NumArgs = Call.getNumArgOperands(); 4345 ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0)); 4346 ConstantInt *Op1C = 4347 (NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1)); 4348 // Bail out if the allocation size is zero. 4349 if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue())) 4350 return; 4351 4352 if (isMallocLikeFn(&Call, TLI) && Op0C) { 4353 if (isOpNewLikeFn(&Call, TLI)) 4354 Call.addAttribute(AttributeList::ReturnIndex, 4355 Attribute::getWithDereferenceableBytes( 4356 Call.getContext(), Op0C->getZExtValue())); 4357 else 4358 Call.addAttribute(AttributeList::ReturnIndex, 4359 Attribute::getWithDereferenceableOrNullBytes( 4360 Call.getContext(), Op0C->getZExtValue())); 4361 } else if (isReallocLikeFn(&Call, TLI) && Op1C) { 4362 Call.addAttribute(AttributeList::ReturnIndex, 4363 Attribute::getWithDereferenceableOrNullBytes( 4364 Call.getContext(), Op1C->getZExtValue())); 4365 } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) { 4366 bool Overflow; 4367 const APInt &N = Op0C->getValue(); 4368 APInt Size = N.umul_ov(Op1C->getValue(), Overflow); 4369 if (!Overflow) 4370 Call.addAttribute(AttributeList::ReturnIndex, 4371 Attribute::getWithDereferenceableOrNullBytes( 4372 Call.getContext(), Size.getZExtValue())); 4373 } else if (isStrdupLikeFn(&Call, TLI)) { 4374 
uint64_t Len = GetStringLength(Call.getOperand(0)); 4375 if (Len) { 4376 // strdup 4377 if (NumArgs == 1) 4378 Call.addAttribute(AttributeList::ReturnIndex, 4379 Attribute::getWithDereferenceableOrNullBytes( 4380 Call.getContext(), Len)); 4381 // strndup 4382 else if (NumArgs == 2 && Op1C) 4383 Call.addAttribute( 4384 AttributeList::ReturnIndex, 4385 Attribute::getWithDereferenceableOrNullBytes( 4386 Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1))); 4387 } 4388 } 4389 } 4390 4391 /// Improvements for call, callbr and invoke instructions. 4392 Instruction *InstCombiner::visitCallBase(CallBase &Call) { 4393 if (isAllocationFn(&Call, &TLI)) 4394 annotateAnyAllocSite(Call, &TLI); 4395 4396 bool Changed = false; 4397 4398 // Mark any parameters that are known to be non-null with the nonnull 4399 // attribute. This is helpful for inlining calls to functions with null 4400 // checks on their arguments. 4401 SmallVector<unsigned, 4> ArgNos; 4402 unsigned ArgNo = 0; 4403 4404 for (Value *V : Call.args()) { 4405 if (V->getType()->isPointerTy() && 4406 !Call.paramHasAttr(ArgNo, Attribute::NonNull) && 4407 isKnownNonZero(V, DL, 0, &AC, &Call, &DT)) 4408 ArgNos.push_back(ArgNo); 4409 ArgNo++; 4410 } 4411 4412 assert(ArgNo == Call.arg_size() && "sanity check"); 4413 4414 if (!ArgNos.empty()) { 4415 AttributeList AS = Call.getAttributes(); 4416 LLVMContext &Ctx = Call.getContext(); 4417 AS = AS.addParamAttribute(Ctx, ArgNos, 4418 Attribute::get(Ctx, Attribute::NonNull)); 4419 Call.setAttributes(AS); 4420 Changed = true; 4421 } 4422 4423 // If the callee is a pointer to a function, attempt to move any casts to the 4424 // arguments of the call/callbr/invoke. 4425 Value *Callee = Call.getCalledValue(); 4426 if (!isa<Function>(Callee) && transformConstExprCastCall(Call)) 4427 return nullptr; 4428 4429 if (Function *CalleeF = dyn_cast<Function>(Callee)) { 4430 // Remove the convergent attr on calls when the callee is not convergent. 
4431 if (Call.isConvergent() && !CalleeF->isConvergent() && 4432 !CalleeF->isIntrinsic()) { 4433 LLVM_DEBUG(dbgs() << "Removing convergent attr from instr " << Call 4434 << "\n"); 4435 Call.setNotConvergent(); 4436 return &Call; 4437 } 4438 4439 // If the call and callee calling conventions don't match, this call must 4440 // be unreachable, as the call is undefined. 4441 if (CalleeF->getCallingConv() != Call.getCallingConv() && 4442 // Only do this for calls to a function with a body. A prototype may 4443 // not actually end up matching the implementation's calling conv for a 4444 // variety of reasons (e.g. it may be written in assembly). 4445 !CalleeF->isDeclaration()) { 4446 Instruction *OldCall = &Call; 4447 CreateNonTerminatorUnreachable(OldCall); 4448 // If OldCall does not return void then replaceAllUsesWith undef. 4449 // This allows ValueHandlers and custom metadata to adjust itself. 4450 if (!OldCall->getType()->isVoidTy()) 4451 replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType())); 4452 if (isa<CallInst>(OldCall)) 4453 return eraseInstFromFunction(*OldCall); 4454 4455 // We cannot remove an invoke or a callbr, because it would change thexi 4456 // CFG, just change the callee to a null pointer. 4457 cast<CallBase>(OldCall)->setCalledFunction( 4458 CalleeF->getFunctionType(), 4459 Constant::getNullValue(CalleeF->getType())); 4460 return nullptr; 4461 } 4462 } 4463 4464 if ((isa<ConstantPointerNull>(Callee) && 4465 !NullPointerIsDefined(Call.getFunction())) || 4466 isa<UndefValue>(Callee)) { 4467 // If Call does not return void then replaceAllUsesWith undef. 4468 // This allows ValueHandlers and custom metadata to adjust itself. 4469 if (!Call.getType()->isVoidTy()) 4470 replaceInstUsesWith(Call, UndefValue::get(Call.getType())); 4471 4472 if (Call.isTerminator()) { 4473 // Can't remove an invoke or callbr because we cannot change the CFG. 4474 return nullptr; 4475 } 4476 4477 // This instruction is not reachable, just remove it. 
4478 CreateNonTerminatorUnreachable(&Call); 4479 return eraseInstFromFunction(Call); 4480 } 4481 4482 if (IntrinsicInst *II = findInitTrampoline(Callee)) 4483 return transformCallThroughTrampoline(Call, *II); 4484 4485 PointerType *PTy = cast<PointerType>(Callee->getType()); 4486 FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); 4487 if (FTy->isVarArg()) { 4488 int ix = FTy->getNumParams(); 4489 // See if we can optimize any arguments passed through the varargs area of 4490 // the call. 4491 for (auto I = Call.arg_begin() + FTy->getNumParams(), E = Call.arg_end(); 4492 I != E; ++I, ++ix) { 4493 CastInst *CI = dyn_cast<CastInst>(*I); 4494 if (CI && isSafeToEliminateVarargsCast(Call, DL, CI, ix)) { 4495 *I = CI->getOperand(0); 4496 4497 // Update the byval type to match the argument type. 4498 if (Call.isByValArgument(ix)) { 4499 Call.removeParamAttr(ix, Attribute::ByVal); 4500 Call.addParamAttr( 4501 ix, Attribute::getWithByValType( 4502 Call.getContext(), 4503 CI->getOperand(0)->getType()->getPointerElementType())); 4504 } 4505 Changed = true; 4506 } 4507 } 4508 } 4509 4510 if (isa<InlineAsm>(Callee) && !Call.doesNotThrow()) { 4511 // Inline asm calls cannot throw - mark them 'nounwind'. 4512 Call.setDoesNotThrow(); 4513 Changed = true; 4514 } 4515 4516 // Try to optimize the call if possible, we require DataLayout for most of 4517 // this. None of these calls are seen as possibly dead so go ahead and 4518 // delete the instruction now. 4519 if (CallInst *CI = dyn_cast<CallInst>(&Call)) { 4520 Instruction *I = tryOptimizeCall(CI); 4521 // If we changed something return the result, etc. Otherwise let 4522 // the fallthrough check. 4523 if (I) return eraseInstFromFunction(*I); 4524 } 4525 4526 if (isAllocLikeFn(&Call, &TLI)) 4527 return visitAllocSite(Call); 4528 4529 return Changed ? &Call : nullptr; 4530 } 4531 4532 /// If the callee is a constexpr cast of a function, attempt to move the cast to 4533 /// the arguments of the call/callbr/invoke. 
4534 bool InstCombiner::transformConstExprCastCall(CallBase &Call) { 4535 auto *Callee = dyn_cast<Function>(Call.getCalledValue()->stripPointerCasts()); 4536 if (!Callee) 4537 return false; 4538 4539 // If this is a call to a thunk function, don't remove the cast. Thunks are 4540 // used to transparently forward all incoming parameters and outgoing return 4541 // values, so it's important to leave the cast in place. 4542 if (Callee->hasFnAttribute("thunk")) 4543 return false; 4544 4545 // If this is a musttail call, the callee's prototype must match the caller's 4546 // prototype with the exception of pointee types. The code below doesn't 4547 // implement that, so we can't do this transform. 4548 // TODO: Do the transform if it only requires adding pointer casts. 4549 if (Call.isMustTailCall()) 4550 return false; 4551 4552 Instruction *Caller = &Call; 4553 const AttributeList &CallerPAL = Call.getAttributes(); 4554 4555 // Okay, this is a cast from a function to a different type. Unless doing so 4556 // would cause a type conversion of one of our arguments, change this call to 4557 // be a direct call with arguments casted to the appropriate types. 4558 FunctionType *FT = Callee->getFunctionType(); 4559 Type *OldRetTy = Caller->getType(); 4560 Type *NewRetTy = FT->getReturnType(); 4561 4562 // Check to see if we are changing the return type... 4563 if (OldRetTy != NewRetTy) { 4564 4565 if (NewRetTy->isStructTy()) 4566 return false; // TODO: Handle multiple return values. 4567 4568 if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) { 4569 if (Callee->isDeclaration()) 4570 return false; // Cannot transform this return value. 4571 4572 if (!Caller->use_empty() && 4573 // void -> non-void is handled specially 4574 !NewRetTy->isVoidTy()) 4575 return false; // Cannot transform this return value. 
4576 } 4577 4578 if (!CallerPAL.isEmpty() && !Caller->use_empty()) { 4579 AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); 4580 if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy))) 4581 return false; // Attribute not compatible with transformed value. 4582 } 4583 4584 // If the callbase is an invoke/callbr instruction, and the return value is 4585 // used by a PHI node in a successor, we cannot change the return type of 4586 // the call because there is no place to put the cast instruction (without 4587 // breaking the critical edge). Bail out in this case. 4588 if (!Caller->use_empty()) { 4589 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) 4590 for (User *U : II->users()) 4591 if (PHINode *PN = dyn_cast<PHINode>(U)) 4592 if (PN->getParent() == II->getNormalDest() || 4593 PN->getParent() == II->getUnwindDest()) 4594 return false; 4595 // FIXME: Be conservative for callbr to avoid a quadratic search. 4596 if (isa<CallBrInst>(Caller)) 4597 return false; 4598 } 4599 } 4600 4601 unsigned NumActualArgs = Call.arg_size(); 4602 unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs); 4603 4604 // Prevent us turning: 4605 // declare void @takes_i32_inalloca(i32* inalloca) 4606 // call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0) 4607 // 4608 // into: 4609 // call void @takes_i32_inalloca(i32* null) 4610 // 4611 // Similarly, avoid folding away bitcasts of byval calls. 4612 if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) || 4613 Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal)) 4614 return false; 4615 4616 auto AI = Call.arg_begin(); 4617 for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) { 4618 Type *ParamTy = FT->getParamType(i); 4619 Type *ActTy = (*AI)->getType(); 4620 4621 if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL)) 4622 return false; // Cannot transform this parameter value. 
4623 4624 if (AttrBuilder(CallerPAL.getParamAttributes(i)) 4625 .overlaps(AttributeFuncs::typeIncompatible(ParamTy))) 4626 return false; // Attribute not compatible with transformed value. 4627 4628 if (Call.isInAllocaArgument(i)) 4629 return false; // Cannot transform to and from inalloca. 4630 4631 // If the parameter is passed as a byval argument, then we have to have a 4632 // sized type and the sized type has to have the same size as the old type. 4633 if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) { 4634 PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy); 4635 if (!ParamPTy || !ParamPTy->getElementType()->isSized()) 4636 return false; 4637 4638 Type *CurElTy = Call.getParamByValType(i); 4639 if (DL.getTypeAllocSize(CurElTy) != 4640 DL.getTypeAllocSize(ParamPTy->getElementType())) 4641 return false; 4642 } 4643 } 4644 4645 if (Callee->isDeclaration()) { 4646 // Do not delete arguments unless we have a function body. 4647 if (FT->getNumParams() < NumActualArgs && !FT->isVarArg()) 4648 return false; 4649 4650 // If the callee is just a declaration, don't change the varargsness of the 4651 // call. We don't want to introduce a varargs call where one doesn't 4652 // already exist. 4653 PointerType *APTy = cast<PointerType>(Call.getCalledValue()->getType()); 4654 if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg()) 4655 return false; 4656 4657 // If both the callee and the cast type are varargs, we still have to make 4658 // sure the number of fixed parameters are the same or we have the same 4659 // ABI issues as if we introduce a varargs call. 
4660 if (FT->isVarArg() && 4661 cast<FunctionType>(APTy->getElementType())->isVarArg() && 4662 FT->getNumParams() != 4663 cast<FunctionType>(APTy->getElementType())->getNumParams()) 4664 return false; 4665 } 4666 4667 if (FT->getNumParams() < NumActualArgs && FT->isVarArg() && 4668 !CallerPAL.isEmpty()) { 4669 // In this case we have more arguments than the new function type, but we 4670 // won't be dropping them. Check that these extra arguments have attributes 4671 // that are compatible with being a vararg call argument. 4672 unsigned SRetIdx; 4673 if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) && 4674 SRetIdx > FT->getNumParams()) 4675 return false; 4676 } 4677 4678 // Okay, we decided that this is a safe thing to do: go ahead and start 4679 // inserting cast instructions as necessary. 4680 SmallVector<Value *, 8> Args; 4681 SmallVector<AttributeSet, 8> ArgAttrs; 4682 Args.reserve(NumActualArgs); 4683 ArgAttrs.reserve(NumActualArgs); 4684 4685 // Get any return attributes. 4686 AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex); 4687 4688 // If the return value is not being used, the type may not be compatible 4689 // with the existing attributes. Wipe out any problematic attributes. 4690 RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy)); 4691 4692 LLVMContext &Ctx = Call.getContext(); 4693 AI = Call.arg_begin(); 4694 for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) { 4695 Type *ParamTy = FT->getParamType(i); 4696 4697 Value *NewArg = *AI; 4698 if ((*AI)->getType() != ParamTy) 4699 NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy); 4700 Args.push_back(NewArg); 4701 4702 // Add any parameter attributes. 
4703 if (CallerPAL.hasParamAttribute(i, Attribute::ByVal)) { 4704 AttrBuilder AB(CallerPAL.getParamAttributes(i)); 4705 AB.addByValAttr(NewArg->getType()->getPointerElementType()); 4706 ArgAttrs.push_back(AttributeSet::get(Ctx, AB)); 4707 } else 4708 ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); 4709 } 4710 4711 // If the function takes more arguments than the call was taking, add them 4712 // now. 4713 for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) { 4714 Args.push_back(Constant::getNullValue(FT->getParamType(i))); 4715 ArgAttrs.push_back(AttributeSet()); 4716 } 4717 4718 // If we are removing arguments to the function, emit an obnoxious warning. 4719 if (FT->getNumParams() < NumActualArgs) { 4720 // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722 4721 if (FT->isVarArg()) { 4722 // Add all of the arguments in their promoted form to the arg list. 4723 for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) { 4724 Type *PTy = getPromotedType((*AI)->getType()); 4725 Value *NewArg = *AI; 4726 if (PTy != (*AI)->getType()) { 4727 // Must promote to pass through va_arg area! 4728 Instruction::CastOps opcode = 4729 CastInst::getCastOpcode(*AI, false, PTy, false); 4730 NewArg = Builder.CreateCast(opcode, *AI, PTy); 4731 } 4732 Args.push_back(NewArg); 4733 4734 // Add any parameter attributes. 4735 ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); 4736 } 4737 } 4738 } 4739 4740 AttributeSet FnAttrs = CallerPAL.getFnAttributes(); 4741 4742 if (NewRetTy->isVoidTy()) 4743 Caller->setName(""); // Void type should not have a name. 
4744 4745 assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) && 4746 "missing argument attributes"); 4747 AttributeList NewCallerPAL = AttributeList::get( 4748 Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs); 4749 4750 SmallVector<OperandBundleDef, 1> OpBundles; 4751 Call.getOperandBundlesAsDefs(OpBundles); 4752 4753 CallBase *NewCall; 4754 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { 4755 NewCall = Builder.CreateInvoke(Callee, II->getNormalDest(), 4756 II->getUnwindDest(), Args, OpBundles); 4757 } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) { 4758 NewCall = Builder.CreateCallBr(Callee, CBI->getDefaultDest(), 4759 CBI->getIndirectDests(), Args, OpBundles); 4760 } else { 4761 NewCall = Builder.CreateCall(Callee, Args, OpBundles); 4762 cast<CallInst>(NewCall)->setTailCallKind( 4763 cast<CallInst>(Caller)->getTailCallKind()); 4764 } 4765 NewCall->takeName(Caller); 4766 NewCall->setCallingConv(Call.getCallingConv()); 4767 NewCall->setAttributes(NewCallerPAL); 4768 4769 // Preserve the weight metadata for the new call instruction. The metadata 4770 // is used by SamplePGO to check callsite's hotness. 4771 uint64_t W; 4772 if (Caller->extractProfTotalWeight(W)) 4773 NewCall->setProfWeight(W); 4774 4775 // Insert a cast of the return type as necessary. 4776 Instruction *NC = NewCall; 4777 Value *NV = NC; 4778 if (OldRetTy != NV->getType() && !Caller->use_empty()) { 4779 if (!NV->getType()->isVoidTy()) { 4780 NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy); 4781 NC->setDebugLoc(Caller->getDebugLoc()); 4782 4783 // If this is an invoke/callbr instruction, we should insert it after the 4784 // first non-phi instruction in the normal successor block. 
4785 if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { 4786 BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt(); 4787 InsertNewInstBefore(NC, *I); 4788 } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) { 4789 BasicBlock::iterator I = CBI->getDefaultDest()->getFirstInsertionPt(); 4790 InsertNewInstBefore(NC, *I); 4791 } else { 4792 // Otherwise, it's a call, just insert cast right after the call. 4793 InsertNewInstBefore(NC, *Caller); 4794 } 4795 Worklist.pushUsersToWorkList(*Caller); 4796 } else { 4797 NV = UndefValue::get(Caller->getType()); 4798 } 4799 } 4800 4801 if (!Caller->use_empty()) 4802 replaceInstUsesWith(*Caller, NV); 4803 else if (Caller->hasValueHandle()) { 4804 if (OldRetTy == NV->getType()) 4805 ValueHandleBase::ValueIsRAUWd(Caller, NV); 4806 else 4807 // We cannot call ValueIsRAUWd with a different type, and the 4808 // actual tracked value will disappear. 4809 ValueHandleBase::ValueIsDeleted(Caller); 4810 } 4811 4812 eraseInstFromFunction(*Caller); 4813 return true; 4814 } 4815 4816 /// Turn a call to a function created by init_trampoline / adjust_trampoline 4817 /// intrinsic pair into a direct call to the underlying function. 4818 Instruction * 4819 InstCombiner::transformCallThroughTrampoline(CallBase &Call, 4820 IntrinsicInst &Tramp) { 4821 Value *Callee = Call.getCalledValue(); 4822 Type *CalleeTy = Callee->getType(); 4823 FunctionType *FTy = Call.getFunctionType(); 4824 AttributeList Attrs = Call.getAttributes(); 4825 4826 // If the call already has the 'nest' attribute somewhere then give up - 4827 // otherwise 'nest' would occur twice after splicing in the chain. 
4828 if (Attrs.hasAttrSomewhere(Attribute::Nest)) 4829 return nullptr; 4830 4831 Function *NestF = cast<Function>(Tramp.getArgOperand(1)->stripPointerCasts()); 4832 FunctionType *NestFTy = NestF->getFunctionType(); 4833 4834 AttributeList NestAttrs = NestF->getAttributes(); 4835 if (!NestAttrs.isEmpty()) { 4836 unsigned NestArgNo = 0; 4837 Type *NestTy = nullptr; 4838 AttributeSet NestAttr; 4839 4840 // Look for a parameter marked with the 'nest' attribute. 4841 for (FunctionType::param_iterator I = NestFTy->param_begin(), 4842 E = NestFTy->param_end(); 4843 I != E; ++NestArgNo, ++I) { 4844 AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo); 4845 if (AS.hasAttribute(Attribute::Nest)) { 4846 // Record the parameter type and any other attributes. 4847 NestTy = *I; 4848 NestAttr = AS; 4849 break; 4850 } 4851 } 4852 4853 if (NestTy) { 4854 std::vector<Value*> NewArgs; 4855 std::vector<AttributeSet> NewArgAttrs; 4856 NewArgs.reserve(Call.arg_size() + 1); 4857 NewArgAttrs.reserve(Call.arg_size()); 4858 4859 // Insert the nest argument into the call argument list, which may 4860 // mean appending it. Likewise for attributes. 4861 4862 { 4863 unsigned ArgNo = 0; 4864 auto I = Call.arg_begin(), E = Call.arg_end(); 4865 do { 4866 if (ArgNo == NestArgNo) { 4867 // Add the chain argument and attributes. 4868 Value *NestVal = Tramp.getArgOperand(2); 4869 if (NestVal->getType() != NestTy) 4870 NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest"); 4871 NewArgs.push_back(NestVal); 4872 NewArgAttrs.push_back(NestAttr); 4873 } 4874 4875 if (I == E) 4876 break; 4877 4878 // Add the original argument and attributes. 4879 NewArgs.push_back(*I); 4880 NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo)); 4881 4882 ++ArgNo; 4883 ++I; 4884 } while (true); 4885 } 4886 4887 // The trampoline may have been bitcast to a bogus type (FTy). 4888 // Handle this by synthesizing a new function type, equal to FTy 4889 // with the chain parameter inserted. 
4890 4891 std::vector<Type*> NewTypes; 4892 NewTypes.reserve(FTy->getNumParams()+1); 4893 4894 // Insert the chain's type into the list of parameter types, which may 4895 // mean appending it. 4896 { 4897 unsigned ArgNo = 0; 4898 FunctionType::param_iterator I = FTy->param_begin(), 4899 E = FTy->param_end(); 4900 4901 do { 4902 if (ArgNo == NestArgNo) 4903 // Add the chain's type. 4904 NewTypes.push_back(NestTy); 4905 4906 if (I == E) 4907 break; 4908 4909 // Add the original type. 4910 NewTypes.push_back(*I); 4911 4912 ++ArgNo; 4913 ++I; 4914 } while (true); 4915 } 4916 4917 // Replace the trampoline call with a direct call. Let the generic 4918 // code sort out any function type mismatches. 4919 FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes, 4920 FTy->isVarArg()); 4921 Constant *NewCallee = 4922 NestF->getType() == PointerType::getUnqual(NewFTy) ? 4923 NestF : ConstantExpr::getBitCast(NestF, 4924 PointerType::getUnqual(NewFTy)); 4925 AttributeList NewPAL = 4926 AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(), 4927 Attrs.getRetAttributes(), NewArgAttrs); 4928 4929 SmallVector<OperandBundleDef, 1> OpBundles; 4930 Call.getOperandBundlesAsDefs(OpBundles); 4931 4932 Instruction *NewCaller; 4933 if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) { 4934 NewCaller = InvokeInst::Create(NewFTy, NewCallee, 4935 II->getNormalDest(), II->getUnwindDest(), 4936 NewArgs, OpBundles); 4937 cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv()); 4938 cast<InvokeInst>(NewCaller)->setAttributes(NewPAL); 4939 } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(&Call)) { 4940 NewCaller = 4941 CallBrInst::Create(NewFTy, NewCallee, CBI->getDefaultDest(), 4942 CBI->getIndirectDests(), NewArgs, OpBundles); 4943 cast<CallBrInst>(NewCaller)->setCallingConv(CBI->getCallingConv()); 4944 cast<CallBrInst>(NewCaller)->setAttributes(NewPAL); 4945 } else { 4946 NewCaller = CallInst::Create(NewFTy, NewCallee, NewArgs, OpBundles); 4947 
cast<CallInst>(NewCaller)->setTailCallKind( 4948 cast<CallInst>(Call).getTailCallKind()); 4949 cast<CallInst>(NewCaller)->setCallingConv( 4950 cast<CallInst>(Call).getCallingConv()); 4951 cast<CallInst>(NewCaller)->setAttributes(NewPAL); 4952 } 4953 NewCaller->setDebugLoc(Call.getDebugLoc()); 4954 4955 return NewCaller; 4956 } 4957 } 4958 4959 // Replace the trampoline call with a direct call. Since there is no 'nest' 4960 // parameter, there is no need to adjust the argument list. Let the generic 4961 // code sort out any function type mismatches. 4962 Constant *NewCallee = ConstantExpr::getBitCast(NestF, CalleeTy); 4963 Call.setCalledFunction(FTy, NewCallee); 4964 return &Call; 4965 } 4966