1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a TargetTransformInfo analysis pass specific to the
10 /// X86 target machine. It uses the target's detailed information to provide
11 /// more precise answers to certain TTI queries, while letting the target
12 /// independent and default TTI implementations handle the rest.
13 ///
14 //===----------------------------------------------------------------------===//
15
16 #include "X86TargetTransformInfo.h"
17 #include "llvm/IR/IntrinsicInst.h"
18 #include "llvm/IR/IntrinsicsX86.h"
19 #include "llvm/Support/KnownBits.h"
20 #include "llvm/Transforms/InstCombine/InstCombiner.h"
21
22 using namespace llvm;
23
24 #define DEBUG_TYPE "x86tti"
25
26 /// Return a constant boolean vector that has true elements in all positions
27 /// where the input constant data vector has an element with the sign bit set.
getNegativeIsTrueBoolVec(Constant * V)28 static Constant *getNegativeIsTrueBoolVec(Constant *V) {
29 VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
30 V = ConstantExpr::getBitCast(V, IntTy);
31 V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
32 V);
33 return V;
34 }
35
36 /// Convert the x86 XMM integer vector mask to a vector of bools based on
37 /// each element's most significant bit (the sign bit).
getBoolVecFromMask(Value * Mask)38 static Value *getBoolVecFromMask(Value *Mask) {
39 // Fold Constant Mask.
40 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
41 return getNegativeIsTrueBoolVec(ConstantMask);
42
43 // Mask was extended from a boolean vector.
44 Value *ExtMask;
45 if (PatternMatch::match(
46 Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
47 ExtMask->getType()->isIntOrIntVectorTy(1))
48 return ExtMask;
49
50 return nullptr;
51 }
52
53 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
54 // XMM register mask efficiently, we could transform all x86 masked intrinsics
55 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
simplifyX86MaskedLoad(IntrinsicInst & II,InstCombiner & IC)56 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
57 Value *Ptr = II.getOperand(0);
58 Value *Mask = II.getOperand(1);
59 Constant *ZeroVec = Constant::getNullValue(II.getType());
60
61 // Zero Mask - masked load instruction creates a zero vector.
62 if (isa<ConstantAggregateZero>(Mask))
63 return IC.replaceInstUsesWith(II, ZeroVec);
64
65 // The mask is constant or extended from a bool vector. Convert this x86
66 // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
67 if (Value *BoolMask = getBoolVecFromMask(Mask)) {
68 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
69 // the LLVM intrinsic definition for the pointer argument.
70 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
71 PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
72 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
73
74 // The pass-through vector for an x86 masked load is a zero vector.
75 CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
76 II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
77 return IC.replaceInstUsesWith(II, NewMaskedLoad);
78 }
79
80 return nullptr;
81 }
82
83 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
84 // XMM register mask efficiently, we could transform all x86 masked intrinsics
85 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
simplifyX86MaskedStore(IntrinsicInst & II,InstCombiner & IC)86 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
87 Value *Ptr = II.getOperand(0);
88 Value *Mask = II.getOperand(1);
89 Value *Vec = II.getOperand(2);
90
91 // Zero Mask - this masked store instruction does nothing.
92 if (isa<ConstantAggregateZero>(Mask)) {
93 IC.eraseInstFromFunction(II);
94 return true;
95 }
96
97 // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
98 // anything else at this level.
99 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
100 return false;
101
102 // The mask is constant or extended from a bool vector. Convert this x86
103 // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
104 if (Value *BoolMask = getBoolVecFromMask(Mask)) {
105 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
106 PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
107 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
108
109 IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
110
111 // 'Replace uses' doesn't work for stores. Erase the original masked store.
112 IC.eraseInstFromFunction(II);
113 return true;
114 }
115
116 return false;
117 }
118
simplifyX86immShift(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)119 static Value *simplifyX86immShift(const IntrinsicInst &II,
120 InstCombiner::BuilderTy &Builder) {
121 bool LogicalShift = false;
122 bool ShiftLeft = false;
123 bool IsImm = false;
124
125 switch (II.getIntrinsicID()) {
126 default:
127 llvm_unreachable("Unexpected intrinsic!");
128 case Intrinsic::x86_sse2_psrai_d:
129 case Intrinsic::x86_sse2_psrai_w:
130 case Intrinsic::x86_avx2_psrai_d:
131 case Intrinsic::x86_avx2_psrai_w:
132 case Intrinsic::x86_avx512_psrai_q_128:
133 case Intrinsic::x86_avx512_psrai_q_256:
134 case Intrinsic::x86_avx512_psrai_d_512:
135 case Intrinsic::x86_avx512_psrai_q_512:
136 case Intrinsic::x86_avx512_psrai_w_512:
137 IsImm = true;
138 LLVM_FALLTHROUGH;
139 case Intrinsic::x86_sse2_psra_d:
140 case Intrinsic::x86_sse2_psra_w:
141 case Intrinsic::x86_avx2_psra_d:
142 case Intrinsic::x86_avx2_psra_w:
143 case Intrinsic::x86_avx512_psra_q_128:
144 case Intrinsic::x86_avx512_psra_q_256:
145 case Intrinsic::x86_avx512_psra_d_512:
146 case Intrinsic::x86_avx512_psra_q_512:
147 case Intrinsic::x86_avx512_psra_w_512:
148 LogicalShift = false;
149 ShiftLeft = false;
150 break;
151 case Intrinsic::x86_sse2_psrli_d:
152 case Intrinsic::x86_sse2_psrli_q:
153 case Intrinsic::x86_sse2_psrli_w:
154 case Intrinsic::x86_avx2_psrli_d:
155 case Intrinsic::x86_avx2_psrli_q:
156 case Intrinsic::x86_avx2_psrli_w:
157 case Intrinsic::x86_avx512_psrli_d_512:
158 case Intrinsic::x86_avx512_psrli_q_512:
159 case Intrinsic::x86_avx512_psrli_w_512:
160 IsImm = true;
161 LLVM_FALLTHROUGH;
162 case Intrinsic::x86_sse2_psrl_d:
163 case Intrinsic::x86_sse2_psrl_q:
164 case Intrinsic::x86_sse2_psrl_w:
165 case Intrinsic::x86_avx2_psrl_d:
166 case Intrinsic::x86_avx2_psrl_q:
167 case Intrinsic::x86_avx2_psrl_w:
168 case Intrinsic::x86_avx512_psrl_d_512:
169 case Intrinsic::x86_avx512_psrl_q_512:
170 case Intrinsic::x86_avx512_psrl_w_512:
171 LogicalShift = true;
172 ShiftLeft = false;
173 break;
174 case Intrinsic::x86_sse2_pslli_d:
175 case Intrinsic::x86_sse2_pslli_q:
176 case Intrinsic::x86_sse2_pslli_w:
177 case Intrinsic::x86_avx2_pslli_d:
178 case Intrinsic::x86_avx2_pslli_q:
179 case Intrinsic::x86_avx2_pslli_w:
180 case Intrinsic::x86_avx512_pslli_d_512:
181 case Intrinsic::x86_avx512_pslli_q_512:
182 case Intrinsic::x86_avx512_pslli_w_512:
183 IsImm = true;
184 LLVM_FALLTHROUGH;
185 case Intrinsic::x86_sse2_psll_d:
186 case Intrinsic::x86_sse2_psll_q:
187 case Intrinsic::x86_sse2_psll_w:
188 case Intrinsic::x86_avx2_psll_d:
189 case Intrinsic::x86_avx2_psll_q:
190 case Intrinsic::x86_avx2_psll_w:
191 case Intrinsic::x86_avx512_psll_d_512:
192 case Intrinsic::x86_avx512_psll_q_512:
193 case Intrinsic::x86_avx512_psll_w_512:
194 LogicalShift = true;
195 ShiftLeft = true;
196 break;
197 }
198 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
199
200 Value *Vec = II.getArgOperand(0);
201 Value *Amt = II.getArgOperand(1);
202 auto *VT = cast<FixedVectorType>(Vec->getType());
203 Type *SVT = VT->getElementType();
204 Type *AmtVT = Amt->getType();
205 unsigned VWidth = VT->getNumElements();
206 unsigned BitWidth = SVT->getPrimitiveSizeInBits();
207
208 // If the shift amount is guaranteed to be in-range we can replace it with a
209 // generic shift. If its guaranteed to be out of range, logical shifts combine
210 // to zero and arithmetic shifts are clamped to (BitWidth - 1).
211 if (IsImm) {
212 assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
213 KnownBits KnownAmtBits =
214 llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
215 if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
216 Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
217 Amt = Builder.CreateVectorSplat(VWidth, Amt);
218 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
219 : Builder.CreateLShr(Vec, Amt))
220 : Builder.CreateAShr(Vec, Amt));
221 }
222 if (KnownAmtBits.getMinValue().uge(BitWidth)) {
223 if (LogicalShift)
224 return ConstantAggregateZero::get(VT);
225 Amt = ConstantInt::get(SVT, BitWidth - 1);
226 return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
227 }
228 } else {
229 // Ensure the first element has an in-range value and the rest of the
230 // elements in the bottom 64 bits are zero.
231 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
232 cast<VectorType>(AmtVT)->getElementType() == SVT &&
233 "Unexpected shift-by-scalar type");
234 unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
235 APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
236 APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
237 KnownBits KnownLowerBits = llvm::computeKnownBits(
238 Amt, DemandedLower, II.getModule()->getDataLayout());
239 KnownBits KnownUpperBits = llvm::computeKnownBits(
240 Amt, DemandedUpper, II.getModule()->getDataLayout());
241 if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
242 (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
243 SmallVector<int, 16> ZeroSplat(VWidth, 0);
244 Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
245 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
246 : Builder.CreateLShr(Vec, Amt))
247 : Builder.CreateAShr(Vec, Amt));
248 }
249 }
250
251 // Simplify if count is constant vector.
252 auto *CDV = dyn_cast<ConstantDataVector>(Amt);
253 if (!CDV)
254 return nullptr;
255
256 // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
257 // operand to compute the shift amount.
258 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
259 cast<VectorType>(AmtVT)->getElementType() == SVT &&
260 "Unexpected shift-by-scalar type");
261
262 // Concatenate the sub-elements to create the 64-bit value.
263 APInt Count(64, 0);
264 for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
265 unsigned SubEltIdx = (NumSubElts - 1) - i;
266 auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
267 Count <<= BitWidth;
268 Count |= SubElt->getValue().zextOrTrunc(64);
269 }
270
271 // If shift-by-zero then just return the original value.
272 if (Count.isZero())
273 return Vec;
274
275 // Handle cases when Shift >= BitWidth.
276 if (Count.uge(BitWidth)) {
277 // If LogicalShift - just return zero.
278 if (LogicalShift)
279 return ConstantAggregateZero::get(VT);
280
281 // If ArithmeticShift - clamp Shift to (BitWidth - 1).
282 Count = APInt(64, BitWidth - 1);
283 }
284
285 // Get a constant vector of the same type as the first operand.
286 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
287 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
288
289 if (ShiftLeft)
290 return Builder.CreateShl(Vec, ShiftVec);
291
292 if (LogicalShift)
293 return Builder.CreateLShr(Vec, ShiftVec);
294
295 return Builder.CreateAShr(Vec, ShiftVec);
296 }
297
298 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
299 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
300 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
simplifyX86varShift(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)301 static Value *simplifyX86varShift(const IntrinsicInst &II,
302 InstCombiner::BuilderTy &Builder) {
303 bool LogicalShift = false;
304 bool ShiftLeft = false;
305
306 switch (II.getIntrinsicID()) {
307 default:
308 llvm_unreachable("Unexpected intrinsic!");
309 case Intrinsic::x86_avx2_psrav_d:
310 case Intrinsic::x86_avx2_psrav_d_256:
311 case Intrinsic::x86_avx512_psrav_q_128:
312 case Intrinsic::x86_avx512_psrav_q_256:
313 case Intrinsic::x86_avx512_psrav_d_512:
314 case Intrinsic::x86_avx512_psrav_q_512:
315 case Intrinsic::x86_avx512_psrav_w_128:
316 case Intrinsic::x86_avx512_psrav_w_256:
317 case Intrinsic::x86_avx512_psrav_w_512:
318 LogicalShift = false;
319 ShiftLeft = false;
320 break;
321 case Intrinsic::x86_avx2_psrlv_d:
322 case Intrinsic::x86_avx2_psrlv_d_256:
323 case Intrinsic::x86_avx2_psrlv_q:
324 case Intrinsic::x86_avx2_psrlv_q_256:
325 case Intrinsic::x86_avx512_psrlv_d_512:
326 case Intrinsic::x86_avx512_psrlv_q_512:
327 case Intrinsic::x86_avx512_psrlv_w_128:
328 case Intrinsic::x86_avx512_psrlv_w_256:
329 case Intrinsic::x86_avx512_psrlv_w_512:
330 LogicalShift = true;
331 ShiftLeft = false;
332 break;
333 case Intrinsic::x86_avx2_psllv_d:
334 case Intrinsic::x86_avx2_psllv_d_256:
335 case Intrinsic::x86_avx2_psllv_q:
336 case Intrinsic::x86_avx2_psllv_q_256:
337 case Intrinsic::x86_avx512_psllv_d_512:
338 case Intrinsic::x86_avx512_psllv_q_512:
339 case Intrinsic::x86_avx512_psllv_w_128:
340 case Intrinsic::x86_avx512_psllv_w_256:
341 case Intrinsic::x86_avx512_psllv_w_512:
342 LogicalShift = true;
343 ShiftLeft = true;
344 break;
345 }
346 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
347
348 Value *Vec = II.getArgOperand(0);
349 Value *Amt = II.getArgOperand(1);
350 auto *VT = cast<FixedVectorType>(II.getType());
351 Type *SVT = VT->getElementType();
352 int NumElts = VT->getNumElements();
353 int BitWidth = SVT->getIntegerBitWidth();
354
355 // If the shift amount is guaranteed to be in-range we can replace it with a
356 // generic shift.
357 KnownBits KnownAmt =
358 llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
359 if (KnownAmt.getMaxValue().ult(BitWidth)) {
360 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
361 : Builder.CreateLShr(Vec, Amt))
362 : Builder.CreateAShr(Vec, Amt));
363 }
364
365 // Simplify if all shift amounts are constant/undef.
366 auto *CShift = dyn_cast<Constant>(Amt);
367 if (!CShift)
368 return nullptr;
369
370 // Collect each element's shift amount.
371 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
372 bool AnyOutOfRange = false;
373 SmallVector<int, 8> ShiftAmts;
374 for (int I = 0; I < NumElts; ++I) {
375 auto *CElt = CShift->getAggregateElement(I);
376 if (isa_and_nonnull<UndefValue>(CElt)) {
377 ShiftAmts.push_back(-1);
378 continue;
379 }
380
381 auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
382 if (!COp)
383 return nullptr;
384
385 // Handle out of range shifts.
386 // If LogicalShift - set to BitWidth (special case).
387 // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
388 APInt ShiftVal = COp->getValue();
389 if (ShiftVal.uge(BitWidth)) {
390 AnyOutOfRange = LogicalShift;
391 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
392 continue;
393 }
394
395 ShiftAmts.push_back((int)ShiftVal.getZExtValue());
396 }
397
398 // If all elements out of range or UNDEF, return vector of zeros/undefs.
399 // ArithmeticShift should only hit this if they are all UNDEF.
400 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
401 if (llvm::all_of(ShiftAmts, OutOfRange)) {
402 SmallVector<Constant *, 8> ConstantVec;
403 for (int Idx : ShiftAmts) {
404 if (Idx < 0) {
405 ConstantVec.push_back(UndefValue::get(SVT));
406 } else {
407 assert(LogicalShift && "Logical shift expected");
408 ConstantVec.push_back(ConstantInt::getNullValue(SVT));
409 }
410 }
411 return ConstantVector::get(ConstantVec);
412 }
413
414 // We can't handle only some out of range values with generic logical shifts.
415 if (AnyOutOfRange)
416 return nullptr;
417
418 // Build the shift amount constant vector.
419 SmallVector<Constant *, 8> ShiftVecAmts;
420 for (int Idx : ShiftAmts) {
421 if (Idx < 0)
422 ShiftVecAmts.push_back(UndefValue::get(SVT));
423 else
424 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
425 }
426 auto ShiftVec = ConstantVector::get(ShiftVecAmts);
427
428 if (ShiftLeft)
429 return Builder.CreateShl(Vec, ShiftVec);
430
431 if (LogicalShift)
432 return Builder.CreateLShr(Vec, ShiftVec);
433
434 return Builder.CreateAShr(Vec, ShiftVec);
435 }
436
simplifyX86pack(IntrinsicInst & II,InstCombiner::BuilderTy & Builder,bool IsSigned)437 static Value *simplifyX86pack(IntrinsicInst &II,
438 InstCombiner::BuilderTy &Builder, bool IsSigned) {
439 Value *Arg0 = II.getArgOperand(0);
440 Value *Arg1 = II.getArgOperand(1);
441 Type *ResTy = II.getType();
442
443 // Fast all undef handling.
444 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
445 return UndefValue::get(ResTy);
446
447 auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
448 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
449 unsigned NumSrcElts = ArgTy->getNumElements();
450 assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
451 "Unexpected packing types");
452
453 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
454 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
455 unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
456 assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
457 "Unexpected packing types");
458
459 // Constant folding.
460 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
461 return nullptr;
462
463 // Clamp Values - signed/unsigned both use signed clamp values, but they
464 // differ on the min/max values.
465 APInt MinValue, MaxValue;
466 if (IsSigned) {
467 // PACKSS: Truncate signed value with signed saturation.
468 // Source values less than dst minint are saturated to minint.
469 // Source values greater than dst maxint are saturated to maxint.
470 MinValue =
471 APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
472 MaxValue =
473 APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
474 } else {
475 // PACKUS: Truncate signed value with unsigned saturation.
476 // Source values less than zero are saturated to zero.
477 // Source values greater than dst maxuint are saturated to maxuint.
478 MinValue = APInt::getZero(SrcScalarSizeInBits);
479 MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
480 }
481
482 auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
483 auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
484 Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
485 Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
486 Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
487 Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
488
489 // Shuffle clamped args together at the lane level.
490 SmallVector<int, 32> PackMask;
491 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
492 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
493 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
494 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
495 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
496 }
497 auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
498
499 // Truncate to dst size.
500 return Builder.CreateTrunc(Shuffle, ResTy);
501 }
502
simplifyX86movmsk(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)503 static Value *simplifyX86movmsk(const IntrinsicInst &II,
504 InstCombiner::BuilderTy &Builder) {
505 Value *Arg = II.getArgOperand(0);
506 Type *ResTy = II.getType();
507
508 // movmsk(undef) -> zero as we must ensure the upper bits are zero.
509 if (isa<UndefValue>(Arg))
510 return Constant::getNullValue(ResTy);
511
512 auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
513 // We can't easily peek through x86_mmx types.
514 if (!ArgTy)
515 return nullptr;
516
517 // Expand MOVMSK to compare/bitcast/zext:
518 // e.g. PMOVMSKB(v16i8 x):
519 // %cmp = icmp slt <16 x i8> %x, zeroinitializer
520 // %int = bitcast <16 x i1> %cmp to i16
521 // %res = zext i16 %int to i32
522 unsigned NumElts = ArgTy->getNumElements();
523 Type *IntegerTy = Builder.getIntNTy(NumElts);
524
525 Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
526 Res = Builder.CreateIsNeg(Res);
527 Res = Builder.CreateBitCast(Res, IntegerTy);
528 Res = Builder.CreateZExtOrTrunc(Res, ResTy);
529 return Res;
530 }
531
simplifyX86addcarry(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)532 static Value *simplifyX86addcarry(const IntrinsicInst &II,
533 InstCombiner::BuilderTy &Builder) {
534 Value *CarryIn = II.getArgOperand(0);
535 Value *Op1 = II.getArgOperand(1);
536 Value *Op2 = II.getArgOperand(2);
537 Type *RetTy = II.getType();
538 Type *OpTy = Op1->getType();
539 assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
540 RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
541 "Unexpected types for x86 addcarry");
542
543 // If carry-in is zero, this is just an unsigned add with overflow.
544 if (match(CarryIn, PatternMatch::m_ZeroInt())) {
545 Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
546 {Op1, Op2});
547 // The types have to be adjusted to match the x86 call types.
548 Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
549 Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
550 Builder.getInt8Ty());
551 Value *Res = UndefValue::get(RetTy);
552 Res = Builder.CreateInsertValue(Res, UAddOV, 0);
553 return Builder.CreateInsertValue(Res, UAddResult, 1);
554 }
555
556 return nullptr;
557 }
558
simplifyX86insertps(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)559 static Value *simplifyX86insertps(const IntrinsicInst &II,
560 InstCombiner::BuilderTy &Builder) {
561 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
562 if (!CInt)
563 return nullptr;
564
565 auto *VecTy = cast<FixedVectorType>(II.getType());
566 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
567
568 // The immediate permute control byte looks like this:
569 // [3:0] - zero mask for each 32-bit lane
570 // [5:4] - select one 32-bit destination lane
571 // [7:6] - select one 32-bit source lane
572
573 uint8_t Imm = CInt->getZExtValue();
574 uint8_t ZMask = Imm & 0xf;
575 uint8_t DestLane = (Imm >> 4) & 0x3;
576 uint8_t SourceLane = (Imm >> 6) & 0x3;
577
578 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
579
580 // If all zero mask bits are set, this was just a weird way to
581 // generate a zero vector.
582 if (ZMask == 0xf)
583 return ZeroVector;
584
585 // Initialize by passing all of the first source bits through.
586 int ShuffleMask[4] = {0, 1, 2, 3};
587
588 // We may replace the second operand with the zero vector.
589 Value *V1 = II.getArgOperand(1);
590
591 if (ZMask) {
592 // If the zero mask is being used with a single input or the zero mask
593 // overrides the destination lane, this is a shuffle with the zero vector.
594 if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
595 (ZMask & (1 << DestLane))) {
596 V1 = ZeroVector;
597 // We may still move 32-bits of the first source vector from one lane
598 // to another.
599 ShuffleMask[DestLane] = SourceLane;
600 // The zero mask may override the previous insert operation.
601 for (unsigned i = 0; i < 4; ++i)
602 if ((ZMask >> i) & 0x1)
603 ShuffleMask[i] = i + 4;
604 } else {
605 // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
606 return nullptr;
607 }
608 } else {
609 // Replace the selected destination lane with the selected source lane.
610 ShuffleMask[DestLane] = SourceLane + 4;
611 }
612
613 return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
614 }
615
616 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
617 /// or conversion to a shuffle vector.
simplifyX86extrq(IntrinsicInst & II,Value * Op0,ConstantInt * CILength,ConstantInt * CIIndex,InstCombiner::BuilderTy & Builder)618 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
619 ConstantInt *CILength, ConstantInt *CIIndex,
620 InstCombiner::BuilderTy &Builder) {
621 auto LowConstantHighUndef = [&](uint64_t Val) {
622 Type *IntTy64 = Type::getInt64Ty(II.getContext());
623 Constant *Args[] = {ConstantInt::get(IntTy64, Val),
624 UndefValue::get(IntTy64)};
625 return ConstantVector::get(Args);
626 };
627
628 // See if we're dealing with constant values.
629 auto *C0 = dyn_cast<Constant>(Op0);
630 auto *CI0 =
631 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
632 : nullptr;
633
634 // Attempt to constant fold.
635 if (CILength && CIIndex) {
636 // From AMD documentation: "The bit index and field length are each six
637 // bits in length other bits of the field are ignored."
638 APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
639 APInt APLength = CILength->getValue().zextOrTrunc(6);
640
641 unsigned Index = APIndex.getZExtValue();
642
643 // From AMD documentation: "a value of zero in the field length is
644 // defined as length of 64".
645 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
646
647 // From AMD documentation: "If the sum of the bit index + length field
648 // is greater than 64, the results are undefined".
649 unsigned End = Index + Length;
650
651 // Note that both field index and field length are 8-bit quantities.
652 // Since variables 'Index' and 'Length' are unsigned values
653 // obtained from zero-extending field index and field length
654 // respectively, their sum should never wrap around.
655 if (End > 64)
656 return UndefValue::get(II.getType());
657
658 // If we are inserting whole bytes, we can convert this to a shuffle.
659 // Lowering can recognize EXTRQI shuffle masks.
660 if ((Length % 8) == 0 && (Index % 8) == 0) {
661 // Convert bit indices to byte indices.
662 Length /= 8;
663 Index /= 8;
664
665 Type *IntTy8 = Type::getInt8Ty(II.getContext());
666 auto *ShufTy = FixedVectorType::get(IntTy8, 16);
667
668 SmallVector<int, 16> ShuffleMask;
669 for (int i = 0; i != (int)Length; ++i)
670 ShuffleMask.push_back(i + Index);
671 for (int i = Length; i != 8; ++i)
672 ShuffleMask.push_back(i + 16);
673 for (int i = 8; i != 16; ++i)
674 ShuffleMask.push_back(-1);
675
676 Value *SV = Builder.CreateShuffleVector(
677 Builder.CreateBitCast(Op0, ShufTy),
678 ConstantAggregateZero::get(ShufTy), ShuffleMask);
679 return Builder.CreateBitCast(SV, II.getType());
680 }
681
682 // Constant Fold - shift Index'th bit to lowest position and mask off
683 // Length bits.
684 if (CI0) {
685 APInt Elt = CI0->getValue();
686 Elt.lshrInPlace(Index);
687 Elt = Elt.zextOrTrunc(Length);
688 return LowConstantHighUndef(Elt.getZExtValue());
689 }
690
691 // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
692 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
693 Value *Args[] = {Op0, CILength, CIIndex};
694 Module *M = II.getModule();
695 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
696 return Builder.CreateCall(F, Args);
697 }
698 }
699
700 // Constant Fold - extraction from zero is always {zero, undef}.
701 if (CI0 && CI0->isZero())
702 return LowConstantHighUndef(0);
703
704 return nullptr;
705 }
706
707 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
708 /// folding or conversion to a shuffle vector.
simplifyX86insertq(IntrinsicInst & II,Value * Op0,Value * Op1,APInt APLength,APInt APIndex,InstCombiner::BuilderTy & Builder)709 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
710 APInt APLength, APInt APIndex,
711 InstCombiner::BuilderTy &Builder) {
712 // From AMD documentation: "The bit index and field length are each six bits
713 // in length other bits of the field are ignored."
714 APIndex = APIndex.zextOrTrunc(6);
715 APLength = APLength.zextOrTrunc(6);
716
717 // Attempt to constant fold.
718 unsigned Index = APIndex.getZExtValue();
719
720 // From AMD documentation: "a value of zero in the field length is
721 // defined as length of 64".
722 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
723
724 // From AMD documentation: "If the sum of the bit index + length field
725 // is greater than 64, the results are undefined".
726 unsigned End = Index + Length;
727
728 // Note that both field index and field length are 8-bit quantities.
729 // Since variables 'Index' and 'Length' are unsigned values
730 // obtained from zero-extending field index and field length
731 // respectively, their sum should never wrap around.
732 if (End > 64)
733 return UndefValue::get(II.getType());
734
735 // If we are inserting whole bytes, we can convert this to a shuffle.
736 // Lowering can recognize INSERTQI shuffle masks.
737 if ((Length % 8) == 0 && (Index % 8) == 0) {
738 // Convert bit indices to byte indices.
739 Length /= 8;
740 Index /= 8;
741
742 Type *IntTy8 = Type::getInt8Ty(II.getContext());
743 auto *ShufTy = FixedVectorType::get(IntTy8, 16);
744
745 SmallVector<int, 16> ShuffleMask;
746 for (int i = 0; i != (int)Index; ++i)
747 ShuffleMask.push_back(i);
748 for (int i = 0; i != (int)Length; ++i)
749 ShuffleMask.push_back(i + 16);
750 for (int i = Index + Length; i != 8; ++i)
751 ShuffleMask.push_back(i);
752 for (int i = 8; i != 16; ++i)
753 ShuffleMask.push_back(-1);
754
755 Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
756 Builder.CreateBitCast(Op1, ShufTy),
757 ShuffleMask);
758 return Builder.CreateBitCast(SV, II.getType());
759 }
760
761 // See if we're dealing with constant values.
762 auto *C0 = dyn_cast<Constant>(Op0);
763 auto *C1 = dyn_cast<Constant>(Op1);
764 auto *CI00 =
765 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
766 : nullptr;
767 auto *CI10 =
768 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
769 : nullptr;
770
771 // Constant Fold - insert bottom Length bits starting at the Index'th bit.
772 if (CI00 && CI10) {
773 APInt V00 = CI00->getValue();
774 APInt V10 = CI10->getValue();
775 APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
776 V00 = V00 & ~Mask;
777 V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
778 APInt Val = V00 | V10;
779 Type *IntTy64 = Type::getInt64Ty(II.getContext());
780 Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
781 UndefValue::get(IntTy64)};
782 return ConstantVector::get(Args);
783 }
784
785 // If we were an INSERTQ call, we'll save demanded elements if we convert to
786 // INSERTQI.
787 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
788 Type *IntTy8 = Type::getInt8Ty(II.getContext());
789 Constant *CILength = ConstantInt::get(IntTy8, Length, false);
790 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
791
792 Value *Args[] = {Op0, Op1, CILength, CIIndex};
793 Module *M = II.getModule();
794 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
795 return Builder.CreateCall(F, Args);
796 }
797
798 return nullptr;
799 }
800
801 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
simplifyX86pshufb(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)802 static Value *simplifyX86pshufb(const IntrinsicInst &II,
803 InstCombiner::BuilderTy &Builder) {
804 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
805 if (!V)
806 return nullptr;
807
808 auto *VecTy = cast<FixedVectorType>(II.getType());
809 unsigned NumElts = VecTy->getNumElements();
810 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
811 "Unexpected number of elements in shuffle mask!");
812
813 // Construct a shuffle mask from constant integers or UNDEFs.
814 int Indexes[64];
815
816 // Each byte in the shuffle control mask forms an index to permute the
817 // corresponding byte in the destination operand.
818 for (unsigned I = 0; I < NumElts; ++I) {
819 Constant *COp = V->getAggregateElement(I);
820 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
821 return nullptr;
822
823 if (isa<UndefValue>(COp)) {
824 Indexes[I] = -1;
825 continue;
826 }
827
828 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
829
830 // If the most significant bit (bit[7]) of each byte of the shuffle
831 // control mask is set, then zero is written in the result byte.
832 // The zero vector is in the right-hand side of the resulting
833 // shufflevector.
834
835 // The value of each index for the high 128-bit lane is the least
836 // significant 4 bits of the respective shuffle control byte.
837 Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
838 Indexes[I] = Index;
839 }
840
841 auto V1 = II.getArgOperand(0);
842 auto V2 = Constant::getNullValue(VecTy);
843 return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
844 }
845
846 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
simplifyX86vpermilvar(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)847 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
848 InstCombiner::BuilderTy &Builder) {
849 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
850 if (!V)
851 return nullptr;
852
853 auto *VecTy = cast<FixedVectorType>(II.getType());
854 unsigned NumElts = VecTy->getNumElements();
855 bool IsPD = VecTy->getScalarType()->isDoubleTy();
856 unsigned NumLaneElts = IsPD ? 2 : 4;
857 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
858
859 // Construct a shuffle mask from constant integers or UNDEFs.
860 int Indexes[16];
861
862 // The intrinsics only read one or two bits, clear the rest.
863 for (unsigned I = 0; I < NumElts; ++I) {
864 Constant *COp = V->getAggregateElement(I);
865 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
866 return nullptr;
867
868 if (isa<UndefValue>(COp)) {
869 Indexes[I] = -1;
870 continue;
871 }
872
873 APInt Index = cast<ConstantInt>(COp)->getValue();
874 Index = Index.zextOrTrunc(32).getLoBits(2);
875
876 // The PD variants uses bit 1 to select per-lane element index, so
877 // shift down to convert to generic shuffle mask index.
878 if (IsPD)
879 Index.lshrInPlace(1);
880
881 // The _256 variants are a bit trickier since the mask bits always index
882 // into the corresponding 128 half. In order to convert to a generic
883 // shuffle, we have to make that explicit.
884 Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
885
886 Indexes[I] = Index.getZExtValue();
887 }
888
889 auto V1 = II.getArgOperand(0);
890 return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts));
891 }
892
893 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
simplifyX86vpermv(const IntrinsicInst & II,InstCombiner::BuilderTy & Builder)894 static Value *simplifyX86vpermv(const IntrinsicInst &II,
895 InstCombiner::BuilderTy &Builder) {
896 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
897 if (!V)
898 return nullptr;
899
900 auto *VecTy = cast<FixedVectorType>(II.getType());
901 unsigned Size = VecTy->getNumElements();
902 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
903 "Unexpected shuffle mask size");
904
905 // Construct a shuffle mask from constant integers or UNDEFs.
906 int Indexes[64];
907
908 for (unsigned I = 0; I < Size; ++I) {
909 Constant *COp = V->getAggregateElement(I);
910 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
911 return nullptr;
912
913 if (isa<UndefValue>(COp)) {
914 Indexes[I] = -1;
915 continue;
916 }
917
918 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
919 Index &= Size - 1;
920 Indexes[I] = Index;
921 }
922
923 auto V1 = II.getArgOperand(0);
924 return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size));
925 }
926
927 Optional<Instruction *>
instCombineIntrinsic(InstCombiner & IC,IntrinsicInst & II) const928 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
929 auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
930 unsigned DemandedWidth) {
931 APInt UndefElts(Width, 0);
932 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
933 return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
934 };
935
936 Intrinsic::ID IID = II.getIntrinsicID();
937 switch (IID) {
938 case Intrinsic::x86_bmi_bextr_32:
939 case Intrinsic::x86_bmi_bextr_64:
940 case Intrinsic::x86_tbm_bextri_u32:
941 case Intrinsic::x86_tbm_bextri_u64:
942 // If the RHS is a constant we can try some simplifications.
943 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
944 uint64_t Shift = C->getZExtValue();
945 uint64_t Length = (Shift >> 8) & 0xff;
946 Shift &= 0xff;
947 unsigned BitWidth = II.getType()->getIntegerBitWidth();
948 // If the length is 0 or the shift is out of range, replace with zero.
949 if (Length == 0 || Shift >= BitWidth) {
950 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
951 }
952 // If the LHS is also a constant, we can completely constant fold this.
953 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
954 uint64_t Result = InC->getZExtValue() >> Shift;
955 if (Length > BitWidth)
956 Length = BitWidth;
957 Result &= maskTrailingOnes<uint64_t>(Length);
958 return IC.replaceInstUsesWith(II,
959 ConstantInt::get(II.getType(), Result));
960 }
961 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
962 // are only masking bits that a shift already cleared?
963 }
964 break;
965
966 case Intrinsic::x86_bmi_bzhi_32:
967 case Intrinsic::x86_bmi_bzhi_64:
968 // If the RHS is a constant we can try some simplifications.
969 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
970 uint64_t Index = C->getZExtValue() & 0xff;
971 unsigned BitWidth = II.getType()->getIntegerBitWidth();
972 if (Index >= BitWidth) {
973 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
974 }
975 if (Index == 0) {
976 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
977 }
978 // If the LHS is also a constant, we can completely constant fold this.
979 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
980 uint64_t Result = InC->getZExtValue();
981 Result &= maskTrailingOnes<uint64_t>(Index);
982 return IC.replaceInstUsesWith(II,
983 ConstantInt::get(II.getType(), Result));
984 }
985 // TODO should we convert this to an AND if the RHS is constant?
986 }
987 break;
988 case Intrinsic::x86_bmi_pext_32:
989 case Intrinsic::x86_bmi_pext_64:
990 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
991 if (MaskC->isNullValue()) {
992 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
993 }
994 if (MaskC->isAllOnesValue()) {
995 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
996 }
997
998 unsigned MaskIdx, MaskLen;
999 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
1000 // any single contingous sequence of 1s anywhere in the mask simply
1001 // describes a subset of the input bits shifted to the appropriate
1002 // position. Replace with the straight forward IR.
1003 Value *Input = II.getArgOperand(0);
1004 Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
1005 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
1006 Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
1007 return IC.replaceInstUsesWith(II, Shifted);
1008 }
1009
1010 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1011 uint64_t Src = SrcC->getZExtValue();
1012 uint64_t Mask = MaskC->getZExtValue();
1013 uint64_t Result = 0;
1014 uint64_t BitToSet = 1;
1015
1016 while (Mask) {
1017 // Isolate lowest set bit.
1018 uint64_t BitToTest = Mask & -Mask;
1019 if (BitToTest & Src)
1020 Result |= BitToSet;
1021
1022 BitToSet <<= 1;
1023 // Clear lowest set bit.
1024 Mask &= Mask - 1;
1025 }
1026
1027 return IC.replaceInstUsesWith(II,
1028 ConstantInt::get(II.getType(), Result));
1029 }
1030 }
1031 break;
1032 case Intrinsic::x86_bmi_pdep_32:
1033 case Intrinsic::x86_bmi_pdep_64:
1034 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
1035 if (MaskC->isNullValue()) {
1036 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
1037 }
1038 if (MaskC->isAllOnesValue()) {
1039 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1040 }
1041
1042 unsigned MaskIdx, MaskLen;
1043 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
1044 // any single contingous sequence of 1s anywhere in the mask simply
1045 // describes a subset of the input bits shifted to the appropriate
1046 // position. Replace with the straight forward IR.
1047 Value *Input = II.getArgOperand(0);
1048 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
1049 Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
1050 Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
1051 return IC.replaceInstUsesWith(II, Masked);
1052 }
1053
1054 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
1055 uint64_t Src = SrcC->getZExtValue();
1056 uint64_t Mask = MaskC->getZExtValue();
1057 uint64_t Result = 0;
1058 uint64_t BitToTest = 1;
1059
1060 while (Mask) {
1061 // Isolate lowest set bit.
1062 uint64_t BitToSet = Mask & -Mask;
1063 if (BitToTest & Src)
1064 Result |= BitToSet;
1065
1066 BitToTest <<= 1;
1067 // Clear lowest set bit;
1068 Mask &= Mask - 1;
1069 }
1070
1071 return IC.replaceInstUsesWith(II,
1072 ConstantInt::get(II.getType(), Result));
1073 }
1074 }
1075 break;
1076
1077 case Intrinsic::x86_sse_cvtss2si:
1078 case Intrinsic::x86_sse_cvtss2si64:
1079 case Intrinsic::x86_sse_cvttss2si:
1080 case Intrinsic::x86_sse_cvttss2si64:
1081 case Intrinsic::x86_sse2_cvtsd2si:
1082 case Intrinsic::x86_sse2_cvtsd2si64:
1083 case Intrinsic::x86_sse2_cvttsd2si:
1084 case Intrinsic::x86_sse2_cvttsd2si64:
1085 case Intrinsic::x86_avx512_vcvtss2si32:
1086 case Intrinsic::x86_avx512_vcvtss2si64:
1087 case Intrinsic::x86_avx512_vcvtss2usi32:
1088 case Intrinsic::x86_avx512_vcvtss2usi64:
1089 case Intrinsic::x86_avx512_vcvtsd2si32:
1090 case Intrinsic::x86_avx512_vcvtsd2si64:
1091 case Intrinsic::x86_avx512_vcvtsd2usi32:
1092 case Intrinsic::x86_avx512_vcvtsd2usi64:
1093 case Intrinsic::x86_avx512_cvttss2si:
1094 case Intrinsic::x86_avx512_cvttss2si64:
1095 case Intrinsic::x86_avx512_cvttss2usi:
1096 case Intrinsic::x86_avx512_cvttss2usi64:
1097 case Intrinsic::x86_avx512_cvttsd2si:
1098 case Intrinsic::x86_avx512_cvttsd2si64:
1099 case Intrinsic::x86_avx512_cvttsd2usi:
1100 case Intrinsic::x86_avx512_cvttsd2usi64: {
1101 // These intrinsics only demand the 0th element of their input vectors. If
1102 // we can simplify the input based on that, do so now.
1103 Value *Arg = II.getArgOperand(0);
1104 unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
1105 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
1106 return IC.replaceOperand(II, 0, V);
1107 }
1108 break;
1109 }
1110
1111 case Intrinsic::x86_mmx_pmovmskb:
1112 case Intrinsic::x86_sse_movmsk_ps:
1113 case Intrinsic::x86_sse2_movmsk_pd:
1114 case Intrinsic::x86_sse2_pmovmskb_128:
1115 case Intrinsic::x86_avx_movmsk_pd_256:
1116 case Intrinsic::x86_avx_movmsk_ps_256:
1117 case Intrinsic::x86_avx2_pmovmskb:
1118 if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
1119 return IC.replaceInstUsesWith(II, V);
1120 }
1121 break;
1122
1123 case Intrinsic::x86_sse_comieq_ss:
1124 case Intrinsic::x86_sse_comige_ss:
1125 case Intrinsic::x86_sse_comigt_ss:
1126 case Intrinsic::x86_sse_comile_ss:
1127 case Intrinsic::x86_sse_comilt_ss:
1128 case Intrinsic::x86_sse_comineq_ss:
1129 case Intrinsic::x86_sse_ucomieq_ss:
1130 case Intrinsic::x86_sse_ucomige_ss:
1131 case Intrinsic::x86_sse_ucomigt_ss:
1132 case Intrinsic::x86_sse_ucomile_ss:
1133 case Intrinsic::x86_sse_ucomilt_ss:
1134 case Intrinsic::x86_sse_ucomineq_ss:
1135 case Intrinsic::x86_sse2_comieq_sd:
1136 case Intrinsic::x86_sse2_comige_sd:
1137 case Intrinsic::x86_sse2_comigt_sd:
1138 case Intrinsic::x86_sse2_comile_sd:
1139 case Intrinsic::x86_sse2_comilt_sd:
1140 case Intrinsic::x86_sse2_comineq_sd:
1141 case Intrinsic::x86_sse2_ucomieq_sd:
1142 case Intrinsic::x86_sse2_ucomige_sd:
1143 case Intrinsic::x86_sse2_ucomigt_sd:
1144 case Intrinsic::x86_sse2_ucomile_sd:
1145 case Intrinsic::x86_sse2_ucomilt_sd:
1146 case Intrinsic::x86_sse2_ucomineq_sd:
1147 case Intrinsic::x86_avx512_vcomi_ss:
1148 case Intrinsic::x86_avx512_vcomi_sd:
1149 case Intrinsic::x86_avx512_mask_cmp_ss:
1150 case Intrinsic::x86_avx512_mask_cmp_sd: {
1151 // These intrinsics only demand the 0th element of their input vectors. If
1152 // we can simplify the input based on that, do so now.
1153 bool MadeChange = false;
1154 Value *Arg0 = II.getArgOperand(0);
1155 Value *Arg1 = II.getArgOperand(1);
1156 unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
1157 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
1158 IC.replaceOperand(II, 0, V);
1159 MadeChange = true;
1160 }
1161 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
1162 IC.replaceOperand(II, 1, V);
1163 MadeChange = true;
1164 }
1165 if (MadeChange) {
1166 return &II;
1167 }
1168 break;
1169 }
1170
1171 case Intrinsic::x86_avx512_add_ps_512:
1172 case Intrinsic::x86_avx512_div_ps_512:
1173 case Intrinsic::x86_avx512_mul_ps_512:
1174 case Intrinsic::x86_avx512_sub_ps_512:
1175 case Intrinsic::x86_avx512_add_pd_512:
1176 case Intrinsic::x86_avx512_div_pd_512:
1177 case Intrinsic::x86_avx512_mul_pd_512:
1178 case Intrinsic::x86_avx512_sub_pd_512:
1179 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
1180 // IR operations.
1181 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
1182 if (R->getValue() == 4) {
1183 Value *Arg0 = II.getArgOperand(0);
1184 Value *Arg1 = II.getArgOperand(1);
1185
1186 Value *V;
1187 switch (IID) {
1188 default:
1189 llvm_unreachable("Case stmts out of sync!");
1190 case Intrinsic::x86_avx512_add_ps_512:
1191 case Intrinsic::x86_avx512_add_pd_512:
1192 V = IC.Builder.CreateFAdd(Arg0, Arg1);
1193 break;
1194 case Intrinsic::x86_avx512_sub_ps_512:
1195 case Intrinsic::x86_avx512_sub_pd_512:
1196 V = IC.Builder.CreateFSub(Arg0, Arg1);
1197 break;
1198 case Intrinsic::x86_avx512_mul_ps_512:
1199 case Intrinsic::x86_avx512_mul_pd_512:
1200 V = IC.Builder.CreateFMul(Arg0, Arg1);
1201 break;
1202 case Intrinsic::x86_avx512_div_ps_512:
1203 case Intrinsic::x86_avx512_div_pd_512:
1204 V = IC.Builder.CreateFDiv(Arg0, Arg1);
1205 break;
1206 }
1207
1208 return IC.replaceInstUsesWith(II, V);
1209 }
1210 }
1211 break;
1212
1213 case Intrinsic::x86_avx512_mask_add_ss_round:
1214 case Intrinsic::x86_avx512_mask_div_ss_round:
1215 case Intrinsic::x86_avx512_mask_mul_ss_round:
1216 case Intrinsic::x86_avx512_mask_sub_ss_round:
1217 case Intrinsic::x86_avx512_mask_add_sd_round:
1218 case Intrinsic::x86_avx512_mask_div_sd_round:
1219 case Intrinsic::x86_avx512_mask_mul_sd_round:
1220 case Intrinsic::x86_avx512_mask_sub_sd_round:
1221 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
1222 // IR operations.
1223 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
1224 if (R->getValue() == 4) {
1225 // Extract the element as scalars.
1226 Value *Arg0 = II.getArgOperand(0);
1227 Value *Arg1 = II.getArgOperand(1);
1228 Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
1229 Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
1230
1231 Value *V;
1232 switch (IID) {
1233 default:
1234 llvm_unreachable("Case stmts out of sync!");
1235 case Intrinsic::x86_avx512_mask_add_ss_round:
1236 case Intrinsic::x86_avx512_mask_add_sd_round:
1237 V = IC.Builder.CreateFAdd(LHS, RHS);
1238 break;
1239 case Intrinsic::x86_avx512_mask_sub_ss_round:
1240 case Intrinsic::x86_avx512_mask_sub_sd_round:
1241 V = IC.Builder.CreateFSub(LHS, RHS);
1242 break;
1243 case Intrinsic::x86_avx512_mask_mul_ss_round:
1244 case Intrinsic::x86_avx512_mask_mul_sd_round:
1245 V = IC.Builder.CreateFMul(LHS, RHS);
1246 break;
1247 case Intrinsic::x86_avx512_mask_div_ss_round:
1248 case Intrinsic::x86_avx512_mask_div_sd_round:
1249 V = IC.Builder.CreateFDiv(LHS, RHS);
1250 break;
1251 }
1252
1253 // Handle the masking aspect of the intrinsic.
1254 Value *Mask = II.getArgOperand(3);
1255 auto *C = dyn_cast<ConstantInt>(Mask);
1256 // We don't need a select if we know the mask bit is a 1.
1257 if (!C || !C->getValue()[0]) {
1258 // Cast the mask to an i1 vector and then extract the lowest element.
1259 auto *MaskTy = FixedVectorType::get(
1260 IC.Builder.getInt1Ty(),
1261 cast<IntegerType>(Mask->getType())->getBitWidth());
1262 Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
1263 Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
1264 // Extract the lowest element from the passthru operand.
1265 Value *Passthru =
1266 IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
1267 V = IC.Builder.CreateSelect(Mask, V, Passthru);
1268 }
1269
1270 // Insert the result back into the original argument 0.
1271 V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
1272
1273 return IC.replaceInstUsesWith(II, V);
1274 }
1275 }
1276 break;
1277
1278 // Constant fold ashr( <A x Bi>, Ci ).
1279 // Constant fold lshr( <A x Bi>, Ci ).
1280 // Constant fold shl( <A x Bi>, Ci ).
1281 case Intrinsic::x86_sse2_psrai_d:
1282 case Intrinsic::x86_sse2_psrai_w:
1283 case Intrinsic::x86_avx2_psrai_d:
1284 case Intrinsic::x86_avx2_psrai_w:
1285 case Intrinsic::x86_avx512_psrai_q_128:
1286 case Intrinsic::x86_avx512_psrai_q_256:
1287 case Intrinsic::x86_avx512_psrai_d_512:
1288 case Intrinsic::x86_avx512_psrai_q_512:
1289 case Intrinsic::x86_avx512_psrai_w_512:
1290 case Intrinsic::x86_sse2_psrli_d:
1291 case Intrinsic::x86_sse2_psrli_q:
1292 case Intrinsic::x86_sse2_psrli_w:
1293 case Intrinsic::x86_avx2_psrli_d:
1294 case Intrinsic::x86_avx2_psrli_q:
1295 case Intrinsic::x86_avx2_psrli_w:
1296 case Intrinsic::x86_avx512_psrli_d_512:
1297 case Intrinsic::x86_avx512_psrli_q_512:
1298 case Intrinsic::x86_avx512_psrli_w_512:
1299 case Intrinsic::x86_sse2_pslli_d:
1300 case Intrinsic::x86_sse2_pslli_q:
1301 case Intrinsic::x86_sse2_pslli_w:
1302 case Intrinsic::x86_avx2_pslli_d:
1303 case Intrinsic::x86_avx2_pslli_q:
1304 case Intrinsic::x86_avx2_pslli_w:
1305 case Intrinsic::x86_avx512_pslli_d_512:
1306 case Intrinsic::x86_avx512_pslli_q_512:
1307 case Intrinsic::x86_avx512_pslli_w_512:
1308 if (Value *V = simplifyX86immShift(II, IC.Builder)) {
1309 return IC.replaceInstUsesWith(II, V);
1310 }
1311 break;
1312
1313 case Intrinsic::x86_sse2_psra_d:
1314 case Intrinsic::x86_sse2_psra_w:
1315 case Intrinsic::x86_avx2_psra_d:
1316 case Intrinsic::x86_avx2_psra_w:
1317 case Intrinsic::x86_avx512_psra_q_128:
1318 case Intrinsic::x86_avx512_psra_q_256:
1319 case Intrinsic::x86_avx512_psra_d_512:
1320 case Intrinsic::x86_avx512_psra_q_512:
1321 case Intrinsic::x86_avx512_psra_w_512:
1322 case Intrinsic::x86_sse2_psrl_d:
1323 case Intrinsic::x86_sse2_psrl_q:
1324 case Intrinsic::x86_sse2_psrl_w:
1325 case Intrinsic::x86_avx2_psrl_d:
1326 case Intrinsic::x86_avx2_psrl_q:
1327 case Intrinsic::x86_avx2_psrl_w:
1328 case Intrinsic::x86_avx512_psrl_d_512:
1329 case Intrinsic::x86_avx512_psrl_q_512:
1330 case Intrinsic::x86_avx512_psrl_w_512:
1331 case Intrinsic::x86_sse2_psll_d:
1332 case Intrinsic::x86_sse2_psll_q:
1333 case Intrinsic::x86_sse2_psll_w:
1334 case Intrinsic::x86_avx2_psll_d:
1335 case Intrinsic::x86_avx2_psll_q:
1336 case Intrinsic::x86_avx2_psll_w:
1337 case Intrinsic::x86_avx512_psll_d_512:
1338 case Intrinsic::x86_avx512_psll_q_512:
1339 case Intrinsic::x86_avx512_psll_w_512: {
1340 if (Value *V = simplifyX86immShift(II, IC.Builder)) {
1341 return IC.replaceInstUsesWith(II, V);
1342 }
1343
1344 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
1345 // operand to compute the shift amount.
1346 Value *Arg1 = II.getArgOperand(1);
1347 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
1348 "Unexpected packed shift size");
1349 unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
1350
1351 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
1352 return IC.replaceOperand(II, 1, V);
1353 }
1354 break;
1355 }
1356
1357 case Intrinsic::x86_avx2_psllv_d:
1358 case Intrinsic::x86_avx2_psllv_d_256:
1359 case Intrinsic::x86_avx2_psllv_q:
1360 case Intrinsic::x86_avx2_psllv_q_256:
1361 case Intrinsic::x86_avx512_psllv_d_512:
1362 case Intrinsic::x86_avx512_psllv_q_512:
1363 case Intrinsic::x86_avx512_psllv_w_128:
1364 case Intrinsic::x86_avx512_psllv_w_256:
1365 case Intrinsic::x86_avx512_psllv_w_512:
1366 case Intrinsic::x86_avx2_psrav_d:
1367 case Intrinsic::x86_avx2_psrav_d_256:
1368 case Intrinsic::x86_avx512_psrav_q_128:
1369 case Intrinsic::x86_avx512_psrav_q_256:
1370 case Intrinsic::x86_avx512_psrav_d_512:
1371 case Intrinsic::x86_avx512_psrav_q_512:
1372 case Intrinsic::x86_avx512_psrav_w_128:
1373 case Intrinsic::x86_avx512_psrav_w_256:
1374 case Intrinsic::x86_avx512_psrav_w_512:
1375 case Intrinsic::x86_avx2_psrlv_d:
1376 case Intrinsic::x86_avx2_psrlv_d_256:
1377 case Intrinsic::x86_avx2_psrlv_q:
1378 case Intrinsic::x86_avx2_psrlv_q_256:
1379 case Intrinsic::x86_avx512_psrlv_d_512:
1380 case Intrinsic::x86_avx512_psrlv_q_512:
1381 case Intrinsic::x86_avx512_psrlv_w_128:
1382 case Intrinsic::x86_avx512_psrlv_w_256:
1383 case Intrinsic::x86_avx512_psrlv_w_512:
1384 if (Value *V = simplifyX86varShift(II, IC.Builder)) {
1385 return IC.replaceInstUsesWith(II, V);
1386 }
1387 break;
1388
1389 case Intrinsic::x86_sse2_packssdw_128:
1390 case Intrinsic::x86_sse2_packsswb_128:
1391 case Intrinsic::x86_avx2_packssdw:
1392 case Intrinsic::x86_avx2_packsswb:
1393 case Intrinsic::x86_avx512_packssdw_512:
1394 case Intrinsic::x86_avx512_packsswb_512:
1395 if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
1396 return IC.replaceInstUsesWith(II, V);
1397 }
1398 break;
1399
1400 case Intrinsic::x86_sse2_packuswb_128:
1401 case Intrinsic::x86_sse41_packusdw:
1402 case Intrinsic::x86_avx2_packusdw:
1403 case Intrinsic::x86_avx2_packuswb:
1404 case Intrinsic::x86_avx512_packusdw_512:
1405 case Intrinsic::x86_avx512_packuswb_512:
1406 if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
1407 return IC.replaceInstUsesWith(II, V);
1408 }
1409 break;
1410
1411 case Intrinsic::x86_pclmulqdq:
1412 case Intrinsic::x86_pclmulqdq_256:
1413 case Intrinsic::x86_pclmulqdq_512: {
1414 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
1415 unsigned Imm = C->getZExtValue();
1416
1417 bool MadeChange = false;
1418 Value *Arg0 = II.getArgOperand(0);
1419 Value *Arg1 = II.getArgOperand(1);
1420 unsigned VWidth =
1421 cast<FixedVectorType>(Arg0->getType())->getNumElements();
1422
1423 APInt UndefElts1(VWidth, 0);
1424 APInt DemandedElts1 =
1425 APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
1426 if (Value *V =
1427 IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
1428 IC.replaceOperand(II, 0, V);
1429 MadeChange = true;
1430 }
1431
1432 APInt UndefElts2(VWidth, 0);
1433 APInt DemandedElts2 =
1434 APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
1435 if (Value *V =
1436 IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
1437 IC.replaceOperand(II, 1, V);
1438 MadeChange = true;
1439 }
1440
1441 // If either input elements are undef, the result is zero.
      if (DemandedElts1.isSubsetOf(UndefElts1) ||
          DemandedElts2.isSubsetOf(UndefElts2)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantAggregateZero::get(II.getType()));
      }

      if (MadeChange) {
        return &II;
      }
    }
    break;
  }

  case Intrinsic::x86_sse41_insertps:
    if (Value *V = simplifyX86insertps(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse4a_extrq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 16 && "Unexpected operand sizes");

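    // The second operand of EXTRQ packs the bit-field length in the low 6 bits
    // of byte 0 and the bit-field index in the low 6 bits of byte 1, which is
    // why only those two <16 x i8> elements are inspected below.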
    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CILength =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
           : nullptr;
    auto *CIIndex =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
    // operand and the lowest 16-bits of the second.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_sse4a_extrqi: {
    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
    // bits of the lower 64-bits. The upper 64-bits are undefined.
    Value *Op0 = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));

    // Attempt to simplify to a constant or shuffle vector.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
           "Unexpected operand size");

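    // For INSERTQ, the upper 64 bits of the second operand carry the insert
    // parameters: the field length lives in bits [69:64] and the bit index in
    // bits [77:72], i.e. in the low bits of element 1 of the <2 x i64> vector.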
    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CI11 =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
    if (CI11) {
      const APInt &V11 = CI11->getValue();
      APInt Len = V11.zextOrTrunc(6);
      APInt Idx = V11.lshr(8).zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertqi: {
    // INSERTQI: Extract lowest Length bits from lower half of second source and
    // insert over first source starting at Index bit. The upper 64-bits are
    // undefined.
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 2 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));

    // Attempt to simplify to a constant or shuffle vector.
    if (CILength && CIIndex) {
      APInt Len = CILength->getValue().zextOrTrunc(6);
      APInt Idx = CIIndex->getValue().zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
    // operands.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

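  // BLENDV picks each result lane from the second source when the sign bit
  // (most significant bit) of the corresponding mask lane is set and from the
  // first source otherwise, so it can be rewritten as a vector select on the
  // mask's sign bits.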
  case Intrinsic::x86_sse41_pblendvb:
  case Intrinsic::x86_sse41_blendvps:
  case Intrinsic::x86_sse41_blendvpd:
  case Intrinsic::x86_avx_blendv_ps_256:
  case Intrinsic::x86_avx_blendv_pd_256:
  case Intrinsic::x86_avx2_pblendvb: {
    // fold (blend A, A, Mask) -> A
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Mask = II.getArgOperand(2);
    if (Op0 == Op1) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Zero Mask - select 1st argument.
    if (isa<ConstantAggregateZero>(Mask)) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
      Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
    }

    // Convert to a vector select if we can bypass casts and find a boolean
    // vector condition value.
    Value *BoolVec;
    Mask = InstCombiner::peekThroughBitcast(Mask);
    if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
        BoolVec->getType()->isVectorTy() &&
        BoolVec->getType()->getScalarSizeInBits() == 1) {
      assert(Mask->getType()->getPrimitiveSizeInBits() ==
                 II.getType()->getPrimitiveSizeInBits() &&
             "Not expecting mask and operands with different sizes");

      unsigned NumMaskElts =
          cast<FixedVectorType>(Mask->getType())->getNumElements();
      unsigned NumOperandElts =
          cast<FixedVectorType>(II.getType())->getNumElements();
      if (NumMaskElts == NumOperandElts) {
        return SelectInst::Create(BoolVec, Op1, Op0);
      }

      // If the mask has fewer elements than the operands, each mask bit maps to
      // multiple elements of the operands. Bitcast back and forth.
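      // For example, a <4 x i1> condition that was sign-extended to <4 x i64>
      // and bitcast to <32 x i8> for pblendvb controls 8 bytes per mask bit:
      // select in the <4 x i64> domain and bitcast the result back to the
      // intrinsic's type.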
      if (NumMaskElts < NumOperandElts) {
        Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
        Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
        Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
        return new BitCastInst(Sel, II.getType());
      }
    }

    break;
  }

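  // PSHUFB shuffles bytes within each 128-bit lane: the low four bits of each
  // mask byte select the source byte and a set sign bit zeroes the destination
  // byte, so constant masks can be folded by simplifyX86pshufb.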
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
    if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

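  // VPERMILVAR permutes elements within each 128-bit lane under control of a
  // variable index vector; constant index vectors can likewise be folded.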
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
    if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

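  // Unlike VPERMILVAR, the PERMD/PERMPS/PERMVAR forms index across the whole
  // vector rather than within 128-bit lanes.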
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
  case Intrinsic::x86_avx512_permvar_df_256:
  case Intrinsic::x86_avx512_permvar_df_512:
  case Intrinsic::x86_avx512_permvar_di_256:
  case Intrinsic::x86_avx512_permvar_di_512:
  case Intrinsic::x86_avx512_permvar_hi_128:
  case Intrinsic::x86_avx512_permvar_hi_256:
  case Intrinsic::x86_avx512_permvar_hi_512:
  case Intrinsic::x86_avx512_permvar_qi_128:
  case Intrinsic::x86_avx512_permvar_qi_256:
  case Intrinsic::x86_avx512_permvar_qi_512:
  case Intrinsic::x86_avx512_permvar_sf_512:
  case Intrinsic::x86_avx512_permvar_si_512:
    if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_maskload_ps:
  case Intrinsic::x86_avx_maskload_pd:
  case Intrinsic::x86_avx_maskload_ps_256:
  case Intrinsic::x86_avx_maskload_pd_256:
  case Intrinsic::x86_avx2_maskload_d:
  case Intrinsic::x86_avx2_maskload_q:
  case Intrinsic::x86_avx2_maskload_d_256:
  case Intrinsic::x86_avx2_maskload_q_256:
    if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
      return I;
    }
    break;

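  // Note: a masked store has no uses to replace; on success
  // simplifyX86MaskedStore is expected to rewrite and erase the original
  // intrinsic itself, which is why nullptr is returned here.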
  case Intrinsic::x86_sse2_maskmov_dqu:
  case Intrinsic::x86_avx_maskstore_ps:
  case Intrinsic::x86_avx_maskstore_pd:
  case Intrinsic::x86_avx_maskstore_ps_256:
  case Intrinsic::x86_avx_maskstore_pd_256:
  case Intrinsic::x86_avx2_maskstore_d:
  case Intrinsic::x86_avx2_maskstore_q:
  case Intrinsic::x86_avx2_maskstore_d_256:
  case Intrinsic::x86_avx2_maskstore_q_256:
    if (simplifyX86MaskedStore(II, IC)) {
      return nullptr;
    }
    break;

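  // Note: simplifyX86addcarry handles the common case where the incoming
  // carry is known to be zero, reducing the operation to a generic unsigned
  // add-with-overflow.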
  case Intrinsic::x86_addcarry_32:
  case Intrinsic::x86_addcarry_64:
    if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  default:
    break;
  }
  return None;
}

Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
    bool &KnownBitsComputed) const {
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx2_pmovmskb: {
    // MOVMSK copies the vector elements' sign bits to the low bits
    // and zeros the high bits.
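    // E.g. for a <4 x float> argument, the result's bits [3:0] hold the four
    // sign bits and bits [31:4] are known to be zero.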
    unsigned ArgWidth;
    if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
      ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
    } else {
      auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
      ArgWidth = ArgType->getNumElements();
    }

    // If we don't need any of the low bits then return zero; we already know
    // that DemandedMask is non-zero, so the demanded bits lie entirely in the
    // high part that MOVMSK zeroes.
    APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
    Type *VTy = II.getType();
    if (DemandedElts.isZero()) {
      return ConstantInt::getNullValue(VTy);
    }

    // We know that the upper bits are set to zero.
    Known.Zero.setBitsFrom(ArgWidth);
    KnownBitsComputed = true;
    break;
  }
  }
  return None;
}

Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        simplifyAndSetOp) const {
  unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_xop_vfrcz_ss:
  case Intrinsic::x86_xop_vfrcz_sd:
    // The instructions for these intrinsics are specified to zero the upper
    // bits rather than pass them through like other scalar intrinsics. So we
    // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for other
    // intrinsics. Instead we should return a zero vector.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return ConstantAggregateZero::get(II.getType());
    }

    // Only the lower element is used.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // Only the lower element is undefined. The high elements are zero.
    UndefElts = UndefElts[0];
    break;

  // Unary scalar-as-vector operations that work column-wise.
  case Intrinsic::x86_sse_rcp_ss:
  case Intrinsic::x86_sse_rsqrt_ss:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }
    // TODO: If only the low element is demanded, lower SQRT to FSQRT (with
    // rounding/exception checks).
    break;

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0. The low element is a function of both
  // operands.
  case Intrinsic::x86_sse_min_ss:
  case Intrinsic::x86_sse_max_ss:
  case Intrinsic::x86_sse_cmp_ss:
  case Intrinsic::x86_sse2_min_sd:
  case Intrinsic::x86_sse2_max_sd:
  case Intrinsic::x86_sse2_cmp_sd: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // Lower element is undefined if both lower elements are undefined.
    // Consider things like undef&0. The result is known zero, not undef.
    if (!UndefElts2[0])
      UndefElts.clearBit(0);

    break;
  }

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element comes from operand 1.
  case Intrinsic::x86_sse41_round_ss:
  case Intrinsic::x86_sse41_round_sd: {
    // Don't use the low element of operand 0.
    APInt DemandedElts2 = DemandedElts;
    DemandedElts2.clearBit(0);
    simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // Take the high undef elements from operand 0 and take the lower element
    // from operand 1.
    UndefElts.clearBit(0);
    UndefElts |= UndefElts2[0];
    break;
  }

  // Three input scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element is a function of all
  // three inputs.
  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_max_ss_round:
  case Intrinsic::x86_avx512_mask_min_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
  case Intrinsic::x86_avx512_mask_max_sd_round:
  case Intrinsic::x86_avx512_mask_min_sd_round:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only the lower element is used for operands 1 and 2.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);

    // Lower element is undefined if all three lower elements are undefined.
    // Consider things like undef&0. The result is known zero, not undef.
    if (!UndefElts2[0] || !UndefElts3[0])
      UndefElts.clearBit(0);
    break;

  // TODO: Add fmaddsub support?
  case Intrinsic::x86_sse3_addsub_pd:
  case Intrinsic::x86_sse3_addsub_ps:
  case Intrinsic::x86_avx_addsub_pd_256:
  case Intrinsic::x86_avx_addsub_ps_256: {
    // If none of the even or none of the odd lanes are required, turn this
    // into a generic FP math instruction.
    APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
    APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
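    // ADDSUB subtracts in the even lanes and adds in the odd lanes, so e.g.
    // for a v4f32 operation SubMask is 0b0101 and AddMask is 0b1010.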
    bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
    bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
    if (IsSubOnly || IsAddOnly) {
      assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
      IRBuilderBase::InsertPointGuard Guard(IC.Builder);
      IC.Builder.SetInsertPoint(&II);
      Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
      return IC.Builder.CreateBinOp(
          IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
    }

    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }

  // General per-element vector operations.
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512: {
    auto *Ty0 = II.getArgOperand(0)->getType();
    unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
    assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");

    unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
    unsigned VWidthPerLane = VWidth / NumLanes;
    unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;

    // Per lane, pack the elements of the first input and then the second.
    // e.g.
    // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
    // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
    for (int OpNum = 0; OpNum != 2; ++OpNum) {
      APInt OpDemandedElts(InnerVWidth, 0);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        unsigned LaneIdx = Lane * VWidthPerLane;
        for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
          unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
          if (DemandedElts[Idx])
            OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
        }
      }

      // Demand elements from the operand.
      APInt OpUndefElts(InnerVWidth, 0);
      simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);

      // Pack the operand's UNDEF elements, one lane at a time.
      OpUndefElts = OpUndefElts.zext(VWidth);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
        LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
        LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
        UndefElts |= LaneElts;
      }
    }
    break;
  }

  // PSHUFB
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
  // PERMILVAR
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
  // PERMV
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps: {
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
    break;
  }

  // SSE4A instructions leave the upper 64-bits of the 128-bit result
  // in an undefined state.
  case Intrinsic::x86_sse4a_extrq:
  case Intrinsic::x86_sse4a_extrqi:
  case Intrinsic::x86_sse4a_insertq:
  case Intrinsic::x86_sse4a_insertqi:
    UndefElts.setHighBits(VWidth / 2);
    break;
  }
  return None;
}