//===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file implements the LegalizerHelper class to legalize
/// individual instructions and the LegalizeMachineIR wrapper pass for the
/// primary legalization.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "legalizer"

using namespace llvm;
using namespace LegalizeActions;
using namespace MIPatternMatch;

/// Try to break down \p OrigTy into \p NarrowTy sized pieces.
///
/// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
/// with any leftover piece as type \p LeftoverTy.
///
/// Returns -1 in the first element of the pair if the breakdown is not
/// satisfiable.
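/// For example, breaking an s88 down into s32 pieces returns {2, 1}: two s32
/// parts plus one leftover piece of type s24.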
static std::pair<int, int>
getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned Size = OrigTy.getSizeInBits();
  unsigned NarrowSize = NarrowTy.getSizeInBits();
  unsigned NumParts = Size / NarrowSize;
  unsigned LeftoverSize = Size - NumParts * NarrowSize;
  assert(Size > NarrowSize);

  if (LeftoverSize == 0)
    return {NumParts, 0};

  if (NarrowTy.isVector()) {
    unsigned EltSize = OrigTy.getScalarSizeInBits();
    if (LeftoverSize % EltSize != 0)
      return {-1, -1};
    LeftoverTy = LLT::scalarOrVector(
        ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
  } else {
    LeftoverTy = LLT::scalar(LeftoverSize);
  }

  int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
  return std::make_pair(NumParts, NumLeftover);
}

static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
  if (!Ty.isScalar())
    return nullptr;

  switch (Ty.getSizeInBits()) {
  case 16:
    return Type::getHalfTy(Ctx);
  case 32:
    return Type::getFloatTy(Ctx);
  case 64:
    return Type::getDoubleTy(Ctx);
  case 80:
    return Type::getX86_FP80Ty(Ctx);
  case 128:
    return Type::getFP128Ty(Ctx);
  default:
    return nullptr;
  }
}

LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()) { }

LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B)
    : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
      TLI(*MF.getSubtarget().getTargetLowering()) { }

LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "Legalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
      MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
  default:
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}

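/// Split \p Reg into \p NumParts pieces of type \p Ty using a single
/// G_UNMERGE_VALUES, appending the new registers to \p VRegs. \p Ty is
/// expected to evenly cover the type of \p Reg.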
void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
                                   SmallVectorImpl<Register> &VRegs) {
  for (int i = 0; i < NumParts; ++i)
    VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
  MIRBuilder.buildUnmerge(VRegs, Reg);
}

bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
                                   LLT MainTy, LLT &LeftoverTy,
                                   SmallVectorImpl<Register> &VRegs,
                                   SmallVectorImpl<Register> &LeftoverRegs) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned RegSize = RegTy.getSizeInBits();
  unsigned MainSize = MainTy.getSizeInBits();
  unsigned NumParts = RegSize / MainSize;
  unsigned LeftoverSize = RegSize - NumParts * MainSize;

  // Use an unmerge when possible.
  if (LeftoverSize == 0) {
    for (unsigned I = 0; I < NumParts; ++I)
      VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
    MIRBuilder.buildUnmerge(VRegs, Reg);
    return true;
  }

  // Perform irregular split. Leftover is last element of RegPieces.
  if (MainTy.isVector()) {
    SmallVector<Register, 8> RegPieces;
    extractVectorParts(Reg, MainTy.getNumElements(), RegPieces);
    for (unsigned i = 0; i < RegPieces.size() - 1; ++i)
      VRegs.push_back(RegPieces[i]);
    LeftoverRegs.push_back(RegPieces[RegPieces.size() - 1]);
    LeftoverTy = MRI.getType(LeftoverRegs[0]);
    return true;
  }

  LeftoverTy = LLT::scalar(LeftoverSize);
  // For irregular sizes, extract the individual parts.
  for (unsigned I = 0; I != NumParts; ++I) {
    Register NewReg = MRI.createGenericVirtualRegister(MainTy);
    VRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
  }

  for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
       Offset += LeftoverSize) {
    Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
    LeftoverRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, Offset);
  }

  return true;
}

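/// Unmerge the vector \p Reg into pieces of \p NumElts elements each. If
/// \p NumElts does not evenly divide the source element count, the final
/// piece holds the remaining element(s) as a narrower vector or a scalar.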
void LegalizerHelper::extractVectorParts(Register Reg, unsigned NumElts,
                                         SmallVectorImpl<Register> &VRegs) {
  LLT RegTy = MRI.getType(Reg);
  assert(RegTy.isVector() && "Expected a vector type");

  LLT EltTy = RegTy.getElementType();
  LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
  unsigned RegNumElts = RegTy.getNumElements();
  unsigned LeftoverNumElts = RegNumElts % NumElts;
  unsigned NumNarrowTyPieces = RegNumElts / NumElts;

  // Perfect split without leftover
  if (LeftoverNumElts == 0)
    return extractParts(Reg, NarrowTy, NumNarrowTyPieces, VRegs);

  // Irregular split. Unmerge to individual elements first, so the artifact
  // combiner has direct access to them, then rebuild sub-vectors with NumElts
  // elements each. The remaining element(s) become the leftover piece.
  SmallVector<Register, 8> Elts;
  extractParts(Reg, EltTy, RegNumElts, Elts);

  unsigned Offset = 0;
  // Requested sub-vectors of NarrowTy.
  for (unsigned i = 0; i < NumNarrowTyPieces; ++i, Offset += NumElts) {
    ArrayRef<Register> Pieces(&Elts[Offset], NumElts);
    VRegs.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0));
  }

  // Leftover element(s).
  if (LeftoverNumElts == 1) {
    VRegs.push_back(Elts[Offset]);
  } else {
    LLT LeftoverTy = LLT::fixed_vector(LeftoverNumElts, EltTy);
    ArrayRef<Register> Pieces(&Elts[Offset], LeftoverNumElts);
    VRegs.push_back(MIRBuilder.buildMerge(LeftoverTy, Pieces).getReg(0));
  }
}

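/// Recombine \p PartRegs, together with any \p LeftoverRegs of type
/// \p LeftoverTy, into \p DstReg. This is the inverse of the extractParts
/// split above.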
void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    if (!ResultTy.isVector()) {
      MIRBuilder.buildMerge(DstReg, PartRegs);
      return;
    }

    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(DstReg, PartRegs);
    else
      MIRBuilder.buildBuildVector(DstReg, PartRegs);
    return;
  }

  // Merge sub-vectors with different number of elements and insert into DstReg.
  if (ResultTy.isVector()) {
    assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
    SmallVector<Register, 8> AllRegs;
    for (auto Reg : concat<const Register>(PartRegs, LeftoverRegs))
      AllRegs.push_back(Reg);
    return mergeMixedSubvectors(DstReg, AllRegs);
  }

  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
  for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
    extractGCDType(GCDRegs, GCDTy, PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
  buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
}

void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
                                       Register Reg) {
  LLT Ty = MRI.getType(Reg);
  SmallVector<Register, 8> RegElts;
  extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts);
  Elts.append(RegElts);
}

/// Merge \p PartRegs with different types into \p DstReg.
void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
                                           ArrayRef<Register> PartRegs) {
  SmallVector<Register, 8> AllElts;
  for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
    appendVectorElts(AllElts, PartRegs[i]);

  Register Leftover = PartRegs[PartRegs.size() - 1];
  if (MRI.getType(Leftover).isScalar())
    AllElts.push_back(Leftover);
  else
    appendVectorElts(AllElts, Leftover);

  MIRBuilder.buildMerge(DstReg, AllElts);
}

/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
                              const MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);

  const int StartIdx = Regs.size();
  const int NumResults = MI.getNumOperands() - 1;
  Regs.resize(Regs.size() + NumResults);
  for (int I = 0; I != NumResults; ++I)
    Regs[StartIdx + I] = MI.getOperand(I).getReg();
}

void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
                                     LLT GCDTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy == GCDTy) {
    // If the source already evenly divides the result type, we don't need to
    // do anything.
    Parts.push_back(SrcReg);
  } else {
    // Need to split into common type sized pieces.
    auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
    getUnmergeResults(Parts, *Unmerge);
  }
}

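/// Split \p SrcReg into pieces of the greatest common divisor type of
/// \p DstTy, \p NarrowTy and the type of \p SrcReg, appending them to
/// \p Parts, and return that GCD type.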
LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
                                    LLT NarrowTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
  extractGCDType(Parts, GCDTy, SrcReg);
  return GCDTy;
}

LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();
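  // As an illustrative (hypothetical) sizing: with DstTy = s64, NarrowTy = s48
  // and GCDTy = s16, LCMTy = s192, NumParts = 4 and NumSubParts = 3, so the
  // s16 pieces are merged three at a time into four s48 values.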

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
          MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can reuse
  // the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      if (Idx >= NumOrigSrc) {
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMerge(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}

void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
                                               ArrayRef<Register> RemergeRegs) {
  LLT DstTy = MRI.getType(DstReg);

  // Create the merge to the widened source, and extract the relevant bits into
  // the result.

  if (DstTy == LCMTy) {
    MIRBuilder.buildMerge(DstReg, RemergeRegs);
    return;
  }

  auto Remerge = MIRBuilder.buildMerge(LCMTy, RemergeRegs);
  if (DstTy.isScalar() && LCMTy.isScalar()) {
    MIRBuilder.buildTrunc(DstReg, Remerge);
    return;
  }

  if (LCMTy.isVector()) {
    unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
    SmallVector<Register, 8> UnmergeDefs(NumDefs);
    UnmergeDefs[0] = DstReg;
    for (unsigned I = 1; I != NumDefs; ++I)
      UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);

    MIRBuilder.buildUnmerge(UnmergeDefs,
                            MIRBuilder.buildMerge(LCMTy, RemergeRegs));
    return;
  }

  llvm_unreachable("unhandled case");
}

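/// Map a generic opcode and scalar size to the corresponding runtime libcall;
/// e.g. G_FSIN on s64 maps to RTLIB::SIN_F64.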
static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  }
  llvm_unreachable("Unknown libcall function");
}

/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    switch (MI.getOpcode()) {
    default:
      llvm_unreachable("unsupported opcode");
    case TargetOpcode::G_BZERO:
      return false;
    case TargetOpcode::G_MEMCPY:
    case TargetOpcode::G_MEMMOVE:
    case TargetOpcode::G_MEMSET:
      break;
    }

    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    const CallingConv::ID CC) {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = Result;
  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args) {
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  const char *Name = TLI.getLibcallName(Libcall);
  const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
  return createLibcall(MIRBuilder, Name, Result, Args, CC);
}

// Useful for libcalls where all operands have the same type.
static LegalizerHelper::LegalizeResult
simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
              Type *OpType) {
  auto Libcall = getRTLibDesc(MI.getOpcode(), Size);

  // FIXME: What does the original arg index mean here?
  SmallVector<CallLowering::ArgInfo, 3> Args;
  for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
    Args.push_back({MO.getReg(), OpType, 0});
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), OpType, 0}, Args);
}

LegalizerHelper::LegalizeResult
llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                       MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() &&
                    isLibCallInTailPosition(MI, MIRBuilder.getTII(), MRI);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}

static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
                                       Type *FromType) {
  auto ToMVT = MVT::getVT(ToType);
  auto FromMVT = MVT::getVT(FromType);

  switch (Opcode) {
  case TargetOpcode::G_FPEXT:
    return RTLIB::getFPEXT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTRUNC:
    return RTLIB::getFPROUND(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOSI:
    return RTLIB::getFPTOSINT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOUI:
    return RTLIB::getFPTOUINT(FromMVT, ToMVT);
  case TargetOpcode::G_SITOFP:
    return RTLIB::getSINTTOFP(FromMVT, ToMVT);
  case TargetOpcode::G_UITOFP:
    return RTLIB::getUINTTOFP(FromMVT, ToMVT);
  }
  llvm_unreachable("Unsupported libcall function");
}

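/// Emit a libcall converting \p MI's single source operand from \p FromType
/// to \p ToType, with the result written to \p MI's destination register.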
static LegalizerHelper::LegalizeResult
conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
                  Type *FromType) {
  RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), ToType, 0},
                       {{MI.getOperand(1).getReg(), FromType, 0}});
}

LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
  unsigned Size = LLTy.getSizeInBits();
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
    Type *ToTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
        FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
        FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
                                                              unsigned TypeIdx,
                                                              LLT NarrowTy) {
  uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_IMPLICIT_DEF: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // If SizeOp0 is not an exact multiple of NarrowSize, emit
    // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
    // FIXME: Although this would also be legal for the general case, it causes
    // a lot of regressions in the emitted code (superfluous COPYs, artifact
    // combines not being hit). This seems to be a problem related to the
    // artifact combiner.
    if (SizeOp0 % NarrowSize != 0) {
      LLT ImplicitTy = NarrowTy;
      if (DstTy.isVector())
        ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);

      Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
      MIRBuilder.buildAnyExt(DstReg, ImplicitReg);

      MI.eraseFromParent();
      return Legalized;
    }

    int NumParts = SizeOp0 / NarrowSize;

    SmallVector<Register, 2> DstRegs;
    for (int i = 0; i < NumParts; ++i)
      DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));

    if (DstTy.isVector())
      MIRBuilder.buildBuildVector(DstReg, DstRegs);
    else
      MIRBuilder.buildMerge(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CONSTANT: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    const APInt &Val = MI.getOperand(1).getCImm()->getValue();
    unsigned TotalSize = Ty.getSizeInBits();
    unsigned NarrowSize = NarrowTy.getSizeInBits();
    int NumParts = TotalSize / NarrowSize;
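    // e.g. Narrowing an s88 constant with NarrowTy = s32 produces two s32
    // pieces plus an s24 leftover piece, which insertParts recombines below.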

    SmallVector<Register, 4> PartRegs;
    for (int I = 0; I != NumParts; ++I) {
      unsigned Offset = I * NarrowSize;
      auto K = MIRBuilder.buildConstant(NarrowTy,
                                        Val.lshr(Offset).trunc(NarrowSize));
      PartRegs.push_back(K.getReg(0));
    }

    LLT LeftoverTy;
    unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
    SmallVector<Register, 1> LeftoverRegs;
    if (LeftoverBits != 0) {
      LeftoverTy = LLT::scalar(LeftoverBits);
      auto K = MIRBuilder.buildConstant(
          LeftoverTy,
          Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
      LeftoverRegs.push_back(K.getReg(0));
    }

    insertParts(MI.getOperand(0).getReg(),
                Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    return narrowScalarExt(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_TRUNC: {
    if (TypeIdx != 1)
      return UnableToLegalize;

    uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
      LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
      return UnableToLegalize;
    }

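    // e.g. With NarrowTy = s32 (register names illustrative):
    //   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %src:_(s64)
    //   %dst:_(s32) = COPY %lo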
    auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
    MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
    MI.eraseFromParent();
    return Legalized;
  }

  case TargetOpcode::G_FREEZE: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    // Should widen scalar first
    if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
      return UnableToLegalize;

    auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
    SmallVector<Register, 8> Parts;
    for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
      Parts.push_back(
          MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0));
    }

    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), Parts);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_UMULH:
    return narrowScalarMul(MI, NarrowTy);
  case TargetOpcode::G_EXTRACT:
    return narrowScalarExtract(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_INSERT:
    return narrowScalarInsert(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_LOAD: {
    auto &LoadMI = cast<GLoad>(MI);
    Register DstReg = LoadMI.getDstReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isVector())
      return UnableToLegalize;

    if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
      MIRBuilder.buildAnyExt(DstReg, TmpReg);
      LoadMI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    auto &LoadMI = cast<GExtLoad>(MI);
    Register DstReg = LoadMI.getDstReg();
    Register PtrReg = LoadMI.getPointerReg();

    Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
    auto &MMO = LoadMI.getMMO();
    unsigned MemSize = MMO.getSizeInBits();

    if (MemSize == NarrowSize) {
      MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
    } else if (MemSize < NarrowSize) {
      MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
    } else if (MemSize > NarrowSize) {
      // FIXME: Need to split the load.
      return UnableToLegalize;
    }

    if (isa<GZExtLoad>(LoadMI))
      MIRBuilder.buildZExt(DstReg, TmpReg);
    else
      MIRBuilder.buildSExt(DstReg, TmpReg);

    LoadMI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_STORE: {
    auto &StoreMI = cast<GStore>(MI);

    Register SrcReg = StoreMI.getValueReg();
    LLT SrcTy = MRI.getType(SrcReg);
    if (SrcTy.isVector())
      return UnableToLegalize;

    int NumParts = SizeOp0 / NarrowSize;
    unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
    unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
    if (SrcTy.isVector() && LeftoverBits != 0)
      return UnableToLegalize;

    if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildTrunc(TmpReg, SrcReg);
      MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
      StoreMI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
  }
  case TargetOpcode::G_SELECT:
    return narrowScalarSelect(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    // Legalize bitwise operation:
    // A = BinOp<Ty> B, C
    // into:
    // B1, ..., BN = G_UNMERGE_VALUES B
    // C1, ..., CN = G_UNMERGE_VALUES C
    // A1 = BinOp<Ty/N> B1, C1
    // ...
    // AN = BinOp<Ty/N> BN, CN
    // A = G_MERGE_VALUES A1, ..., AN
    return narrowScalarBasic(MI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    return narrowScalarShift(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTPOP:
    if (TypeIdx == 1)
      switch (MI.getOpcode()) {
      case TargetOpcode::G_CTLZ:
      case TargetOpcode::G_CTLZ_ZERO_UNDEF:
        return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTTZ:
      case TargetOpcode::G_CTTZ_ZERO_UNDEF:
        return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTPOP:
        return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
      default:
        return UnableToLegalize;
      }

    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_INTTOPTR:
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRTOINT:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PHI: {
    // FIXME: add support for when SizeOp0 isn't an exact multiple of
    // NarrowSize.
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    unsigned NumParts = SizeOp0 / NarrowSize;
    SmallVector<Register, 2> DstRegs(NumParts);
    SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
    Observer.changingInstr(MI);
    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
      MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
      extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
                   SrcRegs[i / 2]);
    }
    MachineBasicBlock &MBB = *MI.getParent();
    MIRBuilder.setInsertPt(MBB, MI);
    for (unsigned i = 0; i < NumParts; ++i) {
      DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
      MachineInstrBuilder MIB =
          MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
      for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
        MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
    }
    MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
    MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
    Observer.changedInstr(MI);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT: {
    if (TypeIdx != 2)
      return UnableToLegalize;

    int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, OpIdx);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_ICMP: {
    Register LHS = MI.getOperand(2).getReg();
    LLT SrcTy = MRI.getType(LHS);
    uint64_t SrcSize = SrcTy.getSizeInBits();
    CmpInst::Predicate Pred =
        static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());

    // TODO: Handle the non-equality case for weird sizes.
    if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
      return UnableToLegalize;

    LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
    SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
    if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
                      LHSLeftoverRegs))
      return UnableToLegalize;

    LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
    SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
    if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
                      RHSPartRegs, RHSLeftoverRegs))
      return UnableToLegalize;

    // We now have the LHS and RHS of the compare split into narrow-type
    // registers, plus potentially some leftover type.
    Register Dst = MI.getOperand(0).getReg();
    LLT ResTy = MRI.getType(Dst);
    if (ICmpInst::isEquality(Pred)) {
      // For each part on the LHS and RHS, keep track of the result of XOR-ing
      // them together. For each equal part, the result should be all 0s. For
      // each non-equal part, we'll get at least one 1.
      auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
      SmallVector<Register, 4> Xors;
      for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
        Xors.push_back(Xor);
      }

      // Build a G_XOR for each leftover register. Each G_XOR must be widened
      // to the desired narrow type so that we can OR them together later.
      SmallVector<Register, 4> WidenedXors;
      for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
        LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
        buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
                            /* PadStrategy = */ TargetOpcode::G_ZEXT);
        Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
      }

      // Now, for each part we broke up, we know if they are equal/not equal
      // based off the G_XOR. We can OR these all together and compare against
      // 0 to get the result.
      assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
      auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
      for (unsigned I = 2, E = Xors.size(); I < E; ++I)
        Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
      MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
    } else {
      // TODO: Handle non-power-of-two types.
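      // Compare the high halves with the original predicate; when they are
      // equal, the result is decided by an unsigned compare of the low halves.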
      assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
      assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
      Register LHSL = LHSPartRegs[0];
      Register LHSH = LHSPartRegs[1];
      Register RHSL = RHSPartRegs[0];
      Register RHSH = RHSPartRegs[1];
      MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpHEQ =
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
          ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
      MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
    }
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT_INREG: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    int64_t SizeInBits = MI.getOperand(2).getImm();

    // So long as the new type has more bits than the bits we're extending we
    // don't need to break it apart.
    if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
      Observer.changingInstr(MI);
      // We don't lose any non-extension bits by truncating the src and
      // sign-extending the dst.
      MachineOperand &MO1 = MI.getOperand(1);
      auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
      MO1.setReg(TruncMIB.getReg(0));

      MachineOperand &MO2 = MI.getOperand(0);
      Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
      MIRBuilder.buildSExt(MO2, DstExt);
      MO2.setReg(DstExt);
      Observer.changedInstr(MI);
      return Legalized;
    }

    // Break it apart. Components below the extension point are unmodified. The
    // component containing the extension point becomes a narrower SEXT_INREG.
    // Components above it are ashr'd from the component containing the
    // extension point.
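    // e.g. (illustrative) narrowing %d:_(s128) = G_SEXT_INREG %s, 48 with
    // NarrowTy = s32: part 0 passes through unchanged, part 1 becomes a
    // G_SEXT_INREG from bit 16, and parts 2 and 3 both reuse the G_ASHR of
    // part 1 by 31.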
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;
    int NumParts = SizeOp0 / NarrowSize;

    // List the registers where the destination will be scattered.
    SmallVector<Register, 2> DstRegs;
    // List the registers where the source will be split.
    SmallVector<Register, 2> SrcRegs;

    // Create all the temporary registers.
    for (int i = 0; i < NumParts; ++i) {
      Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);

      SrcRegs.push_back(SrcReg);
    }

    // Explode the big arguments into smaller chunks.
    MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));

    Register AshrCstReg =
        MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
            .getReg(0);
    Register FullExtensionReg = 0;
    Register PartialExtensionReg = 0;

    // Do the operation on each small part.
    for (int i = 0; i < NumParts; ++i) {
      if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
        DstRegs.push_back(SrcRegs[i]);
      else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
        assert(PartialExtensionReg &&
               "Expected to visit partial extension before full");
        if (FullExtensionReg) {
          DstRegs.push_back(FullExtensionReg);
          continue;
        }
        DstRegs.push_back(
            MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
                .getReg(0));
        FullExtensionReg = DstRegs.back();
      } else {
        DstRegs.push_back(
            MIRBuilder
                .buildInstr(
                    TargetOpcode::G_SEXT_INREG, {NarrowTy},
                    {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
                .getReg(0));
        PartialExtensionReg = DstRegs.back();
      }
    }

    // Gather the destination registers into the final destination.
    Register DstReg = MI.getOperand(0).getReg();
    MIRBuilder.buildMerge(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_BSWAP:
  case TargetOpcode::G_BITREVERSE: {
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    SmallVector<Register, 2> SrcRegs, DstRegs;
    unsigned NumParts = SizeOp0 / NarrowSize;
    extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);

    for (unsigned i = 0; i < NumParts; ++i) {
      auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
                                           {SrcRegs[NumParts - 1 - i]});
      DstRegs.push_back(DstPart.getReg(0));
    }

    MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);

    Observer.changedInstr(MI);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_PTR_ADD:
  case TargetOpcode::G_PTRMASK: {
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, 2);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FPTOSI:
    return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_FPEXT:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }
}

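/// Coerce \p Val to an equivalently-sized scalar via G_PTRTOINT or G_BITCAST,
/// returning an invalid Register for pointers in non-integral address spaces.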
Register LegalizerHelper::coerceToScalar(Register Val) {
  LLT Ty = MRI.getType(Val);
  if (Ty.isScalar())
    return Val;

  const DataLayout &DL = MIRBuilder.getDataLayout();
  LLT NewTy = LLT::scalar(Ty.getSizeInBits());
  if (Ty.isPointer()) {
    if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
      return Register();
    return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
  }

  Register NewVal = Val;

  assert(Ty.isVector());
  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
  return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
}

void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
                                     unsigned OpIdx, unsigned ExtOpcode) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
  MO.setReg(ExtB.getReg(0));
}

void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
                                      unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
  MO.setReg(ExtB.getReg(0));
}

void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
                                     unsigned OpIdx, unsigned TruncOpcode) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  Register DstExt = MRI.createGenericVirtualRegister(WideTy);
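  // MI itself will define DstExt, so the conversion back to the original
  // type must be inserted after MI.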
1451 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1452 MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
1453 MO.setReg(DstExt);
1454 }
1455
narrowScalarDst(MachineInstr & MI,LLT NarrowTy,unsigned OpIdx,unsigned ExtOpcode)1456 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
1457 unsigned OpIdx, unsigned ExtOpcode) {
1458 MachineOperand &MO = MI.getOperand(OpIdx);
1459 Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
1460 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1461 MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
1462 MO.setReg(DstTrunc);
1463 }
1464
moreElementsVectorDst(MachineInstr & MI,LLT WideTy,unsigned OpIdx)1465 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
1466 unsigned OpIdx) {
1467 MachineOperand &MO = MI.getOperand(OpIdx);
1468 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1469 Register Dst = MO.getReg();
1470 Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1471 MO.setReg(DstExt);
1472 MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt);
1473 }
1474
moreElementsVectorSrc(MachineInstr & MI,LLT MoreTy,unsigned OpIdx)1475 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
1476 unsigned OpIdx) {
1477 MachineOperand &MO = MI.getOperand(OpIdx);
1478 SmallVector<Register, 8> Regs;
1479 MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0));
1480 }
1481
bitcastSrc(MachineInstr & MI,LLT CastTy,unsigned OpIdx)1482 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1483 MachineOperand &Op = MI.getOperand(OpIdx);
1484 Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
1485 }
1486
bitcastDst(MachineInstr & MI,LLT CastTy,unsigned OpIdx)1487 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1488 MachineOperand &MO = MI.getOperand(OpIdx);
1489 Register CastDst = MRI.createGenericVirtualRegister(CastTy);
1490 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1491 MIRBuilder.buildBitcast(MO, CastDst);
1492 MO.setReg(CastDst);
1493 }
1494
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
                                        LLT WideTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  Register Src1 = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(Src1);
  const int DstSize = DstTy.getSizeInBits();
  const int SrcSize = SrcTy.getSizeInBits();
  const int WideSize = WideTy.getSizeInBits();
  const int NumMerge = (DstSize + WideSize - 1) / WideSize;

  unsigned NumOps = MI.getNumOperands();
  unsigned NumSrc = MI.getNumOperands() - 1;
  unsigned PartSize = DstTy.getSizeInBits() / NumSrc;

  if (WideSize >= DstSize) {
    // Directly pack the bits in the target type.
    Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1).getReg(0);

    for (unsigned I = 2; I != NumOps; ++I) {
      const unsigned Offset = (I - 1) * PartSize;

      Register SrcReg = MI.getOperand(I).getReg();
      assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));

      auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);

      Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
        MRI.createGenericVirtualRegister(WideTy);

      auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
      auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
      MIRBuilder.buildOr(NextResult, ResultReg, Shl);
      ResultReg = NextResult;
    }

    if (WideSize > DstSize)
      MIRBuilder.buildTrunc(DstReg, ResultReg);
    else if (DstTy.isPointer())
      MIRBuilder.buildIntToPtr(DstReg, ResultReg);

    MI.eraseFromParent();
    return Legalized;
  }

  // Unmerge the original values to the GCD type, and recombine to pieces of
  // the requested type; the combined size is the next multiple of the
  // requested size greater than or equal to the original size.
  //
  // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
  // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
  // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
  // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
  // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
  // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
  // %12:_(s12) = G_MERGE_VALUES %10, %11
  //
  // Padding with undef if necessary:
  //
  // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
  // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
  // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
  // %7:_(s2) = G_IMPLICIT_DEF
  // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
  // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
  // %10:_(s12) = G_MERGE_VALUES %8, %9

  const int GCD = greatestCommonDivisor(SrcSize, WideSize);
  LLT GCDTy = LLT::scalar(GCD);

  SmallVector<Register, 8> Parts;
  SmallVector<Register, 8> NewMergeRegs;
  SmallVector<Register, 8> Unmerges;
  LLT WideDstTy = LLT::scalar(NumMerge * WideSize);

  // Decompose the original operands if they don't evenly divide.
  for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
    Register SrcReg = MO.getReg();
    if (GCD == SrcSize) {
      Unmerges.push_back(SrcReg);
    } else {
      auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
      for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
        Unmerges.push_back(Unmerge.getReg(J));
    }
  }

  const int PartsPerGCD = WideSize / GCD;

  // Pad with undef to the next size that is a multiple of the requested size.
  // Note the count here is in GCD-sized pieces, not in bits.
  if (static_cast<int>(Unmerges.size()) != NumMerge * PartsPerGCD) {
    Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    for (int I = Unmerges.size(); I != NumMerge * PartsPerGCD; ++I)
      Unmerges.push_back(UndefReg);
  }

  // Build merges of each piece.
  ArrayRef<Register> Slicer(Unmerges);
  for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
    auto Merge = MIRBuilder.buildMerge(WideTy, Slicer.take_front(PartsPerGCD));
    NewMergeRegs.push_back(Merge.getReg(0));
  }

  // A truncate may be necessary if the requested type doesn't evenly divide the
  // original result type.
  if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
    MIRBuilder.buildMerge(DstReg, NewMergeRegs);
  } else {
    auto FinalMerge = MIRBuilder.buildMerge(WideDstTy, NewMergeRegs);
    MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
                                          LLT WideTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  int NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.isVector())
    return UnableToLegalize;

  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (!DstTy.isScalar())
    return UnableToLegalize;

  if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
    if (SrcTy.isPointer()) {
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
        LLVM_DEBUG(
            dbgs() << "Not casting non-integral address space integer\n");
        return UnableToLegalize;
      }

      SrcTy = LLT::scalar(SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
    }

    // Widen SrcTy to WideTy. This does not affect the result, but since the
    // user requested this size, it is probably better handled than SrcTy and
    // should reduce the total number of legalization artifacts.
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      SrcTy = WideTy;
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
    }

    // There's no unmerge type to target. Directly extract the bits from the
    // source type.
    unsigned DstSize = DstTy.getSizeInBits();

    MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
    for (int I = 1; I != NumDst; ++I) {
      auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
      auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
      MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // Extend the source to a wider type.
  LLT LCMTy = getLCMType(SrcTy, WideTy);

  Register WideSrc = SrcReg;
  if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
    // TODO: If this is an integral address space, cast to integer and anyext.
    if (SrcTy.isPointer()) {
      LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
      return UnableToLegalize;
    }

    WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
  }

  auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);

  // Create a sequence of unmerges and merges to the original results. Since we
  // may have widened the source, we will need to pad the results with dead defs
  // to cover the source register.
  // e.g. widen s48 to s64:
  // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
  //
  // =>
  //  %4:_(s192) = G_ANYEXT %0:_(s96)
  //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
  //  ; unpack to GCD type, with extra dead defs
  //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
  //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
  //  dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64)
  //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
  //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
  const LLT GCDTy = getGCDType(WideTy, DstTy);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();

  // Directly unmerge to the destination without going through a GCD type
  // if possible.
  if (PartsPerRemerge == 1) {
    const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();

    for (int I = 0; I != NumUnmerge; ++I) {
      auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);

      for (int J = 0; J != PartsPerUnmerge; ++J) {
        int Idx = I * PartsPerUnmerge + J;
        if (Idx < NumDst)
          MIB.addDef(MI.getOperand(Idx).getReg());
        else {
          // Create dead def for excess components.
          MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
        }
      }

      MIB.addUse(Unmerge.getReg(I));
    }
  } else {
    SmallVector<Register, 16> Parts;
    for (int J = 0; J != NumUnmerge; ++J)
      extractGCDType(Parts, GCDTy, Unmerge.getReg(J));

    SmallVector<Register, 8> RemergeParts;
    for (int I = 0; I != NumDst; ++I) {
      for (int J = 0; J < PartsPerRemerge; ++J) {
        const int Idx = I * PartsPerRemerge + J;
        RemergeParts.emplace_back(Parts[Idx]);
      }

      MIRBuilder.buildMerge(MI.getOperand(I).getReg(), RemergeParts);
      RemergeParts.clear();
    }
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                    LLT WideTy) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(SrcReg);

  LLT DstTy = MRI.getType(DstReg);
  unsigned Offset = MI.getOperand(2).getImm();

  if (TypeIdx == 0) {
    if (SrcTy.isVector() || DstTy.isVector())
      return UnableToLegalize;

    SrcOp Src(SrcReg);
    if (SrcTy.isPointer()) {
      // Extracts from pointers can be handled only if they are really just
      // simple integers.
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
        return UnableToLegalize;

      LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
      Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
      SrcTy = SrcAsIntTy;
    }

    if (DstTy.isPointer())
      return UnableToLegalize;

    if (Offset == 0) {
      // Avoid a shift in the degenerate case.
      MIRBuilder.buildTrunc(DstReg,
                            MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
      MI.eraseFromParent();
      return Legalized;
    }

    // Do a shift in the source type.
    LLT ShiftTy = SrcTy;
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      Src = MIRBuilder.buildAnyExt(WideTy, Src);
      ShiftTy = WideTy;
    }

    auto LShr = MIRBuilder.buildLShr(
        ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
    MIRBuilder.buildTrunc(DstReg, LShr);
    MI.eraseFromParent();
    return Legalized;
  }

  if (SrcTy.isScalar()) {
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }

  if (!SrcTy.isVector())
    return UnableToLegalize;

  if (DstTy != SrcTy.getElementType())
    return UnableToLegalize;

  if (Offset % SrcTy.getScalarSizeInBits() != 0)
    return UnableToLegalize;

  Observer.changingInstr(MI);
  widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);

  MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
                          Offset);
  widenScalarDst(MI, WideTy.getScalarType(), 0);
  Observer.changedInstr(MI);
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
                                   LLT WideTy) {
  if (TypeIdx != 0 || WideTy.isVector())
    return UnableToLegalize;
  Observer.changingInstr(MI);
  widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
  widenScalarDst(MI, WideTy);
  Observer.changedInstr(MI);
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
                                           LLT WideTy) {
  unsigned Opcode;
  unsigned ExtOpcode;
  Optional<Register> CarryIn = None;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case TargetOpcode::G_SADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_SSUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_UADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  case TargetOpcode::G_USUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  case TargetOpcode::G_SADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_SSUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_UADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_USUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  }

  if (TypeIdx == 1) {
    unsigned BoolExtOp = MIRBuilder.getBoolExtOp(WideTy.isVector(), false);

    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy, 1);
    if (CarryIn)
      widenScalarSrc(MI, WideTy, 4, BoolExtOp);

    Observer.changedInstr(MI);
    return Legalized;
  }

  auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
  auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
  // Do the arithmetic in the larger type.
  Register NewOp;
  if (CarryIn) {
    LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
    NewOp = MIRBuilder
                .buildInstr(Opcode, {WideTy, CarryOutTy},
                            {LHSExt, RHSExt, *CarryIn})
                .getReg(0);
  } else {
    NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
  }
  LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
  auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
  auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
  // There is no overflow if the ExtOp is the same as NewOp.
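  // (For example, widening G_SADDO on s8 to s32: 100 + 100 = 200 in s32, but
  // sext(trunc<s8>(200)) = -56 != 200, so overflow is reported.)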
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
  // Now trunc the NewOp to the original result.
  MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
                                         LLT WideTy) {
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
                  MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
                  MI.getOpcode() == TargetOpcode::G_SSHLSAT;
  bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
                 MI.getOpcode() == TargetOpcode::G_USHLSAT;
  // We can convert this to:
  // 1. Any extend iN to iM
  // 2. SHL by M-N
  // 3. [US][ADD|SUB|SHL]SAT
  // 4. L/ASHR by M-N
  //
  // It may be more efficient to lower this to a min and a max operation in
  // the higher precision arithmetic if the promoted operation isn't legal,
  // but this decision is up to the target's lowering request.
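  //
  // As an illustrative sketch (s8 G_UADDSAT widened to s16; register names
  // are hypothetical): shifting both operands up by 8 aligns the s16
  // saturation boundary with s8's.
  //   %a_wide:_(s16) = G_ANYEXT %a:_(s8)
  //   %b_wide:_(s16) = G_ANYEXT %b:_(s8)
  //   %k8:_(s16) = G_CONSTANT i16 8
  //   %a_shl:_(s16) = G_SHL %a_wide, %k8
  //   %b_shl:_(s16) = G_SHL %b_wide, %k8
  //   %sat:_(s16) = G_UADDSAT %a_shl, %b_shl
  //   %shr:_(s16) = G_LSHR %sat, %k8
  //   %res:_(s8) = G_TRUNC %shr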
  Register DstReg = MI.getOperand(0).getReg();

  unsigned NewBits = WideTy.getScalarSizeInBits();
  unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();

  // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
  // must not left shift the RHS to preserve the shift amount.
  auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
  auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
                     : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
  auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
  auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
  auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);

  auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
                                        {ShiftL, ShiftR}, MI.getFlags());

  // Use a shift that will preserve the number of sign bits when the trunc is
  // folded away.
  auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
                         : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);

  MIRBuilder.buildTrunc(DstReg, Result);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
                                 LLT WideTy) {
  if (TypeIdx == 1) {
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  }

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
  Register Result = MI.getOperand(0).getReg();
  Register OriginalOverflow = MI.getOperand(1).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  LLT SrcTy = MRI.getType(LHS);
  LLT OverflowTy = MRI.getType(OriginalOverflow);
  unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();

  // To determine if the result overflowed in the larger type, we extend the
  // input to the larger type, do the multiply (checking if it overflows),
  // then also check the high bits of the result to see if overflow happened
  // there.
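  //
  // (Worked example: for s8 G_UMULO widened to s16, 200 * 2 = 400 = 0x0190
  // fits in s16, but the high byte is nonzero, so the truncated s8 result 144
  // does not zero-extend back to 400 and overflow is reported.)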
  unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
  auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});

  auto Mulo = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy, OverflowTy},
                                    {LeftOperand, RightOperand});
  auto Mul = Mulo->getOperand(0);
  MIRBuilder.buildTrunc(Result, Mul);

  MachineInstrBuilder ExtResult;
  // Overflow occurred if it occurred in the larger type, or if the high part
  // of the result does not zero/sign-extend the low part. Check this second
  // possibility first.
  if (IsSigned) {
    // For signed, overflow occurred when the high part does not sign-extend
    // the low part.
    ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
  } else {
    // Unsigned overflow occurred when the high part does not zero-extend the
    // low part.
    ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
  }

  // Multiplication cannot overflow if WideTy is >= 2 * the original width, so
  // we don't need to check the overflow result of the larger type Mulo.
  if (WideTy.getScalarSizeInBits() < 2 * SrcBitWidth) {
    auto Overflow =
        MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
    // Finally check if the multiplication in the larger type itself overflowed.
    MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
  } else {
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
  }
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
    assert(TypeIdx == 0 && "atomicrmw with second scalar type");
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
    widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
    if (TypeIdx == 0) {
      Observer.changingInstr(MI);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
      widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }
    assert(TypeIdx == 1 &&
           "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_EXTRACT:
    return widenScalarExtract(MI, TypeIdx, WideTy);
  case TargetOpcode::G_INSERT:
    return widenScalarInsert(MI, TypeIdx, WideTy);
  case TargetOpcode::G_MERGE_VALUES:
    return widenScalarMergeValues(MI, TypeIdx, WideTy);
  case TargetOpcode::G_UNMERGE_VALUES:
    return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
  case TargetOpcode::G_UMULO:
  case TargetOpcode::G_SMULO:
    return widenScalarMulo(MI, TypeIdx, WideTy);
  case TargetOpcode::G_SADDSAT:
  case TargetOpcode::G_SSUBSAT:
  case TargetOpcode::G_SSHLSAT:
  case TargetOpcode::G_UADDSAT:
  case TargetOpcode::G_USUBSAT:
  case TargetOpcode::G_USHLSAT:
    return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTPOP: {
    if (TypeIdx == 0) {
      Observer.changingInstr(MI);
      widenScalarDst(MI, WideTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }

    Register SrcReg = MI.getOperand(1).getReg();

    // First extend the input.
    unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
                              MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
                          ? TargetOpcode::G_ANYEXT
                          : TargetOpcode::G_ZEXT;
    auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
    LLT CurTy = MRI.getType(SrcReg);
    unsigned NewOpc = MI.getOpcode();
    if (NewOpc == TargetOpcode::G_CTTZ) {
      // The count is the same in the larger type except if the original
      // value was zero. This can be handled by setting the bit just off
      // the top of the original type.
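      //
      // For example, for G_CTTZ on s8 widened to s32, OR the extended input
      // with 0x100: a nonzero s8 input is unaffected (its trailing zero count
      // is below 8), while a zero input now yields 8, matching G_CTTZ's
      // result on s8.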
      auto TopBit =
          APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
      MIBSrc = MIRBuilder.buildOr(
          WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
      // Now we know the operand is non-zero, use the more relaxed opcode.
      NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
    }

    // Perform the operation at the larger size.
    auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
    // This is already the correct result for CTPOP and CTTZ.
    if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
        MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
      // For CTLZ, the correct result is NewOp - (WideTy size - CurTy size),
      // since the extension added that many leading zeros.
      unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
      MIBNewOp = MIRBuilder.buildSub(
          WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
    }

    MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_BSWAP: {
    Observer.changingInstr(MI);
    Register DstReg = MI.getOperand(0).getReg();

    Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
    Register DstExt = MRI.createGenericVirtualRegister(WideTy);
    Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);

    MI.getOperand(0).setReg(DstExt);

    MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());

    LLT Ty = MRI.getType(DstReg);
    unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
    MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
    MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);

    MIRBuilder.buildTrunc(DstReg, ShrReg);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_BITREVERSE: {
    Observer.changingInstr(MI);

    Register DstReg = MI.getOperand(0).getReg();
    LLT Ty = MRI.getType(DstReg);
    unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();

    Register DstExt = MRI.createGenericVirtualRegister(WideTy);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    MI.getOperand(0).setReg(DstExt);
    MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());

    auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
    auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
    MIRBuilder.buildTrunc(DstReg, Shift);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FREEZE:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ABS:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ADD:
  case TargetOpcode::G_AND:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
  case TargetOpcode::G_SUB:
    // Perform operation at larger width (any extension is fine here, high bits
    // don't affect the result) and then truncate the result back to the
    // original type.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SBFX:
  case TargetOpcode::G_UBFX:
    Observer.changingInstr(MI);

    if (TypeIdx == 0) {
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy);
    } else {
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
    }

    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SHL:
    Observer.changingInstr(MI);

    if (TypeIdx == 0) {
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy);
    } else {
      assert(TypeIdx == 1);
      // The "number of bits to shift" operand must preserve its value as an
      // unsigned integer:
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    }

    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SDIVREM:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
    widenScalarDst(MI, WideTy);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR:
    Observer.changingInstr(MI);

    if (TypeIdx == 0) {
      unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
        TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;

      widenScalarSrc(MI, WideTy, 1, CvtOp);
      widenScalarDst(MI, WideTy);
    } else {
      assert(TypeIdx == 1);
      // The "number of bits to shift" operand must preserve its value as an
      // unsigned integer:
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    }

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_UDIVREM:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
    widenScalarDst(MI, WideTy);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_SELECT:
    Observer.changingInstr(MI);
    if (TypeIdx == 0) {
      // Perform operation at larger width (any extension is fine here, high
      // bits don't affect the result) and then truncate the result back to the
      // original type.
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideTy);
    } else {
      bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
      // Explicit extension is required here since high bits affect the result.
      widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
    }
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
    Observer.changingInstr(MI);

    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy);
    else
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_SITOFP:
    Observer.changingInstr(MI);

    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    else
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_UITOFP:
    Observer.changingInstr(MI);

    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    else
      widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);

    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_STORE: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    if (!Ty.isScalar())
      return UnableToLegalize;

    Observer.changingInstr(MI);

    unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
      TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
    widenScalarSrc(MI, WideTy, 0, ExtType);

    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CONSTANT: {
    MachineOperand &SrcMO = MI.getOperand(1);
    LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
    unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
        MRI.getType(MI.getOperand(0).getReg()));
    assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
            ExtOpc == TargetOpcode::G_ANYEXT) &&
           "Illegal Extend");
    const APInt &SrcVal = SrcMO.getCImm()->getValue();
    const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
                           ? SrcVal.sext(WideTy.getSizeInBits())
                           : SrcVal.zext(WideTy.getSizeInBits());
    Observer.changingInstr(MI);
    SrcMO.setCImm(ConstantInt::get(Ctx, Val));

    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FCONSTANT: {
    // To avoid changing the bits of the constant due to extension to a larger
    // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
    MachineOperand &SrcMO = MI.getOperand(1);
    APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt();
    MIRBuilder.setInstrAndDebugLoc(MI);
    auto IntCst = MIRBuilder.buildConstant(MI.getOperand(0).getReg(), Val);
    widenScalarDst(*IntCst, WideTy, 0, TargetOpcode::G_TRUNC);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_IMPLICIT_DEF: {
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_BRCOND:
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_FCMP:
    Observer.changingInstr(MI);
    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy);
    else {
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
    }
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_ICMP:
    Observer.changingInstr(MI);
    if (TypeIdx == 0)
      widenScalarDst(MI, WideTy);
    else {
      unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
                               MI.getOperand(1).getPredicate()))
                               ? TargetOpcode::G_SEXT
                               : TargetOpcode::G_ZEXT;
      widenScalarSrc(MI, WideTy, 2, ExtOpcode);
      widenScalarSrc(MI, WideTy, 3, ExtOpcode);
    }
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_PTR_ADD:
    assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    Observer.changedInstr(MI);
    return Legalized;

  case TargetOpcode::G_PHI: {
    assert(TypeIdx == 0 && "Expecting only Idx 0");

    Observer.changingInstr(MI);
    for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
      MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
      MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
      widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
    }

    MachineBasicBlock &MBB = *MI.getParent();
    MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
    widenScalarDst(MI, WideTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
    if (TypeIdx == 0) {
      Register VecReg = MI.getOperand(1).getReg();
      LLT VecTy = MRI.getType(VecReg);
      Observer.changingInstr(MI);

      widenScalarSrc(
          MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
          TargetOpcode::G_ANYEXT);

      widenScalarDst(MI, WideTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }

    if (TypeIdx != 2)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    // TODO: Probably should be zext
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_INSERT_VECTOR_ELT: {
    if (TypeIdx == 1) {
      Observer.changingInstr(MI);

      Register VecReg = MI.getOperand(1).getReg();
      LLT VecTy = MRI.getType(VecReg);
      LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);

      widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
      widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
      widenScalarDst(MI, WideVecTy, 0);
      Observer.changedInstr(MI);
      return Legalized;
    }

    if (TypeIdx == 2) {
      Observer.changingInstr(MI);
      // TODO: Probably should be zext
      widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
      Observer.changedInstr(MI);
      return Legalized;
    }

    return UnableToLegalize;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FNEG:
  case TargetOpcode::G_FABS:
  case TargetOpcode::G_FCANONICALIZE:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_FMINIMUM:
  case TargetOpcode::G_FMAXIMUM:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_INTRINSIC_ROUND:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    assert(TypeIdx == 0);
    Observer.changingInstr(MI);

    for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
      widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);

    widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_FPOWI: {
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
    widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_INTTOPTR:
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRTOINT:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_BUILD_VECTOR: {
    Observer.changingInstr(MI);

    const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
    for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
      widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);

    // Avoid changing the result vector type if the source element type was
    // requested.
    if (TypeIdx == 1) {
      MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
    } else {
      widenScalarDst(MI, WideTy, 0);
    }

    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_SEXT_INREG:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRMASK: {
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }
  }
}

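/// Unmerge \p Src into \p Ty sized pieces, appending the resulting registers
/// to \p Pieces.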
static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
                             MachineIRBuilder &B, Register Src, LLT Ty) {
  auto Unmerge = B.buildUnmerge(Ty, Src);
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    Pieces.push_back(Unmerge.getReg(I));
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitcast(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (SrcTy.isVector()) {
    LLT SrcEltTy = SrcTy.getElementType();
    SmallVector<Register, 8> SrcRegs;

    if (DstTy.isVector()) {
      int NumDstElt = DstTy.getNumElements();
      int NumSrcElt = SrcTy.getNumElements();

      LLT DstEltTy = DstTy.getElementType();
      LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
      LLT SrcPartTy = SrcEltTy; // Original unmerge result type.

      // If there's an element size mismatch, insert intermediate casts to match
      // the result element type.
      if (NumSrcElt < NumDstElt) { // Source element type is larger.
        // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
        //
        // =>
        //
        // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
        // %4:_(<2 x s8>) = G_BITCAST %2
        // %5:_(<2 x s8>) = G_BITCAST %3
        // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
        DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
        SrcPartTy = SrcEltTy;
      } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
        //
        // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
        //
        // =>
        //
        // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
        // %4:_(s16) = G_BITCAST %2
        // %5:_(s16) = G_BITCAST %3
        // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
        SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
        DstCastTy = DstEltTy;
      }

      getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
      for (Register &SrcReg : SrcRegs)
        SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
    } else
      getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);

    MIRBuilder.buildMerge(Dst, SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  if (DstTy.isVector()) {
    SmallVector<Register, 8> SrcRegs;
    getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
    MIRBuilder.buildMerge(Dst, SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}

/// Figure out the bit offset into a register when coercing a vector index for
/// the wide element type. This is only for the case when promoting a vector to
/// one with larger elements.
///
/// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
/// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
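///
/// For example, bitcasting <8 x s8> to <2 x s32> (SrcEltSize = 8,
/// DstEltSize = 32): for %idx = 5 the wide element index is 5 >> 2 = 1, and
/// the bit offset computed here is (5 & 3) << 3 = 8.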
static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
                                                   Register Idx,
                                                   unsigned NewEltSize,
                                                   unsigned OldEltSize) {
  const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
  LLT IdxTy = B.getMRI()->getType(Idx);

  // Now figure out the amount we need to shift to get the target bits.
  auto OffsetMask = B.buildConstant(
      IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
  auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
  return B.buildShl(IdxTy, OffsetIdx,
                    B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
}

/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
/// is casting to a vector with a smaller element size, perform multiple element
/// extracts and merge the results. If this is coercing to a vector with larger
/// elements, index the bitcasted vector and extract the target element with bit
/// operations. This is intended to force the indexing in the native register
/// size for architectures that can dynamically index the register file.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                         LLT CastTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register Dst = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(2).getReg();
  LLT SrcVecTy = MRI.getType(SrcVec);
  LLT IdxTy = MRI.getType(Idx);

  LLT SrcEltTy = SrcVecTy.getElementType();
  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = SrcVecTy.getNumElements();

  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);

  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = SrcEltTy.getSizeInBits();
  if (NewNumElts > OldNumElts) {
    // Decreasing the vector element size
    //
    // e.g. i64 = extract_vector_elt x:v2i64, y:i32
    //  =>
    //  v4i32:castx = bitcast x:v2i64
    //
    // i64 = bitcast
    //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
    //                       (i32 (extract_vector_elt castx, (2 * y + 1))))
    //
    if (NewNumElts % OldNumElts != 0)
      return UnableToLegalize;

    // Type of the intermediate result vector.
    const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
    LLT MidTy =
        LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);

    auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);

    SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
    auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);

    for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
      auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
      auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
      auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
      NewOps[I] = Elt.getReg(0);
    }

    auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
    MIRBuilder.buildBitcast(Dst, NewVec);
    MI.eraseFromParent();
    return Legalized;
  }

  if (NewNumElts < OldNumElts) {
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(NewEltSize / OldEltSize))
      return UnableToLegalize;

    // Increasing the vector element size.
    // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
    //
    // =>
    //
    // %cast = G_BITCAST %vec
    // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
    // %wide_elt = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
    // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
    // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
    // %elt_bits = G_LSHR %wide_elt, %offset_bits
    // %elt = G_TRUNC %elt_bits

    const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);

    Register WideElt = CastVec;
    if (CastTy.isVector()) {
      WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
                                                     ScaledIdx).getReg(0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
        MIRBuilder, Idx, NewEltSize, OldEltSize);

    // Shift the wide element to get the target element.
    auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
    MIRBuilder.buildTrunc(Dst, ExtractedBits);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}

/// Emit code to insert \p InsertReg into \p TargetReg at \p OffsetBits, while
/// preserving other bits in \p TargetReg.
///
/// (InsertReg << Offset) | (TargetReg & ~(LowBitsMask(InsertReg.size()) << Offset))
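///
/// For example, inserting an s8 value at bit offset 8 of an s32 target keeps
/// (TargetReg & ~0x0000FF00) and ORs in (zext(InsertReg) << 8).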
static Register buildBitFieldInsert(MachineIRBuilder &B,
                                    Register TargetReg, Register InsertReg,
                                    Register OffsetBits) {
  LLT TargetTy = B.getMRI()->getType(TargetReg);
  LLT InsertTy = B.getMRI()->getType(InsertReg);
  auto ZextVal = B.buildZExt(TargetTy, InsertReg);
  auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);

  // Produce a bitmask of the value to insert
  auto EltMask = B.buildConstant(
      TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
                                     InsertTy.getSizeInBits()));
  // Shift it into position
  auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
  auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);

  // Clear out the bits in the wide element
  auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);

  // The value to insert has all zeros already, so stick it into the masked
  // wide element.
  return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
}

/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
/// is increasing the element size, perform the indexing in the target element
/// type, and use bit operations to insert at the element position. This is
/// intended for architectures that can dynamically index the register file and
/// want to force indexing in the native register size.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                        LLT CastTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register Dst = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register Idx = MI.getOperand(3).getReg();

  LLT VecTy = MRI.getType(Dst);
  LLT IdxTy = MRI.getType(Idx);

  LLT VecEltTy = VecTy.getElementType();
  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = VecEltTy.getSizeInBits();

  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = VecTy.getNumElements();

  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
  if (NewNumElts < OldNumElts) {
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(NewEltSize / OldEltSize))
      return UnableToLegalize;

    const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);

    Register ExtractedElt = CastVec;
    if (CastTy.isVector()) {
      ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
                                                          ScaledIdx).getReg(0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
        MIRBuilder, Idx, NewEltSize, OldEltSize);

    Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
                                               Val, OffsetBits);
    if (CastTy.isVector()) {
      InsertedElt = MIRBuilder.buildInsertVectorElement(
          CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
    }

    MIRBuilder.buildBitcast(Dst, InsertedElt);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
2925
lowerLoad(GAnyLoad & LoadMI)2926 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
2927 // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
2928 Register DstReg = LoadMI.getDstReg();
2929 Register PtrReg = LoadMI.getPointerReg();
2930 LLT DstTy = MRI.getType(DstReg);
2931 MachineMemOperand &MMO = LoadMI.getMMO();
2932 LLT MemTy = MMO.getMemoryType();
2933 MachineFunction &MF = MIRBuilder.getMF();
2934
2935 unsigned MemSizeInBits = MemTy.getSizeInBits();
2936 unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
2937
2938 if (MemSizeInBits != MemStoreSizeInBits) {
2939 if (MemTy.isVector())
2940 return UnableToLegalize;
2941
2942 // Promote to a byte-sized load if not loading an integral number of
2943 // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
2944 LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
2945 MachineMemOperand *NewMMO =
2946 MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);
2947
2948 Register LoadReg = DstReg;
2949 LLT LoadTy = DstTy;
2950
2951 // If this wasn't already an extending load, we need to widen the result
2952 // register to avoid creating a load with a narrower result than the source.
2953 if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
2954 LoadTy = WideMemTy;
2955 LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
2956 }
2957
2958 if (isa<GSExtLoad>(LoadMI)) {
2959 auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2960 MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
2961 } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == LoadTy) {
2962 auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2963 // The extra bits are guaranteed to be zero, since we stored them that
2964 // way. A zext load from Wide thus automatically gives zext from MemVT.
2965 MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
2966 } else {
2967 MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
2968 }
2969
2970 if (DstTy != LoadTy)
2971 MIRBuilder.buildTrunc(DstReg, LoadReg);
2972
2973 LoadMI.eraseFromParent();
2974 return Legalized;
2975 }
2976
2977 // Big endian lowering not implemented.
2978 if (MIRBuilder.getDataLayout().isBigEndian())
2979 return UnableToLegalize;
2980
2981 // This load needs splitting into power of 2 sized loads.
2982 //
2983 // Our strategy here is to generate anyextending loads for the smaller
2984 // types up to next power-2 result type, and then combine the two larger
2985 // result values together, before truncating back down to the non-pow-2
2986 // type.
2987 // E.g. v1 = i24 load =>
2988 // v2 = i32 zextload (2 byte)
2989 // v3 = i32 load (1 byte)
2990 // v4 = i32 shl v3, 16
2991 // v5 = i32 or v4, v2
2992 // v1 = i24 trunc v5
2993 // By doing this we generate the correct truncate which should get
2994 // combined away as an artifact with a matching extend.
2995
2996 uint64_t LargeSplitSize, SmallSplitSize;
2997
2998 if (!isPowerOf2_32(MemSizeInBits)) {
2999 // This load needs splitting into power of 2 sized loads.
3000 LargeSplitSize = PowerOf2Floor(MemSizeInBits);
3001 SmallSplitSize = MemSizeInBits - LargeSplitSize;
3002 } else {
3003 // This is already a power of 2, but we still need to split this in half.
3004 //
3005 // Assume we're being asked to decompose an unaligned load.
3006 // TODO: If this requires multiple splits, handle them all at once.
3007 auto &Ctx = MF.getFunction().getContext();
3008 if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3009 return UnableToLegalize;
3010
3011 SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3012 }
3013
3014 if (MemTy.isVector()) {
3015 // TODO: Handle vector extloads
3016 if (MemTy != DstTy)
3017 return UnableToLegalize;
3018
3019 // TODO: We can do better than scalarizing the vector and at least split it
3020 // in half.
3021 return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
3022 }
3023
3024 MachineMemOperand *LargeMMO =
3025 MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3026 MachineMemOperand *SmallMMO =
3027 MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3028
3029 LLT PtrTy = MRI.getType(PtrReg);
3030 unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
3031 LLT AnyExtTy = LLT::scalar(AnyExtSize);
3032 auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
3033 PtrReg, *LargeMMO);
3034
3035 auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
3036 LargeSplitSize / 8);
3037 Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
3038 auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
3039 auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
3040 SmallPtr, *SmallMMO);
3041
3042 auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
3043 auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
3044
3045 if (AnyExtTy == DstTy)
3046 MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
3047 else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
3048 auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3049 MIRBuilder.buildTrunc(DstReg, {Or});
3050 } else {
3051 assert(DstTy.isPointer() && "expected pointer");
3052 auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3053
3054 // FIXME: We currently consider this to be illegal for non-integral address
3055 // spaces, but we still need a way to reinterpret the bits.
3056 MIRBuilder.buildIntToPtr(DstReg, Or);
3057 }
3058
3059 LoadMI.eraseFromParent();
3060 return Legalized;
3061 }
3062
3063 LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
3064 // Lower a non-power of 2 store into multiple pow-2 stores.
3065 // E.g. split an i24 store into an i16 store + i8 store.
3066 // We do this by first extending the stored value to the next largest power
3067 // of 2 type, and then using truncating stores to store the components.
3068 // By doing this, as with G_LOAD, we generate an extend that can be
3069 // artifact-combined away instead of leaving behind extracts.
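// An illustrative little-endian expansion of the s24 example (register names
// here are hypothetical):
//   G_STORE %val:s24 =>
//     %ext:s32 = G_ANYEXT %val
//     G_STORE %ext (s16 truncating store at offset 0)
//     %hi:s32 = G_LSHR %ext, 16
//     G_STORE %hi (s8 truncating store at offset 2)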
3070 Register SrcReg = StoreMI.getValueReg();
3071 Register PtrReg = StoreMI.getPointerReg();
3072 LLT SrcTy = MRI.getType(SrcReg);
3073 MachineFunction &MF = MIRBuilder.getMF();
3074 MachineMemOperand &MMO = **StoreMI.memoperands_begin();
3075 LLT MemTy = MMO.getMemoryType();
3076
3077 unsigned StoreWidth = MemTy.getSizeInBits();
3078 unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
3079
3080 if (StoreWidth != StoreSizeInBits) {
3081 if (SrcTy.isVector())
3082 return UnableToLegalize;
3083
3084 // Promote to a byte-sized store with upper bits zero if not
3085 // storing an integral number of bytes. For example, promote
3086 // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
3087 LLT WideTy = LLT::scalar(StoreSizeInBits);
3088
3089 if (StoreSizeInBits > SrcTy.getSizeInBits()) {
3090 // Avoid creating a store with a narrower source than result.
3091 SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
3092 SrcTy = WideTy;
3093 }
3094
3095 auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);
3096
3097 MachineMemOperand *NewMMO =
3098 MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
3099 MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
3100 StoreMI.eraseFromParent();
3101 return Legalized;
3102 }
3103
3104 if (MemTy.isVector()) {
3105 // TODO: Handle vector trunc stores
3106 if (MemTy != SrcTy)
3107 return UnableToLegalize;
3108
3109 // TODO: We can do better than scalarizing the vector and at least split it
3110 // in half.
3111 return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
3112 }
3113
3114 unsigned MemSizeInBits = MemTy.getSizeInBits();
3115 uint64_t LargeSplitSize, SmallSplitSize;
3116
3117 if (!isPowerOf2_32(MemSizeInBits)) {
3118 LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits());
3119 SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
3120 } else {
3121 auto &Ctx = MF.getFunction().getContext();
3122 if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3123 return UnableToLegalize; // Don't know what we're being asked to do.
3124
3125 SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3126 }
3127
3128 // Extend to the next pow-2. If this store was itself the result of lowering,
3129 // e.g. an s56 store being broken into s32 + s24, we might have a stored type
3130 // that's wider than the stored size.
3131 unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
3132 const LLT NewSrcTy = LLT::scalar(AnyExtSize);
3133
3134 if (SrcTy.isPointer()) {
3135 const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
3136 SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
3137 }
3138
3139 auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);
3140
3141 // Obtain the smaller value by shifting away the larger value.
3142 auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
3143 auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);
3144
3145 // Generate the PtrAdd and truncating stores.
3146 LLT PtrTy = MRI.getType(PtrReg);
3147 auto OffsetCst = MIRBuilder.buildConstant(
3148 LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
3149 auto SmallPtr =
3150 MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
3151
3152 MachineMemOperand *LargeMMO =
3153 MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3154 MachineMemOperand *SmallMMO =
3155 MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3156 MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
3157 MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
3158 StoreMI.eraseFromParent();
3159 return Legalized;
3160 }
3161
3162 LegalizerHelper::LegalizeResult
3163 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
3164 switch (MI.getOpcode()) {
3165 case TargetOpcode::G_LOAD: {
3166 if (TypeIdx != 0)
3167 return UnableToLegalize;
3168 MachineMemOperand &MMO = **MI.memoperands_begin();
3169
3170 // Not sure how to interpret a bitcast of an extending load.
3171 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3172 return UnableToLegalize;
3173
3174 Observer.changingInstr(MI);
3175 bitcastDst(MI, CastTy, 0);
3176 MMO.setType(CastTy);
3177 Observer.changedInstr(MI);
3178 return Legalized;
3179 }
3180 case TargetOpcode::G_STORE: {
3181 if (TypeIdx != 0)
3182 return UnableToLegalize;
3183
3184 MachineMemOperand &MMO = **MI.memoperands_begin();
3185
3186 // Not sure how to interpret a bitcast of a truncating store.
3187 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3188 return UnableToLegalize;
3189
3190 Observer.changingInstr(MI);
3191 bitcastSrc(MI, CastTy, 0);
3192 MMO.setType(CastTy);
3193 Observer.changedInstr(MI);
3194 return Legalized;
3195 }
3196 case TargetOpcode::G_SELECT: {
3197 if (TypeIdx != 0)
3198 return UnableToLegalize;
3199
3200 if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
3201 LLVM_DEBUG(
3202 dbgs() << "bitcast action not implemented for vector select\n");
3203 return UnableToLegalize;
3204 }
3205
3206 Observer.changingInstr(MI);
3207 bitcastSrc(MI, CastTy, 2);
3208 bitcastSrc(MI, CastTy, 3);
3209 bitcastDst(MI, CastTy, 0);
3210 Observer.changedInstr(MI);
3211 return Legalized;
3212 }
3213 case TargetOpcode::G_AND:
3214 case TargetOpcode::G_OR:
3215 case TargetOpcode::G_XOR: {
3216 Observer.changingInstr(MI);
3217 bitcastSrc(MI, CastTy, 1);
3218 bitcastSrc(MI, CastTy, 2);
3219 bitcastDst(MI, CastTy, 0);
3220 Observer.changedInstr(MI);
3221 return Legalized;
3222 }
3223 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3224 return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
3225 case TargetOpcode::G_INSERT_VECTOR_ELT:
3226 return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
3227 default:
3228 return UnableToLegalize;
3229 }
3230 }
3231
3232 // Legalize an instruction by changing the opcode in place.
3233 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
3234 Observer.changingInstr(MI);
3235 MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
3236 Observer.changedInstr(MI);
3237 }
3238
3239 LegalizerHelper::LegalizeResult
3240 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
3241 using namespace TargetOpcode;
3242
3243 switch(MI.getOpcode()) {
3244 default:
3245 return UnableToLegalize;
3246 case TargetOpcode::G_BITCAST:
3247 return lowerBitcast(MI);
3248 case TargetOpcode::G_SREM:
3249 case TargetOpcode::G_UREM: {
3250 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3251 auto Quot =
3252 MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
3253 {MI.getOperand(1), MI.getOperand(2)});
3254
3255 auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
3256 MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
3257 MI.eraseFromParent();
3258 return Legalized;
3259 }
3260 case TargetOpcode::G_SADDO:
3261 case TargetOpcode::G_SSUBO:
3262 return lowerSADDO_SSUBO(MI);
3263 case TargetOpcode::G_UMULH:
3264 case TargetOpcode::G_SMULH:
3265 return lowerSMULH_UMULH(MI);
3266 case TargetOpcode::G_SMULO:
3267 case TargetOpcode::G_UMULO: {
3268 // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
3269 // result.
3270 Register Res = MI.getOperand(0).getReg();
3271 Register Overflow = MI.getOperand(1).getReg();
3272 Register LHS = MI.getOperand(2).getReg();
3273 Register RHS = MI.getOperand(3).getReg();
3274 LLT Ty = MRI.getType(Res);
3275
3276 unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
3277 ? TargetOpcode::G_SMULH
3278 : TargetOpcode::G_UMULH;
3279
3280 Observer.changingInstr(MI);
3281 const auto &TII = MIRBuilder.getTII();
3282 MI.setDesc(TII.get(TargetOpcode::G_MUL));
3283 MI.removeOperand(1);
3284 Observer.changedInstr(MI);
3285
3286 auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
3287 auto Zero = MIRBuilder.buildConstant(Ty, 0);
3288
3289 // Move insert point forward so we can use the Res register if needed.
3290 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
3291
3292 // For *signed* multiply, overflow is detected by checking:
3293 // (hi != (lo >> bitwidth-1))
3294 if (Opcode == TargetOpcode::G_SMULH) {
3295 auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
3296 auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
3297 MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
3298 } else {
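// For *unsigned* multiply, overflow occurred iff the high half of the full
// product is non-zero.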
3299 MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
3300 }
3301 return Legalized;
3302 }
3303 case TargetOpcode::G_FNEG: {
3304 Register Res = MI.getOperand(0).getReg();
3305 LLT Ty = MRI.getType(Res);
3306
3307 // TODO: Handle vector types once we are able to
3308 // represent them.
3309 if (Ty.isVector())
3310 return UnableToLegalize;
3311 auto SignMask =
3312 MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
3313 Register SubByReg = MI.getOperand(1).getReg();
3314 MIRBuilder.buildXor(Res, SubByReg, SignMask);
3315 MI.eraseFromParent();
3316 return Legalized;
3317 }
3318 case TargetOpcode::G_FSUB: {
3319 Register Res = MI.getOperand(0).getReg();
3320 LLT Ty = MRI.getType(Res);
3321
3322 // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
3323 // First, check if G_FNEG is marked as Lower. If so, we may
3324 // end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
3325 if (LI.getAction({G_FNEG, {Ty}}).Action == Lower)
3326 return UnableToLegalize;
3327 Register LHS = MI.getOperand(1).getReg();
3328 Register RHS = MI.getOperand(2).getReg();
3329 Register Neg = MRI.createGenericVirtualRegister(Ty);
3330 MIRBuilder.buildFNeg(Neg, RHS);
3331 MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
3332 MI.eraseFromParent();
3333 return Legalized;
3334 }
3335 case TargetOpcode::G_FMAD:
3336 return lowerFMad(MI);
3337 case TargetOpcode::G_FFLOOR:
3338 return lowerFFloor(MI);
3339 case TargetOpcode::G_INTRINSIC_ROUND:
3340 return lowerIntrinsicRound(MI);
3341 case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
3342 // Since round even is the assumed rounding mode for unconstrained FP
3343 // operations, rint and roundeven are the same operation.
3344 changeOpcode(MI, TargetOpcode::G_FRINT);
3345 return Legalized;
3346 }
3347 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
3348 Register OldValRes = MI.getOperand(0).getReg();
3349 Register SuccessRes = MI.getOperand(1).getReg();
3350 Register Addr = MI.getOperand(2).getReg();
3351 Register CmpVal = MI.getOperand(3).getReg();
3352 Register NewVal = MI.getOperand(4).getReg();
3353 MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
3354 **MI.memoperands_begin());
3355 MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
3356 MI.eraseFromParent();
3357 return Legalized;
3358 }
3359 case TargetOpcode::G_LOAD:
3360 case TargetOpcode::G_SEXTLOAD:
3361 case TargetOpcode::G_ZEXTLOAD:
3362 return lowerLoad(cast<GAnyLoad>(MI));
3363 case TargetOpcode::G_STORE:
3364 return lowerStore(cast<GStore>(MI));
3365 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
3366 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
3367 case TargetOpcode::G_CTLZ:
3368 case TargetOpcode::G_CTTZ:
3369 case TargetOpcode::G_CTPOP:
3370 return lowerBitCount(MI);
3371 case G_UADDO: {
3372 Register Res = MI.getOperand(0).getReg();
3373 Register CarryOut = MI.getOperand(1).getReg();
3374 Register LHS = MI.getOperand(2).getReg();
3375 Register RHS = MI.getOperand(3).getReg();
3376
3377 MIRBuilder.buildAdd(Res, LHS, RHS);
3378 MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);
3379
3380 MI.eraseFromParent();
3381 return Legalized;
3382 }
3383 case G_UADDE: {
3384 Register Res = MI.getOperand(0).getReg();
3385 Register CarryOut = MI.getOperand(1).getReg();
3386 Register LHS = MI.getOperand(2).getReg();
3387 Register RHS = MI.getOperand(3).getReg();
3388 Register CarryIn = MI.getOperand(4).getReg();
const LLT CondTy = MRI.getType(CarryOut);
3389 LLT Ty = MRI.getType(Res);
3390
3391 auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
3392 auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
3393 MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
// Comparing only Res against LHS misses the carry when RHS is all-ones and the
// carry-in is set (Res wraps back around to exactly LHS); check both additions.
auto Carry1 = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, TmpRes, LHS);
auto Carry2 = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, Res, TmpRes);
3394 MIRBuilder.buildOr(CarryOut, Carry1, Carry2);
3395
3396 MI.eraseFromParent();
3397 return Legalized;
3398 }
3399 case G_USUBO: {
3400 Register Res = MI.getOperand(0).getReg();
3401 Register BorrowOut = MI.getOperand(1).getReg();
3402 Register LHS = MI.getOperand(2).getReg();
3403 Register RHS = MI.getOperand(3).getReg();
3404
3405 MIRBuilder.buildSub(Res, LHS, RHS);
3406 MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
3407
3408 MI.eraseFromParent();
3409 return Legalized;
3410 }
3411 case G_USUBE: {
3412 Register Res = MI.getOperand(0).getReg();
3413 Register BorrowOut = MI.getOperand(1).getReg();
3414 Register LHS = MI.getOperand(2).getReg();
3415 Register RHS = MI.getOperand(3).getReg();
3416 Register BorrowIn = MI.getOperand(4).getReg();
3417 const LLT CondTy = MRI.getType(BorrowOut);
3418 const LLT Ty = MRI.getType(Res);
3419
3420 auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
3421 auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
3422 MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
3423
3424 auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS);
3425 auto LHS_ULT_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS);
3426 MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS);
3427
3428 MI.eraseFromParent();
3429 return Legalized;
3430 }
3431 case G_UITOFP:
3432 return lowerUITOFP(MI);
3433 case G_SITOFP:
3434 return lowerSITOFP(MI);
3435 case G_FPTOUI:
3436 return lowerFPTOUI(MI);
3437 case G_FPTOSI:
3438 return lowerFPTOSI(MI);
3439 case G_FPTRUNC:
3440 return lowerFPTRUNC(MI);
3441 case G_FPOWI:
3442 return lowerFPOWI(MI);
3443 case G_SMIN:
3444 case G_SMAX:
3445 case G_UMIN:
3446 case G_UMAX:
3447 return lowerMinMax(MI);
3448 case G_FCOPYSIGN:
3449 return lowerFCopySign(MI);
3450 case G_FMINNUM:
3451 case G_FMAXNUM:
3452 return lowerFMinNumMaxNum(MI);
3453 case G_MERGE_VALUES:
3454 return lowerMergeValues(MI);
3455 case G_UNMERGE_VALUES:
3456 return lowerUnmergeValues(MI);
3457 case TargetOpcode::G_SEXT_INREG: {
3458 assert(MI.getOperand(2).isImm() && "Expected immediate");
3459 int64_t SizeInBits = MI.getOperand(2).getImm();
3460
3461 Register DstReg = MI.getOperand(0).getReg();
3462 Register SrcReg = MI.getOperand(1).getReg();
3463 LLT DstTy = MRI.getType(DstReg);
3464 Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
3465
3466 auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
3467 MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
3468 MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
3469 MI.eraseFromParent();
3470 return Legalized;
3471 }
3472 case G_EXTRACT_VECTOR_ELT:
3473 case G_INSERT_VECTOR_ELT:
3474 return lowerExtractInsertVectorElt(MI);
3475 case G_SHUFFLE_VECTOR:
3476 return lowerShuffleVector(MI);
3477 case G_DYN_STACKALLOC:
3478 return lowerDynStackAlloc(MI);
3479 case G_EXTRACT:
3480 return lowerExtract(MI);
3481 case G_INSERT:
3482 return lowerInsert(MI);
3483 case G_BSWAP:
3484 return lowerBswap(MI);
3485 case G_BITREVERSE:
3486 return lowerBitreverse(MI);
3487 case G_READ_REGISTER:
3488 case G_WRITE_REGISTER:
3489 return lowerReadWriteRegister(MI);
3490 case G_UADDSAT:
3491 case G_USUBSAT: {
3492 // Try to make a reasonable guess about which lowering strategy to use. The
3493 // target can override this with custom lowering and calling the
3494 // implementation functions.
3495 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3496 if (LI.isLegalOrCustom({G_UMIN, Ty}))
3497 return lowerAddSubSatToMinMax(MI);
3498 return lowerAddSubSatToAddoSubo(MI);
3499 }
3500 case G_SADDSAT:
3501 case G_SSUBSAT: {
3502 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3503
3504 // FIXME: It would probably make more sense to see if G_SADDO is preferred,
3505 // since it's a shorter expansion. However, we would need to figure out the
3506 // preferred boolean type for the carry out for the query.
3507 if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
3508 return lowerAddSubSatToMinMax(MI);
3509 return lowerAddSubSatToAddoSubo(MI);
3510 }
3511 case G_SSHLSAT:
3512 case G_USHLSAT:
3513 return lowerShlSat(MI);
3514 case G_ABS:
3515 return lowerAbsToAddXor(MI);
3516 case G_SELECT:
3517 return lowerSelect(MI);
3518 case G_SDIVREM:
3519 case G_UDIVREM:
3520 return lowerDIVREM(MI);
3521 case G_FSHL:
3522 case G_FSHR:
3523 return lowerFunnelShift(MI);
3524 case G_ROTL:
3525 case G_ROTR:
3526 return lowerRotate(MI);
3527 case G_MEMSET:
3528 case G_MEMCPY:
3529 case G_MEMMOVE:
3530 return lowerMemCpyFamily(MI);
3531 case G_MEMCPY_INLINE:
3532 return lowerMemcpyInline(MI);
3533 GISEL_VECREDUCE_CASES_NONSEQ
3534 return lowerVectorReduction(MI);
3535 }
3536 }
3537
3538 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
3539 Align MinAlign) const {
3540 // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
3541 // datalayout for the preferred alignment. Also there should be a target hook
3542 // for this to allow targets to reduce the alignment and ignore the
3543 // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
3544 // the type.
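// E.g. (illustrative) a 12-byte type yields Align(16) here, raised to MinAlign
// when that is larger.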
3545 return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
3546 }
3547
3548 MachineInstrBuilder
3549 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
3550 MachinePointerInfo &PtrInfo) {
3551 MachineFunction &MF = MIRBuilder.getMF();
3552 const DataLayout &DL = MIRBuilder.getDataLayout();
3553 int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
3554
3555 unsigned AddrSpace = DL.getAllocaAddrSpace();
3556 LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3557
3558 PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
3559 return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
3560 }
3561
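/// Clamp a dynamic vector index to be in bounds. Constant indices are returned
/// unchanged. For a power-of-2 element count N this is a mask, e.g.
/// (illustrative) Idx & 3 for N = 4; otherwise it is a umin against N - 1.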
3562 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
3563 LLT VecTy) {
3564 int64_t IdxVal;
3565 if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
3566 return IdxReg;
3567
3568 LLT IdxTy = B.getMRI()->getType(IdxReg);
3569 unsigned NElts = VecTy.getNumElements();
3570 if (isPowerOf2_32(NElts)) {
3571 APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
3572 return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
3573 }
3574
3575 return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
3576 .getReg(0);
3577 }
3578
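/// Compute the address of vector element \p Index within the in-memory vector
/// starting at \p VecPtr. E.g. (illustrative) for a <4 x s32> vector this is
/// VecPtr + (Index & 3) * 4, with the clamp coming from
/// clampDynamicVectorIndex above.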
3579 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
3580 Register Index) {
3581 LLT EltTy = VecTy.getElementType();
3582
3583 // Calculate the element offset and add it to the pointer.
3584 unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
3585 assert(EltSize * 8 == EltTy.getSizeInBits() &&
3586 "Converting bits to bytes lost precision");
3587
3588 Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
3589
3590 LLT IdxTy = MRI.getType(Index);
3591 auto Mul = MIRBuilder.buildMul(IdxTy, Index,
3592 MIRBuilder.buildConstant(IdxTy, EltSize));
3593
3594 LLT PtrTy = MRI.getType(VecPtr);
3595 return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
3596 }
3597
3598 #ifndef NDEBUG
3599 /// Check that all vector operands have the same number of elements. Other
3600 /// operands should be listed in \p NonVecOpIndices.
3601 static bool hasSameNumEltsOnAllVectorOperands(
3602 GenericMachineInstr &MI, MachineRegisterInfo &MRI,
3603 std::initializer_list<unsigned> NonVecOpIndices) {
3604 if (MI.getNumMemOperands() != 0)
3605 return false;
3606
3607 LLT VecTy = MRI.getType(MI.getReg(0));
3608 if (!VecTy.isVector())
3609 return false;
3610 unsigned NumElts = VecTy.getNumElements();
3611
3612 for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
3613 MachineOperand &Op = MI.getOperand(OpIdx);
3614 if (!Op.isReg()) {
3615 if (!is_contained(NonVecOpIndices, OpIdx))
3616 return false;
3617 continue;
3618 }
3619
3620 LLT Ty = MRI.getType(Op.getReg());
3621 if (!Ty.isVector()) {
3622 if (!is_contained(NonVecOpIndices, OpIdx))
3623 return false;
3624 continue;
3625 }
3626
3627 if (Ty.getNumElements() != NumElts)
3628 return false;
3629 }
3630
3631 return true;
3632 }
3633 #endif
3634
3635 /// Fill \p DstOps with DstOps that, combined, cover \p Ty. These DstOps have
3636 /// either scalar type when \p NumElts = 1 or are vectors with \p NumElts
3637 /// elements. When Ty.getNumElements() is not a multiple of \p NumElts, the
3638 /// last DstOp (leftover) has fewer than \p NumElts elements.
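/// E.g. (illustrative) Ty = <7 x s32> with NumElts = 4 produces
/// DstOps = { <4 x s32>, <3 x s32> }.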
3639 static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
3640 unsigned NumElts) {
3641 LLT LeftoverTy;
3642 assert(Ty.isVector() && "Expected vector type");
3643 LLT EltTy = Ty.getElementType();
3644 LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
3645 int NumParts, NumLeftover;
3646 std::tie(NumParts, NumLeftover) =
3647 getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);
3648
3649 assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
3650 for (int i = 0; i < NumParts; ++i) {
3651 DstOps.push_back(NarrowTy);
3652 }
3653
3654 if (LeftoverTy.isValid()) {
3655 assert(NumLeftover == 1 && "expected exactly one leftover");
3656 DstOps.push_back(LeftoverTy);
3657 }
3658 }
3659
3660 /// Operand \p Op is used by \p N sub-instructions. Fill \p Ops with \p N
3661 /// SrcOps made from \p Op depending on the operand type.
3662 static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
3663 MachineOperand &Op) {
3664 for (unsigned i = 0; i < N; ++i) {
3665 if (Op.isReg())
3666 Ops.push_back(Op.getReg());
3667 else if (Op.isImm())
3668 Ops.push_back(Op.getImm());
3669 else if (Op.isPredicate())
3670 Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
3671 else
3672 llvm_unreachable("Unsupported type");
3673 }
3674 }
3675
3676 // Handle splitting vector operations which need to have the same number of
3677 // elements in each type index, but each type index may have a different element
3678 // type.
3679 //
3680 // e.g. <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
3681 // <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3682 // <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3683 //
3684 // Also handles some irregular breakdown cases, e.g.
3685 // e.g. <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
3686 // <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3687 // s64 = G_SHL s64, s32
3688 LegalizerHelper::LegalizeResult
3689 LegalizerHelper::fewerElementsVectorMultiEltType(
3690 GenericMachineInstr &MI, unsigned NumElts,
3691 std::initializer_list<unsigned> NonVecOpIndices) {
3692 assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
3693 "Non-compatible opcode or not specified non-vector operands");
3694 unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
3695
3696 unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
3697 unsigned NumDefs = MI.getNumDefs();
3698
3699 // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
3700 // Build instructions with DstOps so that an instruction found by CSE can be
3701 // used directly; CSE copies the found instruction into the given vreg when building with a vreg destination.
3702 SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
3703 // Output registers will be taken from created instructions.
3704 SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
3705 for (unsigned i = 0; i < NumDefs; ++i) {
3706 makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts);
3707 }
3708
3709 // Split vector input operands into sub-vectors with NumElts elts + Leftover.
3710 // Operands listed in NonVecOpIndices will be used as is without splitting;
3711 // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
3712 // scalar condition (op 1), immediate in sext_inreg (op 2).
3713 SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
3714 for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
3715 ++UseIdx, ++UseNo) {
3716 if (is_contained(NonVecOpIndices, UseIdx)) {
3717 broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(),
3718 MI.getOperand(UseIdx));
3719 } else {
3720 SmallVector<Register, 8> SplitPieces;
3721 extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces);
3722 for (auto Reg : SplitPieces)
3723 InputOpsPieces[UseNo].push_back(Reg);
3724 }
3725 }
3726
3727 unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
3728
3729 // Take i-th piece of each input operand split and build sub-vector/scalar
3730 // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
3731 for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
3732 SmallVector<DstOp, 2> Defs;
3733 for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
3734 Defs.push_back(OutputOpsPieces[DstNo][i]);
3735
3736 SmallVector<SrcOp, 3> Uses;
3737 for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
3738 Uses.push_back(InputOpsPieces[InputNo][i]);
3739
3740 auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags());
3741 for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
3742 OutputRegs[DstNo].push_back(I.getReg(DstNo));
3743 }
3744
3745 // Merge small outputs into MI's output for each def operand.
3746 if (NumLeftovers) {
3747 for (unsigned i = 0; i < NumDefs; ++i)
3748 mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]);
3749 } else {
3750 for (unsigned i = 0; i < NumDefs; ++i)
3751 MIRBuilder.buildMerge(MI.getReg(i), OutputRegs[i]);
3752 }
3753
3754 MI.eraseFromParent();
3755 return Legalized;
3756 }
3757
3758 LegalizerHelper::LegalizeResult
3759 LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
3760 unsigned NumElts) {
3761 unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
3762
3763 unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
3764 unsigned NumDefs = MI.getNumDefs();
3765
3766 SmallVector<DstOp, 8> OutputOpsPieces;
3767 SmallVector<Register, 8> OutputRegs;
3768 makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts);
3769
3770 // Instructions that perform the register split will be inserted in the basic
3771 // block where the register is defined (the basic block is in the next operand).
3772 SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
3773 for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
3774 UseIdx += 2, ++UseNo) {
3775 MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB();
3776 MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
3777 extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo]);
3778 }
3779
3780 // Build PHIs with fewer elements.
3781 unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
3782 MIRBuilder.setInsertPt(*MI.getParent(), MI);
3783 for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
3784 auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI);
3785 Phi.addDef(
3786 MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI)));
3787 OutputRegs.push_back(Phi.getReg(0));
3788
3789 for (unsigned j = 0; j < NumInputs / 2; ++j) {
3790 Phi.addUse(InputOpsPieces[j][i]);
3791 Phi.add(MI.getOperand(1 + j * 2 + 1));
3792 }
3793 }
3794
3795 // Merge small outputs into MI's def.
3796 if (NumLeftovers) {
3797 mergeMixedSubvectors(MI.getReg(0), OutputRegs);
3798 } else {
3799 MIRBuilder.buildMerge(MI.getReg(0), OutputRegs);
3800 }
3801
3802 MI.eraseFromParent();
3803 return Legalized;
3804 }
3805
3806 LegalizerHelper::LegalizeResult
3807 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
3808 unsigned TypeIdx,
3809 LLT NarrowTy) {
3810 const int NumDst = MI.getNumOperands() - 1;
3811 const Register SrcReg = MI.getOperand(NumDst).getReg();
3812 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3813 LLT SrcTy = MRI.getType(SrcReg);
3814
3815 if (TypeIdx != 1 || NarrowTy == DstTy)
3816 return UnableToLegalize;
3817
3818 // Requires compatible types. Otherwise SrcReg should have been defined by a
3819 // merge-like instruction that would get artifact-combined. Most likely the
3820 // instruction that defines SrcReg has to perform more/fewer elements
3821 // legalization compatible with NarrowTy.
3822 assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
3823 assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3824
3825 if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
3826 (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
3827 return UnableToLegalize;
3828
3829 // This is most likely DstTy (smaller than register size) packed in SrcTy
3830 // (larger than register size), and since the unmerge was not combined it will
3831 // be lowered to bit sequence extracts from the register. Unpack SrcTy into
3832 // NarrowTy (register size) pieces first, then unpack each NarrowTy piece into DstTy.
3833
3834 // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
3835 //
3836 // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
3837 // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
3838 // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
3839 auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg);
3840 const int NumUnmerge = Unmerge->getNumOperands() - 1;
3841 const int PartsPerUnmerge = NumDst / NumUnmerge;
3842
3843 for (int I = 0; I != NumUnmerge; ++I) {
3844 auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
3845
3846 for (int J = 0; J != PartsPerUnmerge; ++J)
3847 MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
3848 MIB.addUse(Unmerge.getReg(I));
3849 }
3850
3851 MI.eraseFromParent();
3852 return Legalized;
3853 }
3854
3855 LegalizerHelper::LegalizeResult
3856 LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
3857 LLT NarrowTy) {
3858 Register DstReg = MI.getOperand(0).getReg();
3859 LLT DstTy = MRI.getType(DstReg);
3860 LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
3861 // Requires compatible types. Otherwise the user of DstReg did not perform an
3862 // unmerge that should have been artifact-combined. Most likely the instruction
3863 // that uses DstReg has to do more/fewer elements legalization compatible with NarrowTy.
3864 assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
3865 assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3866 if (NarrowTy == SrcTy)
3867 return UnableToLegalize;
3868
3869 // This attempts to lower part of an LCMTy merge/unmerge sequence. Its intended
3870 // use is for old MIR tests. Since the changes to more/fewer elements
3871 // legalization, it should no longer be possible to generate MIR like this when
3872 // starting from LLVM IR because the LCMTy approach was replaced with merge/unmerge to vector elements.
3873 if (TypeIdx == 1) {
3874 assert(SrcTy.isVector() && "Expected vector types");
3875 assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3876 if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
3877 (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
3878 return UnableToLegalize;
3879 // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
3880 //
3881 // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
3882 // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
3883 // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
3884 // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
3885 // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
3886 // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11
3887
3888 SmallVector<Register, 8> Elts;
3889 LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType();
3890 for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
3891 auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg());
3892 for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
3893 Elts.push_back(Unmerge.getReg(j));
3894 }
3895
3896 SmallVector<Register, 8> NarrowTyElts;
3897 unsigned NumNarrowTyElts = NarrowTy.getNumElements();
3898 unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
3899 for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
3900 ++i, Offset += NumNarrowTyElts) {
3901 ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
3902 NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0));
3903 }
3904
3905 MIRBuilder.buildMerge(DstReg, NarrowTyElts);
3906 MI.eraseFromParent();
3907 return Legalized;
3908 }
3909
3910 assert(TypeIdx == 0 && "Bad type index");
3911 if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
3912 (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
3913 return UnableToLegalize;
3914
3915 // This is most likely SrcTy (smaller than register size) packed in DstTy
3916 // (larger than register size), and since the merge was not combined it will be
3917 // lowered to bit sequence packing into a register. Merge SrcTy into NarrowTy
3918 // (register size) pieces first, then merge each NarrowTy piece into DstTy.
3919
3920 // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
3921 //
3922 // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
3923 // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
3924 // %0:_(DstTy) = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
3925 SmallVector<Register, 8> NarrowTyElts;
3926 unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
3927 unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
3928 unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
3929 for (unsigned i = 0; i < NumParts; ++i) {
3930 SmallVector<Register, 8> Sources;
3931 for (unsigned j = 0; j < NumElts; ++j)
3932 Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg());
3933 NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Sources).getReg(0));
3934 }
3935
3936 MIRBuilder.buildMerge(DstReg, NarrowTyElts);
3937 MI.eraseFromParent();
3938 return Legalized;
3939 }
3940
3941 LegalizerHelper::LegalizeResult
3942 LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
3943 unsigned TypeIdx,
3944 LLT NarrowVecTy) {
3945 Register DstReg = MI.getOperand(0).getReg();
3946 Register SrcVec = MI.getOperand(1).getReg();
3947 Register InsertVal;
3948 bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
3949
3950 assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
3951 if (IsInsert)
3952 InsertVal = MI.getOperand(2).getReg();
3953
3954 Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
3955
3956 // TODO: Handle total scalarization case.
3957 if (!NarrowVecTy.isVector())
3958 return UnableToLegalize;
3959
3960 LLT VecTy = MRI.getType(SrcVec);
3961
3962 // If the index is a constant, we can really break this down as you would
3963 // expect, and index into the target size pieces.
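// E.g. (illustrative) extracting element 5 of <8 x s32> with
// NarrowVecTy = <4 x s32> becomes an extract of element 1 from the second
// <4 x s32> piece.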
3964 int64_t IdxVal;
3965 auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
3966 if (MaybeCst) {
3967 IdxVal = MaybeCst->Value.getSExtValue();
3968 // Avoid out of bounds indexing the pieces.
3969 if (IdxVal >= VecTy.getNumElements()) {
3970 MIRBuilder.buildUndef(DstReg);
3971 MI.eraseFromParent();
3972 return Legalized;
3973 }
3974
3975 SmallVector<Register, 8> VecParts;
3976 LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
3977
3978 // Build a sequence of NarrowTy pieces in VecParts for this operand.
3979 LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
3980 TargetOpcode::G_ANYEXT);
3981
3982 unsigned NewNumElts = NarrowVecTy.getNumElements();
3983
3984 LLT IdxTy = MRI.getType(Idx);
3985 int64_t PartIdx = IdxVal / NewNumElts;
3986 auto NewIdx =
3987 MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
3988
3989 if (IsInsert) {
3990 LLT PartTy = MRI.getType(VecParts[PartIdx]);
3991
3992 // Use the adjusted index to insert into one of the subvectors.
3993 auto InsertPart = MIRBuilder.buildInsertVectorElement(
3994 PartTy, VecParts[PartIdx], InsertVal, NewIdx);
3995 VecParts[PartIdx] = InsertPart.getReg(0);
3996
3997 // Recombine the inserted subvector with the others to reform the result
3998 // vector.
3999 buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
4000 } else {
4001 MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
4002 }
4003
4004 MI.eraseFromParent();
4005 return Legalized;
4006 }
4007
4008 // With a variable index, we can't perform the operation in a smaller type, so
4009 // we're forced to expand this.
4010 //
4011 // TODO: We could emit a chain of compare/select to figure out which piece to
4012 // index.
4013 return lowerExtractInsertVectorElt(MI);
4014 }
4015
4016 LegalizerHelper::LegalizeResult
4017 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
4018 LLT NarrowTy) {
4019 // FIXME: Don't know how to handle secondary types yet.
4020 if (TypeIdx != 0)
4021 return UnableToLegalize;
4022
4023 // This implementation doesn't work for atomics. Give up instead of doing
4024 // something invalid.
4025 if (LdStMI.isAtomic())
4026 return UnableToLegalize;
4027
4028 bool IsLoad = isa<GLoad>(LdStMI);
4029 Register ValReg = LdStMI.getReg(0);
4030 Register AddrReg = LdStMI.getPointerReg();
4031 LLT ValTy = MRI.getType(ValReg);
4032
4033 // FIXME: Do we need a distinct NarrowMemory legalize action?
4034 if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
4035 LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
4036 return UnableToLegalize;
4037 }
4038
4039 int NumParts = -1;
4040 int NumLeftover = -1;
4041 LLT LeftoverTy;
4042 SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
4043 if (IsLoad) {
4044 std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
4045 } else {
4046 if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
4047 NarrowLeftoverRegs)) {
4048 NumParts = NarrowRegs.size();
4049 NumLeftover = NarrowLeftoverRegs.size();
4050 }
4051 }
4052
4053 if (NumParts == -1)
4054 return UnableToLegalize;
4055
4056 LLT PtrTy = MRI.getType(AddrReg);
4057 const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
4058
4059 unsigned TotalSize = ValTy.getSizeInBits();
4060
4061 // Split the load/store into PartTy sized pieces starting at Offset. If this
4062 // is a load, return the new registers in ValRegs. For a store, each element
4063 // of ValRegs should be PartTy. Returns the next offset that needs to be
4064 // handled.
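// E.g. (illustrative, little endian) an s96 value with NarrowTy = s64 is split
// as one s64 piece at byte offset 0 followed by an s32 leftover piece at byte
// offset 8.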
4065 bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
4066 auto MMO = LdStMI.getMMO();
4067 auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
4068 unsigned NumParts, unsigned Offset) -> unsigned {
4069 MachineFunction &MF = MIRBuilder.getMF();
4070 unsigned PartSize = PartTy.getSizeInBits();
4071 for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
4072 ++Idx) {
4073 unsigned ByteOffset = Offset / 8;
4074 Register NewAddrReg;
4075
4076 MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
4077
4078 MachineMemOperand *NewMMO =
4079 MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
4080
4081 if (IsLoad) {
4082 Register Dst = MRI.createGenericVirtualRegister(PartTy);
4083 ValRegs.push_back(Dst);
4084 MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
4085 } else {
4086 MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
4087 }
4088 Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
4089 }
4090
4091 return Offset;
4092 };
4093
4094 unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
4095 unsigned HandledOffset =
4096 splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);
4097
4098 // Handle the rest of the register if this isn't an even type breakdown.
4099 if (LeftoverTy.isValid())
4100 splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);
4101
4102 if (IsLoad) {
4103 insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
4104 LeftoverTy, NarrowLeftoverRegs);
4105 }
4106
4107 LdStMI.eraseFromParent();
4108 return Legalized;
4109 }
4110
4111 LegalizerHelper::LegalizeResult
4112 LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
4113 LLT NarrowTy) {
4114 using namespace TargetOpcode;
4115 GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI);
4116 unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
4117
4118 switch (MI.getOpcode()) {
4119 case G_IMPLICIT_DEF:
4120 case G_TRUNC:
4121 case G_AND:
4122 case G_OR:
4123 case G_XOR:
4124 case G_ADD:
4125 case G_SUB:
4126 case G_MUL:
4127 case G_PTR_ADD:
4128 case G_SMULH:
4129 case G_UMULH:
4130 case G_FADD:
4131 case G_FMUL:
4132 case G_FSUB:
4133 case G_FNEG:
4134 case G_FABS:
4135 case G_FCANONICALIZE:
4136 case G_FDIV:
4137 case G_FREM:
4138 case G_FMA:
4139 case G_FMAD:
4140 case G_FPOW:
4141 case G_FEXP:
4142 case G_FEXP2:
4143 case G_FLOG:
4144 case G_FLOG2:
4145 case G_FLOG10:
4146 case G_FNEARBYINT:
4147 case G_FCEIL:
4148 case G_FFLOOR:
4149 case G_FRINT:
4150 case G_INTRINSIC_ROUND:
4151 case G_INTRINSIC_ROUNDEVEN:
4152 case G_INTRINSIC_TRUNC:
4153 case G_FCOS:
4154 case G_FSIN:
4155 case G_FSQRT:
4156 case G_BSWAP:
4157 case G_BITREVERSE:
4158 case G_SDIV:
4159 case G_UDIV:
4160 case G_SREM:
4161 case G_UREM:
4162 case G_SDIVREM:
4163 case G_UDIVREM:
4164 case G_SMIN:
4165 case G_SMAX:
4166 case G_UMIN:
4167 case G_UMAX:
4168 case G_ABS:
4169 case G_FMINNUM:
4170 case G_FMAXNUM:
4171 case G_FMINNUM_IEEE:
4172 case G_FMAXNUM_IEEE:
4173 case G_FMINIMUM:
4174 case G_FMAXIMUM:
4175 case G_FSHL:
4176 case G_FSHR:
4177 case G_ROTL:
4178 case G_ROTR:
4179 case G_FREEZE:
4180 case G_SADDSAT:
4181 case G_SSUBSAT:
4182 case G_UADDSAT:
4183 case G_USUBSAT:
4184 case G_UMULO:
4185 case G_SMULO:
4186 case G_SHL:
4187 case G_LSHR:
4188 case G_ASHR:
4189 case G_SSHLSAT:
4190 case G_USHLSAT:
4191 case G_CTLZ:
4192 case G_CTLZ_ZERO_UNDEF:
4193 case G_CTTZ:
4194 case G_CTTZ_ZERO_UNDEF:
4195 case G_CTPOP:
4196 case G_FCOPYSIGN:
4197 case G_ZEXT:
4198 case G_SEXT:
4199 case G_ANYEXT:
4200 case G_FPEXT:
4201 case G_FPTRUNC:
4202 case G_SITOFP:
4203 case G_UITOFP:
4204 case G_FPTOSI:
4205 case G_FPTOUI:
4206 case G_INTTOPTR:
4207 case G_PTRTOINT:
4208 case G_ADDRSPACE_CAST:
4209 case G_UADDO:
4210 case G_USUBO:
4211 case G_UADDE:
4212 case G_USUBE:
4213 case G_SADDO:
4214 case G_SSUBO:
4215 case G_SADDE:
4216 case G_SSUBE:
4217 return fewerElementsVectorMultiEltType(GMI, NumElts);
4218 case G_ICMP:
4219 case G_FCMP:
4220 return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cmp predicate*/});
4221 case G_SELECT:
4222 if (MRI.getType(MI.getOperand(1).getReg()).isVector())
4223 return fewerElementsVectorMultiEltType(GMI, NumElts);
4224 return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/});
4225 case G_PHI:
4226 return fewerElementsVectorPhi(GMI, NumElts);
4227 case G_UNMERGE_VALUES:
4228 return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
4229 case G_BUILD_VECTOR:
4230 assert(TypeIdx == 0 && "not a vector type index");
4231 return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4232 case G_CONCAT_VECTORS:
4233 if (TypeIdx != 1) // TODO: This probably does work as expected already.
4234 return UnableToLegalize;
4235 return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4236 case G_EXTRACT_VECTOR_ELT:
4237 case G_INSERT_VECTOR_ELT:
4238 return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
4239 case G_LOAD:
4240 case G_STORE:
4241 return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
4242 case G_SEXT_INREG:
4243 return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/});
4244 GISEL_VECREDUCE_CASES_NONSEQ
4245 return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
4246 case G_SHUFFLE_VECTOR:
4247 return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
4248 default:
4249 return UnableToLegalize;
4250 }
4251 }
4252
4253 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
4254 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4255 assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
4256 if (TypeIdx != 0)
4257 return UnableToLegalize;
4258
4259 Register DstReg = MI.getOperand(0).getReg();
4260 Register Src1Reg = MI.getOperand(1).getReg();
4261 Register Src2Reg = MI.getOperand(2).getReg();
4262 ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4263 LLT DstTy = MRI.getType(DstReg);
4264 LLT Src1Ty = MRI.getType(Src1Reg);
4265 LLT Src2Ty = MRI.getType(Src2Reg);
4266 // The shuffle should be canonicalized by now.
4267 if (DstTy != Src1Ty)
4268 return UnableToLegalize;
4269 if (DstTy != Src2Ty)
4270 return UnableToLegalize;
4271
4272 if (!isPowerOf2_32(DstTy.getNumElements()))
4273 return UnableToLegalize;
4274
4275 // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
4276 // Further legalization attempts will be needed to split it further.
4277 NarrowTy =
4278 DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
4279 unsigned NewElts = NarrowTy.getNumElements();
4280
4281 SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
4282 extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
4283 extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
4284 Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
4285 SplitSrc2Regs[1]};
4286
4287 Register Hi, Lo;
4288
4289 // If Lo or Hi uses elements from at most two of the four input vectors, then
4290 // express it as a vector shuffle of those two inputs. Otherwise extract the
4291 // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
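// E.g. (illustrative) when splitting a <8 x s32> shuffle, a half of the mask
// that only reads from Inputs[0] and Inputs[2] becomes a <4 x s32>
// G_SHUFFLE_VECTOR of those two pieces; a half that reads from three or more
// inputs falls back to the G_BUILD_VECTOR path.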
4292 SmallVector<int, 16> Ops;
4293 for (unsigned High = 0; High < 2; ++High) {
4294 Register &Output = High ? Hi : Lo;
4295
4296 // Build a shuffle mask for the output, discovering on the fly which
4297 // input vectors to use as shuffle operands (recorded in InputUsed).
4298 // If building a suitable shuffle vector proves too hard, then bail
4299 // out with useBuildVector set.
4300 unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
4301 unsigned FirstMaskIdx = High * NewElts;
4302 bool UseBuildVector = false;
4303 for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4304 // The mask element. This indexes into the input.
4305 int Idx = Mask[FirstMaskIdx + MaskOffset];
4306
4307 // The input vector this mask element indexes into.
4308 unsigned Input = (unsigned)Idx / NewElts;
4309
4310 if (Input >= array_lengthof(Inputs)) {
4311 // The mask element does not index into any input vector.
4312 Ops.push_back(-1);
4313 continue;
4314 }
4315
4316 // Turn the index into an offset from the start of the input vector.
4317 Idx -= Input * NewElts;
4318
4319 // Find or create a shuffle vector operand to hold this input.
4320 unsigned OpNo;
4321 for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
4322 if (InputUsed[OpNo] == Input) {
4323 // This input vector is already an operand.
4324 break;
4325 } else if (InputUsed[OpNo] == -1U) {
4326 // Create a new operand for this input vector.
4327 InputUsed[OpNo] = Input;
4328 break;
4329 }
4330 }
4331
4332 if (OpNo >= array_lengthof(InputUsed)) {
4333 // More than two input vectors used! Give up on trying to create a
4334 // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
4335 UseBuildVector = true;
4336 break;
4337 }
4338
4339 // Add the mask index for the new shuffle vector.
4340 Ops.push_back(Idx + OpNo * NewElts);
4341 }
4342
4343 if (UseBuildVector) {
4344 LLT EltTy = NarrowTy.getElementType();
4345 SmallVector<Register, 16> SVOps;
4346
4347 // Extract the input elements by hand.
4348 for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4349 // The mask element. This indexes into the input.
4350 int Idx = Mask[FirstMaskIdx + MaskOffset];
4351
4352 // The input vector this mask element indexes into.
4353 unsigned Input = (unsigned)Idx / NewElts;
4354
4355 if (Input >= array_lengthof(Inputs)) {
4356 // The mask element is "undef" or indexes off the end of the input.
4357 SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
4358 continue;
4359 }
4360
4361 // Turn the index into an offset from the start of the input vector.
4362 Idx -= Input * NewElts;
4363
4364 // Extract the vector element by hand.
4365 SVOps.push_back(MIRBuilder
4366 .buildExtractVectorElement(
4367 EltTy, Inputs[Input],
4368 MIRBuilder.buildConstant(LLT::scalar(32), Idx))
4369 .getReg(0));
4370 }
4371
4372 // Construct the Lo/Hi output using a G_BUILD_VECTOR.
4373 Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
4374 } else if (InputUsed[0] == -1U) {
4375 // No input vectors were used! The result is undefined.
4376 Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
4377 } else {
4378 Register Op0 = Inputs[InputUsed[0]];
4379 // If only one input was used, use an undefined vector for the other.
4380 Register Op1 = InputUsed[1] == -1U
4381 ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
4382 : Inputs[InputUsed[1]];
4383 // At least one input vector was used. Create a new shuffle vector.
4384 Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
4385 }
4386
4387 Ops.clear();
4388 }
4389
4390 MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
4391 MI.eraseFromParent();
4392 return Legalized;
4393 }
4394
4395 static unsigned getScalarOpcForReduction(unsigned Opc) {
4396 unsigned ScalarOpc;
4397 switch (Opc) {
4398 case TargetOpcode::G_VECREDUCE_FADD:
4399 ScalarOpc = TargetOpcode::G_FADD;
4400 break;
4401 case TargetOpcode::G_VECREDUCE_FMUL:
4402 ScalarOpc = TargetOpcode::G_FMUL;
4403 break;
4404 case TargetOpcode::G_VECREDUCE_FMAX:
4405 ScalarOpc = TargetOpcode::G_FMAXNUM;
4406 break;
4407 case TargetOpcode::G_VECREDUCE_FMIN:
4408 ScalarOpc = TargetOpcode::G_FMINNUM;
4409 break;
4410 case TargetOpcode::G_VECREDUCE_ADD:
4411 ScalarOpc = TargetOpcode::G_ADD;
4412 break;
4413 case TargetOpcode::G_VECREDUCE_MUL:
4414 ScalarOpc = TargetOpcode::G_MUL;
4415 break;
4416 case TargetOpcode::G_VECREDUCE_AND:
4417 ScalarOpc = TargetOpcode::G_AND;
4418 break;
4419 case TargetOpcode::G_VECREDUCE_OR:
4420 ScalarOpc = TargetOpcode::G_OR;
4421 break;
4422 case TargetOpcode::G_VECREDUCE_XOR:
4423 ScalarOpc = TargetOpcode::G_XOR;
4424 break;
4425 case TargetOpcode::G_VECREDUCE_SMAX:
4426 ScalarOpc = TargetOpcode::G_SMAX;
4427 break;
4428 case TargetOpcode::G_VECREDUCE_SMIN:
4429 ScalarOpc = TargetOpcode::G_SMIN;
4430 break;
4431 case TargetOpcode::G_VECREDUCE_UMAX:
4432 ScalarOpc = TargetOpcode::G_UMAX;
4433 break;
4434 case TargetOpcode::G_VECREDUCE_UMIN:
4435 ScalarOpc = TargetOpcode::G_UMIN;
4436 break;
4437 default:
4438 llvm_unreachable("Unhandled reduction");
4439 }
4440 return ScalarOpc;
4441 }
4442
4443 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
4444 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4445 unsigned Opc = MI.getOpcode();
4446 assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD &&
4447 Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL &&
4448 "Sequential reductions not expected");
4449
4450 if (TypeIdx != 1)
4451 return UnableToLegalize;
4452
4453 // The semantics of the normal non-sequential reductions allow us to freely
4454 // re-associate the operation.
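// E.g. (illustrative) G_VECREDUCE_ADD of <8 x s32> with NarrowTy = <4 x s32>
// can be split into two G_VECREDUCE_ADD of <4 x s32> whose scalar results are
// combined with a final G_ADD.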
4455 Register SrcReg = MI.getOperand(1).getReg();
4456 LLT SrcTy = MRI.getType(SrcReg);
4457 Register DstReg = MI.getOperand(0).getReg();
4458 LLT DstTy = MRI.getType(DstReg);
4459
4460 if (NarrowTy.isVector() &&
4461 (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
4462 return UnableToLegalize;
4463
4464 unsigned ScalarOpc = getScalarOpcForReduction(Opc);
4465 SmallVector<Register> SplitSrcs;
4466 // If NarrowTy is a scalar then we're being asked to scalarize.
4467 const unsigned NumParts =
4468 NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
4469 : SrcTy.getNumElements();
4470
4471 extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
4472 if (NarrowTy.isScalar()) {
4473 if (DstTy != NarrowTy)
4474 return UnableToLegalize; // FIXME: handle implicit extensions.
4475
4476 if (isPowerOf2_32(NumParts)) {
4477 // Generate a tree of scalar operations to reduce the critical path.
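// E.g. (illustrative) with four pieces, (s0 op s1) and (s2 op s3) can execute
// in parallel before a final combining op, giving depth log2(NumParts) rather
// than NumParts - 1.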
4478 SmallVector<Register> PartialResults;
4479 unsigned NumPartsLeft = NumParts;
4480 while (NumPartsLeft > 1) {
4481 for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
4482 PartialResults.emplace_back(
4483 MIRBuilder
4484 .buildInstr(ScalarOpc, {NarrowTy},
4485 {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
4486 .getReg(0));
4487 }
4488 SplitSrcs = PartialResults;
4489 PartialResults.clear();
4490 NumPartsLeft = SplitSrcs.size();
4491 }
4492 assert(SplitSrcs.size() == 1);
4493 MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
4494 MI.eraseFromParent();
4495 return Legalized;
4496 }
4497 // If we can't generate a tree, then just do sequential operations.
4498 Register Acc = SplitSrcs[0];
4499 for (unsigned Idx = 1; Idx < NumParts; ++Idx)
4500 Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
4501 .getReg(0);
4502 MIRBuilder.buildCopy(DstReg, Acc);
4503 MI.eraseFromParent();
4504 return Legalized;
4505 }
4506 SmallVector<Register> PartialReductions;
4507 for (unsigned Part = 0; Part < NumParts; ++Part) {
4508 PartialReductions.push_back(
4509 MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0));
4510 }
4511 
4513 // If the types involved are powers of 2, we can generate intermediate vector
4514 // ops, before generating a final reduction operation.
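// E.g. (illustrative) reducing <8 x s32> with NarrowTy = <4 x s32>:
//   %lo:_(<4 x s32>), %hi:_(<4 x s32>) = G_UNMERGE_VALUES %src
//   %sum:_(<4 x s32>) = G_ADD %lo, %hi
//   %res:_(s32) = G_VECREDUCE_ADD %sum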
4515 if (isPowerOf2_32(SrcTy.getNumElements()) &&
4516 isPowerOf2_32(NarrowTy.getNumElements())) {
4517 return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4518 }
4519
4520 Register Acc = PartialReductions[0];
4521 for (unsigned Part = 1; Part < NumParts; ++Part) {
4522 if (Part == NumParts - 1) {
4523 MIRBuilder.buildInstr(ScalarOpc, {DstReg},
4524 {Acc, PartialReductions[Part]});
4525 } else {
4526 Acc = MIRBuilder
4527 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
4528 .getReg(0);
4529 }
4530 }
4531 MI.eraseFromParent();
4532 return Legalized;
4533 }
4534
4535 LegalizerHelper::LegalizeResult
4536 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4537 LLT SrcTy, LLT NarrowTy,
4538 unsigned ScalarOpc) {
4539 SmallVector<Register> SplitSrcs;
4540 // Split the sources into NarrowTy size pieces.
4541 extractParts(SrcReg, NarrowTy,
4542 SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs);
4543 // We're going to do a tree reduction using vector operations until we have
4544 // one NarrowTy size value left.
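  // E.g. (an illustrative split, not taken from the source): reducing
  // <8 x s32> with NarrowTy <2 x s32> goes 4 pieces -> 2 -> 1 via
  // element-wise vector ops, and the surviving <2 x s32> feeds the final
  // G_VECREDUCE_* below.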
  while (SplitSrcs.size() > 1) {
    SmallVector<Register> PartialRdxs;
    for (unsigned Idx = 0; Idx < SplitSrcs.size() - 1; Idx += 2) {
      Register LHS = SplitSrcs[Idx];
      Register RHS = SplitSrcs[Idx + 1];
      // Create the intermediate vector op.
      Register Res =
          MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
      PartialRdxs.push_back(Res);
    }
    SplitSrcs = std::move(PartialRdxs);
  }
  // Finally generate the requested NarrowTy based reduction.
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(SplitSrcs[0]);
  Observer.changedInstr(MI);
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
                                             const LLT HalfTy,
                                             const LLT AmtTy) {
  Register InL = MRI.createGenericVirtualRegister(HalfTy);
  Register InH = MRI.createGenericVirtualRegister(HalfTy);
  MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));

  if (Amt.isZero()) {
    MIRBuilder.buildMerge(MI.getOperand(0), {InL, InH});
    MI.eraseFromParent();
    return Legalized;
  }

  LLT NVT = HalfTy;
  unsigned NVTBits = HalfTy.getSizeInBits();
  unsigned VTBits = 2 * NVTBits;
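
  // Illustrative example (not from the source): narrowing an s64 G_SHL by a
  // constant 40 into s32 halves takes the Amt.ugt(NVTBits) path below:
  // Lo becomes 0 and Hi becomes InL << (40 - 32).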
  SrcOp Lo(Register(0)), Hi(Register(0));
  if (MI.getOpcode() == TargetOpcode::G_SHL) {
    if (Amt.ugt(VTBits)) {
      Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt.ugt(NVTBits)) {
      Lo = MIRBuilder.buildConstant(NVT, 0);
      Hi = MIRBuilder.buildShl(NVT, InL,
                               MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
    } else if (Amt == NVTBits) {
      Lo = MIRBuilder.buildConstant(NVT, 0);
      Hi = InL;
    } else {
      Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
      auto OrLHS =
          MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
      auto OrRHS = MIRBuilder.buildLShr(
          NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
      Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
    }
  } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
    if (Amt.ugt(VTBits)) {
      Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt.ugt(NVTBits)) {
      Lo = MIRBuilder.buildLShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
      Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt == NVTBits) {
      Lo = InH;
      Hi = MIRBuilder.buildConstant(NVT, 0);
    } else {
      auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);

      auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
      Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
    }
  } else {
    if (Amt.ugt(VTBits)) {
      Hi = Lo = MIRBuilder.buildAShr(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else if (Amt.ugt(NVTBits)) {
      Lo = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
      Hi = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else if (Amt == NVTBits) {
      Lo = InH;
      Hi = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else {
      auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);

      auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
      Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
    }
  }

  MIRBuilder.buildMerge(MI.getOperand(0), {Lo, Hi});
  MI.eraseFromParent();

  return Legalized;
}

// TODO: Optimize if constant shift amount.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
                                   LLT RequestedTy) {
  if (TypeIdx == 1) {
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, RequestedTy, 2);
    Observer.changedInstr(MI);
    return Legalized;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  Register Amt = MI.getOperand(2).getReg();
  LLT ShiftAmtTy = MRI.getType(Amt);
  const unsigned DstEltSize = DstTy.getScalarSizeInBits();
  if (DstEltSize % 2 != 0)
    return UnableToLegalize;

  // Ignore the input type. We can only go to exactly half the size of the
  // input. If that isn't small enough, the resulting pieces will be further
  // legalized.
  const unsigned NewBitSize = DstEltSize / 2;
  const LLT HalfTy = LLT::scalar(NewBitSize);
  const LLT CondTy = LLT::scalar(1);

  if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
    return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
                                       ShiftAmtTy);
  }

  // TODO: Expand with known bits.

  // Handle the fully general expansion by an unknown amount.
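  // The approach: compute both the "short" (Amt < NewBitSize) and "long"
  // (Amt >= NewBitSize) results, then pick between them (and the unshifted
  // input for Amt == 0) with G_SELECTs, since Amt is unknown here.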
  auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);

  Register InL = MRI.createGenericVirtualRegister(HalfTy);
  Register InH = MRI.createGenericVirtualRegister(HalfTy);
  MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));

  auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
  auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);

  auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
  auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
  auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);

  Register ResultRegs[2];
  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL: {
    // Short: ShAmt < NewBitSize
    auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);

    auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
    auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
    auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);

    // Long: ShAmt >= NewBitSize
    auto LoL = MIRBuilder.buildConstant(HalfTy, 0); // Lo part is zero.
    auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.

    auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
    auto Hi = MIRBuilder.buildSelect(
        HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));

    ResultRegs[0] = Lo.getReg(0);
    ResultRegs[1] = Hi.getReg(0);
    break;
  }
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR: {
    // Short: ShAmt < NewBitSize
    auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});

    auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
    auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
    auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);

    // Long: ShAmt >= NewBitSize
    MachineInstrBuilder HiL;
    if (MI.getOpcode() == TargetOpcode::G_LSHR) {
      HiL = MIRBuilder.buildConstant(HalfTy, 0); // Hi part is zero.
    } else {
      auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
      HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt); // Sign of Hi part.
    }
    auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
                                     {InH, AmtExcess}); // Lo from Hi part.

    auto Lo = MIRBuilder.buildSelect(
        HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));

    auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);

    ResultRegs[0] = Lo.getReg(0);
    ResultRegs[1] = Hi.getReg(0);
    break;
  }
  default:
    llvm_unreachable("not a shift");
  }

  MIRBuilder.buildMerge(DstReg, ResultRegs);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
                                       LLT MoreTy) {
  assert(TypeIdx == 0 && "Expecting only Idx 0");

  Observer.changingInstr(MI);
  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
    MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
    moreElementsVectorSrc(MI, MoreTy, I);
  }

  MachineBasicBlock &MBB = *MI.getParent();
  MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
  moreElementsVectorDst(MI, MoreTy, 0);
  Observer.changedInstr(MI);
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                    LLT MoreTy) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_IMPLICIT_DEF:
  case TargetOpcode::G_LOAD: {
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_UADDSAT:
  case TargetOpcode::G_USUBSAT:
  case TargetOpcode::G_SADDSAT:
  case TargetOpcode::G_SSUBSAT:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_FMINIMUM:
  case TargetOpcode::G_FMAXIMUM: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FSHR:
  case TargetOpcode::G_FSHL: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT:
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_INSERT:
  case TargetOpcode::G_FREEZE:
  case TargetOpcode::G_FNEG:
  case TargetOpcode::G_FABS:
  case TargetOpcode::G_BSWAP:
  case TargetOpcode::G_FCANONICALIZE:
  case TargetOpcode::G_SEXT_INREG:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_SELECT: {
    Register DstReg = MI.getOperand(0).getReg();
    Register CondReg = MI.getOperand(1).getReg();
    LLT DstTy = MRI.getType(DstReg);
    LLT CondTy = MRI.getType(CondReg);
    if (TypeIdx == 1) {
      if (!CondTy.isScalar() ||
          DstTy.getElementCount() != MoreTy.getElementCount())
        return UnableToLegalize;

      // This is turning a scalar select of vectors into a vector
      // select. Broadcast the select condition.
      auto ShufSplat = MIRBuilder.buildShuffleSplat(MoreTy, CondReg);
      Observer.changingInstr(MI);
      MI.getOperand(1).setReg(ShufSplat.getReg(0));
      Observer.changedInstr(MI);
      return Legalized;
    }

    if (CondTy.isVector())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_UNMERGE_VALUES:
    return UnableToLegalize;
  case TargetOpcode::G_PHI:
    return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_BUILD_VECTOR: {
    SmallVector<SrcOp, 8> Elts;
    for (auto Op : MI.uses()) {
      Elts.push_back(Op.getReg());
    }

    for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
      Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType()));
    }

    MIRBuilder.buildDeleteTrailingVectorElements(
        MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts));
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_TRUNC: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  default:
    return UnableToLegalize;
  }
}

LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
                                           unsigned int TypeIdx, LLT MoreTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register Src1Reg = MI.getOperand(1).getReg();
  Register Src2Reg = MI.getOperand(2).getReg();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  LLT DstTy = MRI.getType(DstReg);
  LLT Src1Ty = MRI.getType(Src1Reg);
  LLT Src2Ty = MRI.getType(Src2Reg);
  unsigned NumElts = DstTy.getNumElements();
  unsigned WidenNumElts = MoreTy.getNumElements();

  // Expect a canonicalized shuffle.
  if (DstTy != Src1Ty || DstTy != Src2Ty)
    return UnableToLegalize;

  moreElementsVectorSrc(MI, MoreTy, 1);
  moreElementsVectorSrc(MI, MoreTy, 2);

  // Adjust mask based on new input vector length.
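  // E.g. (illustrative) widening a <2 x s32> shuffle with mask [1,2] to
  // <4 x s32>: index 2 referred to element 0 of the second source, which now
  // lives at index 4, so the mask becomes [1,4,-1,-1] after padding.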
  SmallVector<int, 16> NewMask;
  for (unsigned I = 0; I != NumElts; ++I) {
    int Idx = Mask[I];
    if (Idx < static_cast<int>(NumElts))
      NewMask.push_back(Idx);
    else
      NewMask.push_back(Idx - NumElts + WidenNumElts);
  }
  for (unsigned I = NumElts; I != WidenNumElts; ++I)
    NewMask.push_back(-1);
  moreElementsVectorDst(MI, MoreTy, 0);
  MIRBuilder.setInstrAndDebugLoc(MI);
  MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
                                MI.getOperand(1).getReg(),
                                MI.getOperand(2).getReg(), NewMask);
  MI.eraseFromParent();
  return Legalized;
}

void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
                                        ArrayRef<Register> Src1Regs,
                                        ArrayRef<Register> Src2Regs,
                                        LLT NarrowTy) {
  MachineIRBuilder &B = MIRBuilder;
  unsigned SrcParts = Src1Regs.size();
  unsigned DstParts = DstRegs.size();

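  // Schoolbook multiplication: part DstIdx of the product accumulates the low
  // halves of Src1[DstIdx - i] * Src2[i], the high halves (G_UMULH) feeding in
  // from the previous column, and that column's carries. E.g. (illustrative)
  // for a two-part multiply:
  //   Dst[0] = mul(S1[0], S2[0])
  //   Dst[1] = mul(S1[1], S2[0]) + mul(S1[0], S2[1]) + umulh(S1[0], S2[0])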
  unsigned DstIdx = 0; // Low bits of the result.
  Register FactorSum =
      B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
  DstRegs[DstIdx] = FactorSum;

  unsigned CarrySumPrevDstIdx;
  SmallVector<Register, 4> Factors;

  for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
    // Collect low parts of muls for DstIdx.
    for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
         i <= std::min(DstIdx, SrcParts - 1); ++i) {
      MachineInstrBuilder Mul =
          B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
      Factors.push_back(Mul.getReg(0));
    }
    // Collect high parts of muls from previous DstIdx.
    for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
         i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
      MachineInstrBuilder Umulh =
          B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
      Factors.push_back(Umulh.getReg(0));
    }
    // Add CarrySum from additions calculated for previous DstIdx.
    if (DstIdx != 1) {
      Factors.push_back(CarrySumPrevDstIdx);
    }

    Register CarrySum;
    // Add all factors and accumulate all carries into CarrySum.
    if (DstIdx != DstParts - 1) {
      MachineInstrBuilder Uaddo =
          B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
      FactorSum = Uaddo.getReg(0);
      CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i) {
        MachineInstrBuilder Uaddo =
            B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
        FactorSum = Uaddo.getReg(0);
        MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
        CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
      }
    } else {
      // Since the value for the next index is not calculated, neither is
      // CarrySum.
      FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i)
        FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
    }

    CarrySumPrevDstIdx = CarrySum;
    DstRegs[DstIdx] = FactorSum;
    Factors.clear();
  }
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstType = MRI.getType(DstReg);
  // FIXME: add support for vector types
  if (DstType.isVector())
    return UnableToLegalize;

  unsigned Opcode = MI.getOpcode();
  unsigned OpO, OpE, OpF;
  switch (Opcode) {
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_ADD:
    OpO = TargetOpcode::G_UADDO;
    OpE = TargetOpcode::G_UADDE;
    OpF = TargetOpcode::G_UADDE;
    if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
      OpF = TargetOpcode::G_SADDE;
    break;
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SUB:
    OpO = TargetOpcode::G_USUBO;
    OpE = TargetOpcode::G_USUBE;
    OpF = TargetOpcode::G_USUBE;
    if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
      OpF = TargetOpcode::G_SSUBE;
    break;
  default:
    llvm_unreachable("Unexpected add/sub opcode!");
  }

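  // E.g. (illustrative) narrowing an s64 G_ADD into s32 parts emits a G_UADDO
  // for the low halves and a G_UADDE for the high halves, threading each
  // piece's carry-out into the next piece.
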
  // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
  unsigned NumDefs = MI.getNumExplicitDefs();
  Register Src1 = MI.getOperand(NumDefs).getReg();
  Register Src2 = MI.getOperand(NumDefs + 1).getReg();
  Register CarryDst, CarryIn;
  if (NumDefs == 2)
    CarryDst = MI.getOperand(1).getReg();
  if (MI.getNumOperands() == NumDefs + 3)
    CarryIn = MI.getOperand(NumDefs + 2).getReg();

  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy, DummyTy;
  SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
  extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
  extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);

  int NarrowParts = Src1Regs.size();
  for (int I = 0, E = Src1Left.size(); I != E; ++I) {
    Src1Regs.push_back(Src1Left[I]);
    Src2Regs.push_back(Src2Left[I]);
  }
  DstRegs.reserve(Src1Regs.size());

  for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
    Register DstReg =
        MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
    Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
    // Forward the final carry-out to the destination register.
    if (i == e - 1 && CarryDst)
      CarryOut = CarryDst;

    if (!CarryIn) {
      MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i]});
    } else if (i == e - 1) {
      MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    } else {
      MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    }

    DstRegs.push_back(DstReg);
    CarryIn = CarryOut;
  }
  insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
              makeArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
              makeArrayRef(DstRegs).drop_front(NarrowParts));

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src1 = MI.getOperand(1).getReg();
  Register Src2 = MI.getOperand(2).getReg();

  LLT Ty = MRI.getType(DstReg);
  if (Ty.isVector())
    return UnableToLegalize;

  unsigned Size = Ty.getSizeInBits();
  unsigned NarrowSize = NarrowTy.getSizeInBits();
  if (Size % NarrowSize != 0)
    return UnableToLegalize;

  unsigned NumParts = Size / NarrowSize;
  bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
  unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);

  SmallVector<Register, 2> Src1Parts, Src2Parts;
  SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
  extractParts(Src1, NarrowTy, NumParts, Src1Parts);
  extractParts(Src2, NarrowTy, NumParts, Src2Parts);
  multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);

  // Take only the high half of the registers if this is a high mul.
  ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
  MIRBuilder.buildMerge(DstReg, DstRegs);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
                                   LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;

  Register Src = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(Src);

  // If all finite floats fit into the narrowed integer type, we can just swap
  // out the result type. This is practically only useful for conversions from
  // half to at least 16-bits, so just handle the one case.
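  // (The largest finite half value is 65504, so an unsigned result always
  // fits in 16 bits, while a signed result needs a 17th bit for the sign.)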
  if (SrcTy.getScalarType() != LLT::scalar(16) ||
      NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
    return UnableToLegalize;

  Observer.changingInstr(MI);
  narrowScalarDst(MI, NarrowTy, 0,
                  IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
  Observer.changedInstr(MI);
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
  // FIXME: add support for when SizeOp1 isn't an exact multiple of
  // NarrowSize.
  if (SizeOp1 % NarrowSize != 0)
    return UnableToLegalize;
  int NumParts = SizeOp1 / NarrowSize;

  SmallVector<Register, 2> SrcRegs, DstRegs;
  SmallVector<uint64_t, 2> Indexes;
  extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);

  Register OpReg = MI.getOperand(0).getReg();
  uint64_t OpStart = MI.getOperand(2).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int i = 0; i < NumParts; ++i) {
    unsigned SrcStart = i * NarrowSize;

    if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
      // No part of the extract uses this subregister, ignore it.
      continue;
    } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is extracted, forward the value.
      DstRegs.push_back(SrcRegs[i]);
      continue;
    }

    // Compute the offset and size of the overlap between this source piece
    // and the extracted range.
    int64_t ExtractOffset;
    uint64_t SegSize;
    if (OpStart < SrcStart) {
      ExtractOffset = 0;
      SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
    } else {
      ExtractOffset = OpStart - SrcStart;
      SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
    }

    Register SegReg = SrcRegs[i];
    if (ExtractOffset != 0 || SegSize != NarrowSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
    }

    DstRegs.push_back(SegReg);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (MRI.getType(DstReg).isVector())
    MIRBuilder.buildBuildVector(DstReg, DstRegs);
  else if (DstRegs.size() > 1)
    MIRBuilder.buildMerge(DstReg, DstRegs);
  else
    MIRBuilder.buildCopy(DstReg, DstRegs[0]);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
  SmallVector<uint64_t, 2> Indexes;
  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy;
  extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
               LeftoverRegs);

  for (Register Reg : LeftoverRegs)
    SrcRegs.push_back(Reg);

  uint64_t NarrowSize = NarrowTy.getSizeInBits();
  Register OpReg = MI.getOperand(2).getReg();
  uint64_t OpStart = MI.getOperand(3).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
    unsigned DstStart = I * NarrowSize;

    if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is defined by this insert, forward the new
      // value.
      DstRegs.push_back(OpReg);
      continue;
    }

    Register SrcReg = SrcRegs[I];
    if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
      // The leftover reg is smaller than NarrowTy, so we need to extend it.
      SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
    }

    if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
      // No part of the insert affects this subregister, forward the original.
      DstRegs.push_back(SrcReg);
      continue;
    }

    // Compute the overlap between the inserted value and this destination
    // piece: where to extract from OpReg and where to insert in the piece.
    int64_t ExtractOffset, InsertOffset;
    uint64_t SegSize;
    if (OpStart < DstStart) {
      InsertOffset = 0;
      ExtractOffset = DstStart - OpStart;
      SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
    } else {
      InsertOffset = OpStart - DstStart;
      ExtractOffset = 0;
      SegSize =
          std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
    }

    Register SegReg = OpReg;
    if (ExtractOffset != 0 || SegSize != OpSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
    }

    Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
    MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
    DstRegs.push_back(DstReg);
  }

  uint64_t WideSize = DstRegs.size() * NarrowSize;
  Register DstReg = MI.getOperand(0).getReg();
  if (WideSize > RegTy.getSizeInBits()) {
    Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
    MIRBuilder.buildMerge(MergeReg, DstRegs);
    MIRBuilder.buildTrunc(DstReg, MergeReg);
  } else
    MIRBuilder.buildMerge(DstReg, DstRegs);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
                                   LLT NarrowTy) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);

  assert(MI.getNumOperands() == 3 && TypeIdx == 0);

  SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
  SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
  SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
  LLT LeftoverTy;
  if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
                    Src0Regs, Src0LeftoverRegs))
    return UnableToLegalize;

  LLT Unused;
  if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
                    Src1Regs, Src1LeftoverRegs))
    llvm_unreachable("inconsistent extractParts result");

  for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
    auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
                                      {Src0Regs[I], Src1Regs[I]});
    DstRegs.push_back(Inst.getReg(0));
  }

  for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
    auto Inst = MIRBuilder.buildInstr(
        MI.getOpcode(),
        {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
    DstLeftoverRegs.push_back(Inst.getReg(0));
  }

  insertParts(DstReg, DstTy, NarrowTy, DstRegs,
              LeftoverTy, DstLeftoverRegs);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
                                 LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  SmallVector<Register, 8> Parts;
  LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
  LLT LCMTy =
      buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
  buildWidenedRemergeToDst(DstReg, LCMTy, Parts);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register CondReg = MI.getOperand(1).getReg();
  LLT CondTy = MRI.getType(CondReg);
  if (CondTy.isVector()) // TODO: Handle vselect
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);

  SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
  SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
  SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
  LLT LeftoverTy;
  if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
                    Src1Regs, Src1LeftoverRegs))
    return UnableToLegalize;

  LLT Unused;
  if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
                    Src2Regs, Src2LeftoverRegs))
    llvm_unreachable("inconsistent extractParts result");

  for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(NarrowTy,
                                         CondReg, Src1Regs[I], Src2Regs[I]);
    DstRegs.push_back(Select.getReg(0));
  }

  for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(
        LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
    DstLeftoverRegs.push_back(Select.getReg(0));
  }

  insertParts(DstReg, DstTy, NarrowTy, DstRegs,
              LeftoverTy, DstLeftoverRegs);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
    // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
    auto C_0 = B.buildConstant(NarrowTy, 0);
    auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
                                UnmergeSrc.getReg(1), C_0);
    auto LoCTLZ = IsUndef ?
      B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
      B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
    auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
    auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
    auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
    B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
    // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
    auto C_0 = B.buildConstant(NarrowTy, 0);
    auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
                                UnmergeSrc.getReg(0), C_0);
    auto HiCTTZ = IsUndef ?
      B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
      B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
    auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
    auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
    auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
    B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
                                   LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));

    auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
    auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
    MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitCount(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const auto &TII = MIRBuilder.getTII();
  auto isSupported = [this](const LegalityQuery &Q) {
    auto QAction = LI.getAction(Q).Action;
    return QAction == Legal || QAction == Libcall || QAction == Custom;
  };
  switch (Opc) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    // This trivially expands to CTLZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTLZ: {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);
    unsigned Len = SrcTy.getSizeInBits();

    if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
      auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
      auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // For now, we do this:
    // NewLen = NextPowerOf2(Len);
    // x = x | (x >> 1);
    // x = x | (x >> 2);
    // ...
    // x = x | (x >> 16);
    // x = x | (x >> 32); // for 64-bit input
    // ... up to a shift of NewLen/2
    // return Len - popcount(x);
    //
    // Ref: "Hacker's Delight" by Henry Warren
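    //
    // Worked example (illustrative): for an s8 input 0b00010000, the or-shift
    // cascade smears the top set bit downwards to give 0b00011111;
    // popcount(0b00011111) is 5 and 8 - 5 = 3 leading zeros.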
    Register Op = SrcReg;
    unsigned NewLen = PowerOf2Ceil(Len);
    for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
      auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
      auto MIBOp = MIRBuilder.buildOr(
          SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
      Op = MIBOp.getReg(0);
    }
    auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
    MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
                        MIBPop);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
    // This trivially expands to CTTZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTTZ: {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);

    unsigned Len = SrcTy.getSizeInBits();
    if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
      // zero.
      auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
      auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // For now, we use: { return popcount(~x & (x - 1)); }
    // unless the target has ctlz but not ctpop, in which case we use:
    // { return 32 - nlz(~x & (x - 1)); }
    // Ref: "Hacker's Delight" by Henry Warren
    auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
    auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
    auto MIBTmp = MIRBuilder.buildAnd(
        SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
    if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
        isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
      auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
      MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
                          MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
      MI.eraseFromParent();
      return Legalized;
    }
    MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
    MI.getOperand(1).setReg(MIBTmp.getReg(0));
    return Legalized;
  }
  case TargetOpcode::G_CTPOP: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(SrcReg);
    unsigned Size = Ty.getSizeInBits();
    MachineIRBuilder &B = MIRBuilder;

    // Count set bits in blocks of 2 bits. The default approach would be
    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 },
    // but we use the following formula instead:
    // B2Count = val - { (val >> 1) & 0x55555555 }
    // since it gives the same result in blocks of 2 with one less instruction.
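    // (Why this works: a 2-bit block holding bits b1,b0 has value 2*b1 + b0,
    // so subtracting the masked-down b1 leaves b1 + b0, the block's popcount.)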
    auto C_1 = B.buildConstant(Ty, 1);
    auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
    APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
    auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
    auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
    auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);

    // In order to get the count in blocks of 4, add values from adjacent
    // blocks of 2.
    // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
    auto C_2 = B.buildConstant(Ty, 2);
    auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
    APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
    auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
    auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
    auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
    auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);

    // For the count in blocks of 8 bits we don't have to mask the high 4 bits
    // before the addition, since the count value sits in the range {0,...,8}
    // and 4 bits are enough to hold such binary values. After the addition the
    // high 4 bits still hold the count of set bits in the high 4-bit block;
    // set them to zero to get the 8-bit result.
    // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
    auto C_4 = B.buildConstant(Ty, 4);
    auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
    auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
    APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
    auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
    auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);

    assert(Size <= 128 &&
           "Scalar size is too large for CTPOP lower algorithm");
    // 8 bits can hold the CTPOP result of a 128-bit int or smaller. A multiply
    // with this bitmask sets the 8 MSBs of ResTmp to the sum of all B8Counts
    // in the 8-bit blocks.
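    // E.g. for a 32-bit value, multiplying by 0x01010101 leaves
    // B0 + B1 + B2 + B3, the sum of the four per-byte counts, in the top byte.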
    auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
    auto ResTmp = B.buildMul(Ty, B8Count, MulMask);

    // Shift the count result from the 8 high bits to the low bits.
    auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
    B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);

    MI.eraseFromParent();
    return Legalized;
  }
  }
}

// Check that (every element of) Reg is undef or not an exact multiple of BW.
static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
                                        Register Reg, unsigned BW) {
  return matchUnaryPredicate(
      MRI, Reg,
      [=](const Constant *C) {
        // A null constant here means an undef.
        const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
        return !CI || CI->getValue().urem(BW) != 0;
      },
      /*AllowUndefs*/ true);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  Register Z = MI.getOperand(3).getReg();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(Z);

  unsigned BW = Ty.getScalarSizeInBits();

  if (!isPowerOf2_32(BW))
    return UnableToLegalize;

  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
  unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;

  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
    // fshl X, Y, Z -> fshr X, Y, -Z
    // fshr X, Y, Z -> fshl X, Y, -Z
    auto Zero = MIRBuilder.buildConstant(ShTy, 0);
    Z = MIRBuilder.buildSub(ShTy, Zero, Z).getReg(0);
  } else {
    // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
    // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
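    // Using ~Z instead of BW - Z keeps the amount within [0, BW) even when
    // Z % BW == 0; the extra fixed shift by 1 below restores the single bit
    // that BW - 1 - Z falls short of BW - Z.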
    auto One = MIRBuilder.buildConstant(ShTy, 1);
    if (IsFSHL) {
      Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
      X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
    } else {
      X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
      Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
    }

    Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
  }

  MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  Register Z = MI.getOperand(3).getReg();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(Z);

  const unsigned BW = Ty.getScalarSizeInBits();
  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;

  Register ShX, ShY;
  Register ShAmt, InvShAmt;

  // FIXME: Emit optimized urem by constant instead of letting it expand later.
  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
    // fshl: X << C | Y >> (BW - C)
    // fshr: X << (BW - C) | Y >> C
    // where C = Z % BW is not zero
    auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
    ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
    InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
    ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
    ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
  } else {
    // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
    // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
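    // Splitting the second shift into ">> 1 >>" (or "<< 1 <<") keeps both
    // amounts below BW even when Z % BW == 0, where a single shift by
    // BW - 0 would be out of range.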
    auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
    if (isPowerOf2_32(BW)) {
      // Z % BW -> Z & (BW - 1)
      ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
      // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
      auto NotZ = MIRBuilder.buildNot(ShTy, Z);
      InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
    } else {
      auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
      ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
      InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
    }

    auto One = MIRBuilder.buildConstant(ShTy, 1);
    if (IsFSHL) {
      ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
      auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
      ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
    } else {
      auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
      ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
      ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
    }
  }

  MIRBuilder.buildOr(Dst, ShX, ShY);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
  // These operations approximately do the following (while avoiding undefined
  // shifts by BW):
  // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
  // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(MI.getOperand(3).getReg());

  bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
  unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;

  // TODO: Use smarter heuristic that accounts for vector legalization.
  if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
    return lowerFunnelShiftAsShifts(MI);

  // This only works for powers of 2; fall back to shifts if it fails.
  LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
  if (Result == UnableToLegalize)
    return lowerFunnelShiftAsShifts(MI);
  return Result;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Amt = MI.getOperand(2).getReg();
  LLT AmtTy = MRI.getType(Amt);
  auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
  bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
  unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
  auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
  MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Amt = MI.getOperand(2).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  LLT AmtTy = MRI.getType(Amt);

  unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
  bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;

  MIRBuilder.setInstrAndDebugLoc(MI);

  // If a rotate in the other direction is supported, use it.
  unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
  if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
      isPowerOf2_32(EltSizeInBits))
    return lowerRotateWithReverseRotate(MI);

  // If a funnel shift is supported, use it.
  unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  bool IsFShLegal = false;
  if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
      LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
    auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
                                Register R3) {
      MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
      MI.eraseFromParent();
      return Legalized;
    };
    // If a funnel shift in the other direction is supported, use it.
    if (IsFShLegal) {
      return buildFunnelShift(FShOpc, Dst, Src, Amt);
    } else if (isPowerOf2_32(EltSizeInBits)) {
      Amt = MIRBuilder.buildNeg(AmtTy, Amt).getReg(0);
      return buildFunnelShift(RevFsh, Dst, Src, Amt);
    }
  }

  auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
  unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
  unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
  auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
  Register ShVal;
  Register RevShiftVal;
  if (isPowerOf2_32(EltSizeInBits)) {
    // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
    // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
    auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
    auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
    ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
    auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
    RevShiftVal =
        MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
  } else {
    // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
    // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
    auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
    auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
    ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
    auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
    auto One = MIRBuilder.buildConstant(AmtTy, 1);
    auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
    RevShiftVal =
        MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
  }
  MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
  MI.eraseFromParent();
  return Legalized;
}

// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
// representation.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);

  // unsigned cul2f(ulong u) {
  //   uint lz = clz(u);
  //   uint e = (u != 0) ? 127U + 63U - lz : 0;
  //   u = (u << lz) & 0x7fffffffffffffffUL;
  //   ulong t = u & 0xffffffffffUL;
  //   uint v = (e << 23) | (uint)(u >> 40);
  //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
  //   return as_float(v + r);
  // }
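  //
  // The final v + r step implements round-to-nearest-even on the 40 mantissa
  // bits shifted out: r rounds up when t is above the halfway point and ties
  // to even (via v & 1) exactly at it.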

  auto Zero32 = MIRBuilder.buildConstant(S32, 0);
  auto Zero64 = MIRBuilder.buildConstant(S64, 0);

  auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);

  auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
  auto Sub = MIRBuilder.buildSub(S32, K, LZ);

  auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
  auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);

  auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
  auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);

  auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);

  auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
  auto T = MIRBuilder.buildAnd(S64, U, Mask1);

  auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
  auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
  auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));

  auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
  auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
  auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
  auto One = MIRBuilder.buildConstant(S32, 1);

  auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
  auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
  auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
  MIRBuilder.buildAdd(Dst, V, R);

  MI.eraseFromParent();
  return Legalized;
}
5984
lowerUITOFP(MachineInstr & MI)5985 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
5986 Register Dst = MI.getOperand(0).getReg();
5987 Register Src = MI.getOperand(1).getReg();
5988 LLT DstTy = MRI.getType(Dst);
5989 LLT SrcTy = MRI.getType(Src);
5990
5991 if (SrcTy == LLT::scalar(1)) {
5992 auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
5993 auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
5994 MIRBuilder.buildSelect(Dst, Src, True, False);
5995 MI.eraseFromParent();
5996 return Legalized;
5997 }
5998
5999 if (SrcTy != LLT::scalar(64))
6000 return UnableToLegalize;
6001
6002 if (DstTy == LLT::scalar(32)) {
6003 // TODO: SelectionDAG has several alternative expansions to port which may
6004 // be more reasonable depending on the available instructions. If a target
6005 // has sitofp, does not have CTLZ, or can efficiently use f64 as an
6006 // intermediate type, this is probably worse.
6007 return lowerU64ToF32BitOps(MI);
6008 }
6009
6010 return UnableToLegalize;
6011 }
6012
6013 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
6014 Register Dst = MI.getOperand(0).getReg();
6015 Register Src = MI.getOperand(1).getReg();
6016 LLT DstTy = MRI.getType(Dst);
6017 LLT SrcTy = MRI.getType(Src);
6018
6019 const LLT S64 = LLT::scalar(64);
6020 const LLT S32 = LLT::scalar(32);
6021 const LLT S1 = LLT::scalar(1);
6022
6023 if (SrcTy == S1) {
6024 auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
6025 auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6026 MIRBuilder.buildSelect(Dst, Src, True, False);
6027 MI.eraseFromParent();
6028 return Legalized;
6029 }
6030
6031 if (SrcTy != S64)
6032 return UnableToLegalize;
6033
6034 if (DstTy == S32) {
6035 // signed cl2f(long l) {
6036 // long s = l >> 63;
6037 // float r = cul2f((l + s) ^ s);
6038 // return s ? -r : r;
6039 // }
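// Illustrative walk-through for l = -5: s = -1 and (l + s) ^ s = -6 ^ -1 = 5,
// computing |l| without a compare; cul2f(5) = 5.0f, and since s != 0 the
// select below flips the sign to produce -5.0f.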
6040 Register L = Src;
6041 auto SignBit = MIRBuilder.buildConstant(S64, 63);
6042 auto S = MIRBuilder.buildAShr(S64, L, SignBit);
6043
6044 auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
6045 auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
6046 auto R = MIRBuilder.buildUITOFP(S32, Xor);
6047
6048 auto RNeg = MIRBuilder.buildFNeg(S32, R);
6049 auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
6050 MIRBuilder.buildConstant(S64, 0));
6051 MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
6052 MI.eraseFromParent();
6053 return Legalized;
6054 }
6055
6056 return UnableToLegalize;
6057 }
6058
6059 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
6060 Register Dst = MI.getOperand(0).getReg();
6061 Register Src = MI.getOperand(1).getReg();
6062 LLT DstTy = MRI.getType(Dst);
6063 LLT SrcTy = MRI.getType(Src);
6064 const LLT S64 = LLT::scalar(64);
6065 const LLT S32 = LLT::scalar(32);
6066
6067 if (SrcTy != S64 && SrcTy != S32)
6068 return UnableToLegalize;
6069 if (DstTy != S32 && DstTy != S64)
6070 return UnableToLegalize;
6071
6072 // FPTOSI gives the same result as FPTOUI for fp values in the positive
6073 // signed range. FPTOUI must also handle fp values that convert to unsigned
6074 // integers greater than or equal to 2^31 (float) or 2^63 (double); for brevity, 2^Exp.
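// For example, for an f32 -> u32 conversion of Value = 2^31: the FCMP below is
// false (2^31 is not less than the threshold 2^31), so the result is
// FPTOSI(2^31 - 2^31) = 0 XORed with 0x80000000, i.e. exactly 2^31.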
6075
6076 APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
6077 APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
6078 : APFloat::IEEEdouble(),
6079 APInt::getZero(SrcTy.getSizeInBits()));
6080 TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
6081
6082 MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
6083
6084 MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
6085 // For fp values greater than or equal to Threshold (2^Exp), we use FPTOSI on
6086 // (Value - 2^Exp) and add 2^Exp back by setting the highest bit of the result.
6087 MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
6088 MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
6089 MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
6090 MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
6091
6092 const LLT S1 = LLT::scalar(1);
6093
6094 MachineInstrBuilder FCMP =
6095 MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
6096 MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
6097
6098 MI.eraseFromParent();
6099 return Legalized;
6100 }
6101
6102 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
6103 Register Dst = MI.getOperand(0).getReg();
6104 Register Src = MI.getOperand(1).getReg();
6105 LLT DstTy = MRI.getType(Dst);
6106 LLT SrcTy = MRI.getType(Src);
6107 const LLT S64 = LLT::scalar(64);
6108 const LLT S32 = LLT::scalar(32);
6109
6110 // FIXME: Only f32 to i64 conversions are supported.
6111 if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
6112 return UnableToLegalize;
6113
6114 // Expand f32 -> i64 conversion
6115 // This algorithm comes from compiler-rt's implementation of fixsfdi:
6116 // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
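// For example, Src = 2^40 has exponent bits 167, so Exponent = 167 - 127 = 40;
// since 40 > 23 the significand with its implicit bit set (0x800000) is
// shifted left by 40 - 23 = 17, giving exactly 2^40. Inputs with Exponent < 0
// have magnitude below 1.0 and truncate to 0 through the final select.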
6117
6118 unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
6119
6120 auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
6121 auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);
6122
6123 auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
6124 auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
6125
6126 auto SignMask = MIRBuilder.buildConstant(SrcTy,
6127 APInt::getSignMask(SrcEltBits));
6128 auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
6129 auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
6130 auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
6131 Sign = MIRBuilder.buildSExt(DstTy, Sign);
6132
6133 auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
6134 auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
6135 auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);
6136
6137 auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
6138 R = MIRBuilder.buildZExt(DstTy, R);
6139
6140 auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
6141 auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
6142 auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
6143 auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);
6144
6145 auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
6146 auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
6147
6148 const LLT S1 = LLT::scalar(1);
6149 auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
6150 S1, Exponent, ExponentLoBit);
6151
6152 R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
6153
6154 auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
6155 auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);
6156
6157 auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
6158
6159 auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
6160 S1, Exponent, ZeroSrcTy);
6161
6162 auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
6163 MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
6164
6165 MI.eraseFromParent();
6166 return Legalized;
6167 }
6168
6169 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
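// In the expansion below, E == 1039 corresponds to a source exponent field of
// 0x7ff (2047 - 1023 + 15), i.e. an f64 Inf or NaN, and 0x7c00 is the f16 Inf
// bit pattern; the D path assembles the rounded result when E < 1 (denormal).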
6170 LegalizerHelper::LegalizeResult
6171 LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
6172 Register Dst = MI.getOperand(0).getReg();
6173 Register Src = MI.getOperand(1).getReg();
6174
6175 if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
6176 return UnableToLegalize;
6177
6178 const unsigned ExpMask = 0x7ff;
6179 const unsigned ExpBiasf64 = 1023;
6180 const unsigned ExpBiasf16 = 15;
6181 const LLT S32 = LLT::scalar(32);
6182 const LLT S1 = LLT::scalar(1);
6183
6184 auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
6185 Register U = Unmerge.getReg(0);
6186 Register UH = Unmerge.getReg(1);
6187
6188 auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
6189 E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));
6190
6191 // Subtract the fp64 exponent bias (1023) to get the real exponent and
6192 // add the f16 bias (15) to get the biased exponent for the f16 format.
6193 E = MIRBuilder.buildAdd(
6194 S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
6195
6196 auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
6197 M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
6198
6199 auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
6200 MIRBuilder.buildConstant(S32, 0x1ff));
6201 MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
6202
6203 auto Zero = MIRBuilder.buildConstant(S32, 0);
6204 auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
6205 auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
6206 M = MIRBuilder.buildOr(S32, M, Lo40Set);
6207
6208 // (M != 0 ? 0x0200 : 0) | 0x7c00;
6209 auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
6210 auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
6211 auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);
6212
6213 auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
6214 auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);
6215
6216 // N = M | (E << 12);
6217 auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
6218 auto N = MIRBuilder.buildOr(S32, M, EShl12);
6219
6220 // B = clamp(1-E, 0, 13);
6221 auto One = MIRBuilder.buildConstant(S32, 1);
6222 auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
6223 auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
6224 B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
6225
6226 auto SigSetHigh = MIRBuilder.buildOr(S32, M,
6227 MIRBuilder.buildConstant(S32, 0x1000));
6228
6229 auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
6230 auto D0 = MIRBuilder.buildShl(S32, D, B);
6231
6232 auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
6233 D0, SigSetHigh);
6234 auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
6235 D = MIRBuilder.buildOr(S32, D, D1);
6236
6237 auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
6238 auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);
6239
6240 auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
6241 V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));
6242
6243 auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
6244 MIRBuilder.buildConstant(S32, 3));
6245 auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);
6246
6247 auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
6248 MIRBuilder.buildConstant(S32, 5));
6249 auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);
6250
6251 V1 = MIRBuilder.buildOr(S32, V0, V1);
6252 V = MIRBuilder.buildAdd(S32, V, V1);
6253
6254 auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1,
6255 E, MIRBuilder.buildConstant(S32, 30));
6256 V = MIRBuilder.buildSelect(S32, CmpEGt30,
6257 MIRBuilder.buildConstant(S32, 0x7c00), V);
6258
6259 auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
6260 E, MIRBuilder.buildConstant(S32, 1039));
6261 V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);
6262
6263 // Extract the sign bit.
6264 auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
6265 Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));
6266
6267 // Insert the sign bit
6268 V = MIRBuilder.buildOr(S32, Sign, V);
6269
6270 MIRBuilder.buildTrunc(Dst, V);
6271 MI.eraseFromParent();
6272 return Legalized;
6273 }
6274
6275 LegalizerHelper::LegalizeResult
6276 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6277 Register Dst = MI.getOperand(0).getReg();
6278 Register Src = MI.getOperand(1).getReg();
6279
6280 LLT DstTy = MRI.getType(Dst);
6281 LLT SrcTy = MRI.getType(Src);
6282 const LLT S64 = LLT::scalar(64);
6283 const LLT S16 = LLT::scalar(16);
6284
6285 if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6286 return lowerFPTRUNC_F64_TO_F16(MI);
6287
6288 return UnableToLegalize;
6289 }
6290
6291 // TODO: If RHS is a constant SelectionDAGBuilder expands this into a
6292 // multiplication tree.
6293 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6294 Register Dst = MI.getOperand(0).getReg();
6295 Register Src0 = MI.getOperand(1).getReg();
6296 Register Src1 = MI.getOperand(2).getReg();
6297 LLT Ty = MRI.getType(Dst);
6298
6299 auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
6300 MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
6301 MI.eraseFromParent();
6302 return Legalized;
6303 }
6304
6305 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6306 switch (Opc) {
6307 case TargetOpcode::G_SMIN:
6308 return CmpInst::ICMP_SLT;
6309 case TargetOpcode::G_SMAX:
6310 return CmpInst::ICMP_SGT;
6311 case TargetOpcode::G_UMIN:
6312 return CmpInst::ICMP_ULT;
6313 case TargetOpcode::G_UMAX:
6314 return CmpInst::ICMP_UGT;
6315 default:
6316 llvm_unreachable("not in integer min/max");
6317 }
6318 }
6319
6320 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6321 Register Dst = MI.getOperand(0).getReg();
6322 Register Src0 = MI.getOperand(1).getReg();
6323 Register Src1 = MI.getOperand(2).getReg();
6324
6325 const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
6326 LLT CmpType = MRI.getType(Dst).changeElementSize(1);
6327
6328 auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
6329 MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
6330
6331 MI.eraseFromParent();
6332 return Legalized;
6333 }
6334
6335 LegalizerHelper::LegalizeResult
6336 LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
6337 Register Dst = MI.getOperand(0).getReg();
6338 Register Src0 = MI.getOperand(1).getReg();
6339 Register Src1 = MI.getOperand(2).getReg();
6340
6341 const LLT Src0Ty = MRI.getType(Src0);
6342 const LLT Src1Ty = MRI.getType(Src1);
6343
6344 const int Src0Size = Src0Ty.getScalarSizeInBits();
6345 const int Src1Size = Src1Ty.getScalarSizeInBits();
6346
6347 auto SignBitMask = MIRBuilder.buildConstant(
6348 Src0Ty, APInt::getSignMask(Src0Size));
6349
6350 auto NotSignBitMask = MIRBuilder.buildConstant(
6351 Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
6352
6353 Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
6354 Register And1;
6355 if (Src0Ty == Src1Ty) {
6356 And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
6357 } else if (Src0Size > Src1Size) {
6358 auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
6359 auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
6360 auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
6361 And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
6362 } else {
6363 auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
6364 auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
6365 auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
6366 And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
6367 }
6368
6369 // Be careful about setting nsz/nnan/ninf on every instruction, since the
6370 // constants are a nan and -0.0, but the final result should preserve
6371 // everything.
6372 unsigned Flags = MI.getFlags();
6373 MIRBuilder.buildOr(Dst, And0, And1, Flags);
6374
6375 MI.eraseFromParent();
6376 return Legalized;
6377 }
6378
6379 LegalizerHelper::LegalizeResult
6380 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
6381 unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
6382 TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
6383
6384 Register Dst = MI.getOperand(0).getReg();
6385 Register Src0 = MI.getOperand(1).getReg();
6386 Register Src1 = MI.getOperand(2).getReg();
6387 LLT Ty = MRI.getType(Dst);
6388
6389 if (!MI.getFlag(MachineInstr::FmNoNans)) {
6390 // Insert canonicalizes if it's possible we need to quiet to get correct
6391 // sNaN behavior.
6392
6393 // Note this must be done here, and not as an optimization combine in the
6394 // absence of a dedicated quiet-sNaN instruction, as we're using an
6395 // omni-purpose G_FCANONICALIZE.
6396 if (!isKnownNeverSNaN(Src0, MRI))
6397 Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
6398
6399 if (!isKnownNeverSNaN(Src1, MRI))
6400 Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
6401 }
6402
6403 // If there are no nans, it's safe to simply replace this with the non-IEEE
6404 // version.
6405 MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
6406 MI.eraseFromParent();
6407 return Legalized;
6408 }
6409
6410 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6411 // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6412 Register DstReg = MI.getOperand(0).getReg();
6413 LLT Ty = MRI.getType(DstReg);
6414 unsigned Flags = MI.getFlags();
6415
6416 auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
6417 Flags);
6418 MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
6419 MI.eraseFromParent();
6420 return Legalized;
6421 }
6422
6423 LegalizerHelper::LegalizeResult
6424 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
6425 Register DstReg = MI.getOperand(0).getReg();
6426 Register X = MI.getOperand(1).getReg();
6427 const unsigned Flags = MI.getFlags();
6428 const LLT Ty = MRI.getType(DstReg);
6429 const LLT CondTy = Ty.changeElementSize(1);
6430
6431 // round(x) =>
6432 // t = trunc(x);
6433 // d = fabs(x - t);
6434 // o = copysign(1.0f, x);
6435 // return t + (d >= 0.5 ? o : 0.0);
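// For example, x = -2.5 gives t = -2.0, d = 0.5 and o = -1.0; since d >= 0.5
// the result is t + o = -3.0, so halfway cases round away from zero.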
6436
6437 auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
6438
6439 auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
6440 auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
6441 auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6442 auto One = MIRBuilder.buildFConstant(Ty, 1.0);
6443 auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
6444 auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);
6445
6446 auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
6447 Flags);
6448 auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);
6449
6450 MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);
6451
6452 MI.eraseFromParent();
6453 return Legalized;
6454 }
6455
6456 LegalizerHelper::LegalizeResult
6457 LegalizerHelper::lowerFFloor(MachineInstr &MI) {
6458 Register DstReg = MI.getOperand(0).getReg();
6459 Register SrcReg = MI.getOperand(1).getReg();
6460 unsigned Flags = MI.getFlags();
6461 LLT Ty = MRI.getType(DstReg);
6462 const LLT CondTy = Ty.changeElementSize(1);
6463
6464 // result = trunc(src);
6465 // if (src < 0.0 && src != result)
6466 // result += -1.0.
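// For example, src = -1.5: trunc gives -1.0; src is negative and differs from
// its truncation, so the s1 true value is sign-extended by G_SITOFP to -1.0
// and the result is -1.0 + -1.0 = -2.0.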
6467
6468 auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
6469 auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6470
6471 auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
6472 SrcReg, Zero, Flags);
6473 auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
6474 SrcReg, Trunc, Flags);
6475 auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
6476 auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
6477
6478 MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
6479 MI.eraseFromParent();
6480 return Legalized;
6481 }
6482
6483 LegalizerHelper::LegalizeResult
6484 LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
6485 const unsigned NumOps = MI.getNumOperands();
6486 Register DstReg = MI.getOperand(0).getReg();
6487 Register Src0Reg = MI.getOperand(1).getReg();
6488 LLT DstTy = MRI.getType(DstReg);
6489 LLT SrcTy = MRI.getType(Src0Reg);
6490 unsigned PartSize = SrcTy.getSizeInBits();
6491
6492 LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
6493 Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
6494
6495 for (unsigned I = 2; I != NumOps; ++I) {
6496 const unsigned Offset = (I - 1) * PartSize;
6497
6498 Register SrcReg = MI.getOperand(I).getReg();
6499 auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
6500
6501 Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
6502 MRI.createGenericVirtualRegister(WideTy);
6503
6504 auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
6505 auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
6506 MIRBuilder.buildOr(NextResult, ResultReg, Shl);
6507 ResultReg = NextResult;
6508 }
6509
6510 if (DstTy.isPointer()) {
6511 if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
6512 DstTy.getAddressSpace())) {
6513 LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
6514 return UnableToLegalize;
6515 }
6516
6517 MIRBuilder.buildIntToPtr(DstReg, ResultReg);
6518 }
6519
6520 MI.eraseFromParent();
6521 return Legalized;
6522 }
6523
6524 LegalizerHelper::LegalizeResult
6525 LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
6526 const unsigned NumDst = MI.getNumOperands() - 1;
6527 Register SrcReg = MI.getOperand(NumDst).getReg();
6528 Register Dst0Reg = MI.getOperand(0).getReg();
6529 LLT DstTy = MRI.getType(Dst0Reg);
6530 if (DstTy.isPointer())
6531 return UnableToLegalize; // TODO
6532
6533 SrcReg = coerceToScalar(SrcReg);
6534 if (!SrcReg)
6535 return UnableToLegalize;
6536
6537 // Expand scalarizing unmerge as bitcast to integer and shift.
6538 LLT IntTy = MRI.getType(SrcReg);
6539
6540 MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
6541
6542 const unsigned DstSize = DstTy.getSizeInBits();
6543 unsigned Offset = DstSize;
6544 for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
6545 auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
6546 auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
6547 MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
6548 }
6549
6550 MI.eraseFromParent();
6551 return Legalized;
6552 }
6553
6554 /// Lower a vector extract or insert by writing the vector to a stack temporary
6555 /// and reloading the element or vector.
6556 ///
6557 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
6558 /// =>
6559 /// %stack_temp = G_FRAME_INDEX
6560 /// G_STORE %vec, %stack_temp
6561 /// %idx = clamp(%idx, %vec.getNumElements())
6562 /// %element_ptr = G_PTR_ADD %stack_temp, %idx
6563 /// %dst = G_LOAD %element_ptr
6564 LegalizerHelper::LegalizeResult
6565 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
6566 Register DstReg = MI.getOperand(0).getReg();
6567 Register SrcVec = MI.getOperand(1).getReg();
6568 Register InsertVal;
6569 if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
6570 InsertVal = MI.getOperand(2).getReg();
6571
6572 Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
6573
6574 LLT VecTy = MRI.getType(SrcVec);
6575 LLT EltTy = VecTy.getElementType();
6576 unsigned NumElts = VecTy.getNumElements();
6577
6578 int64_t IdxVal;
6579 if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal >= 0 && IdxVal < NumElts) {
6580 SmallVector<Register, 8> SrcRegs;
6581 extractParts(SrcVec, EltTy, NumElts, SrcRegs);
6582
6583 if (InsertVal) {
6584 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
6585 MIRBuilder.buildMerge(DstReg, SrcRegs);
6586 } else {
6587 MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]);
6588 }
6589
6590 MI.eraseFromParent();
6591 return Legalized;
6592 }
6593
6594 if (!EltTy.isByteSized()) { // Not implemented.
6595 LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
6596 return UnableToLegalize;
6597 }
6598
6599 unsigned EltBytes = EltTy.getSizeInBytes();
6600 Align VecAlign = getStackTemporaryAlignment(VecTy);
6601 Align EltAlign;
6602
6603 MachinePointerInfo PtrInfo;
6604 auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()),
6605 VecAlign, PtrInfo);
6606 MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
6607
6608 // Get the pointer to the element, and be sure not to hit undefined behavior
6609 // if the index is out of bounds.
6610 Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
6611
6612 if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
6613 int64_t Offset = IdxVal * EltBytes;
6614 PtrInfo = PtrInfo.getWithOffset(Offset);
6615 EltAlign = commonAlignment(VecAlign, Offset);
6616 } else {
6617 // We lose information with a variable offset.
6618 EltAlign = getStackTemporaryAlignment(EltTy);
6619 PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
6620 }
6621
6622 if (InsertVal) {
6623 // Write the inserted element
6624 MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
6625
6626 // Reload the whole vector.
6627 MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
6628 } else {
6629 MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
6630 }
6631
6632 MI.eraseFromParent();
6633 return Legalized;
6634 }
6635
6636 LegalizerHelper::LegalizeResult
6637 LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
6638 Register DstReg = MI.getOperand(0).getReg();
6639 Register Src0Reg = MI.getOperand(1).getReg();
6640 Register Src1Reg = MI.getOperand(2).getReg();
6641 LLT Src0Ty = MRI.getType(Src0Reg);
6642 LLT DstTy = MRI.getType(DstReg);
6643 LLT IdxTy = LLT::scalar(32);
6644
6645 ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
6646
6647 if (DstTy.isScalar()) {
6648 if (Src0Ty.isVector())
6649 return UnableToLegalize;
6650
6651 // This is just a SELECT.
6652 assert(Mask.size() == 1 && "Expected a single mask element");
6653 Register Val;
6654 if (Mask[0] < 0 || Mask[0] > 1)
6655 Val = MIRBuilder.buildUndef(DstTy).getReg(0);
6656 else
6657 Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
6658 MIRBuilder.buildCopy(DstReg, Val);
6659 MI.eraseFromParent();
6660 return Legalized;
6661 }
6662
6663 Register Undef;
6664 SmallVector<Register, 32> BuildVec;
6665 LLT EltTy = DstTy.getElementType();
6666
6667 for (int Idx : Mask) {
6668 if (Idx < 0) {
6669 if (!Undef.isValid())
6670 Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
6671 BuildVec.push_back(Undef);
6672 continue;
6673 }
6674
6675 if (Src0Ty.isScalar()) {
6676 BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
6677 } else {
6678 int NumElts = Src0Ty.getNumElements();
6679 Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
6680 int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
6681 auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
6682 auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
6683 BuildVec.push_back(Extract.getReg(0));
6684 }
6685 }
6686
6687 MIRBuilder.buildBuildVector(DstReg, BuildVec);
6688 MI.eraseFromParent();
6689 return Legalized;
6690 }
6691
6692 LegalizerHelper::LegalizeResult
6693 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
6694 const auto &MF = *MI.getMF();
6695 const auto &TFI = *MF.getSubtarget().getFrameLowering();
6696 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
6697 return UnableToLegalize;
6698
6699 Register Dst = MI.getOperand(0).getReg();
6700 Register AllocSize = MI.getOperand(1).getReg();
6701 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
6702
6703 LLT PtrTy = MRI.getType(Dst);
6704 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
6705
6706 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
6707 auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
6708 SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
6709
6710 // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
6711 // have to generate an extra instruction to negate the alloc and then use
6712 // G_PTR_ADD to add the negative offset.
6713 auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
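// For example, with a 16-byte alignment the mask below is ~15; AND-ing the
// decremented SP with it rounds the allocation down to a 16-byte boundary,
// the correct direction for a downward-growing stack.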
6714 if (Alignment > Align(1)) {
6715 APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
6716 AlignMask.negate();
6717 auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
6718 Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
6719 }
6720
6721 SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
6722 MIRBuilder.buildCopy(SPReg, SPTmp);
6723 MIRBuilder.buildCopy(Dst, SPTmp);
6724
6725 MI.eraseFromParent();
6726 return Legalized;
6727 }
6728
6729 LegalizerHelper::LegalizeResult
6730 LegalizerHelper::lowerExtract(MachineInstr &MI) {
6731 Register Dst = MI.getOperand(0).getReg();
6732 Register Src = MI.getOperand(1).getReg();
6733 unsigned Offset = MI.getOperand(2).getImm();
6734
6735 LLT DstTy = MRI.getType(Dst);
6736 LLT SrcTy = MRI.getType(Src);
6737
6738 // Extract sub-vector or one element
6739 if (SrcTy.isVector()) {
6740 unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
6741 unsigned DstSize = DstTy.getSizeInBits();
6742
6743 if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
6744 (Offset + DstSize <= SrcTy.getSizeInBits())) {
6745 // Unmerge and allow access to each Src element for the artifact combiner.
6746 auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), Src);
6747
6748 // Take element(s) we need to extract and copy it (merge them).
6749 SmallVector<Register, 8> SubVectorElts;
6750 for (unsigned Idx = Offset / SrcEltSize;
6751 Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
6752 SubVectorElts.push_back(Unmerge.getReg(Idx));
6753 }
6754 if (SubVectorElts.size() == 1)
6755 MIRBuilder.buildCopy(Dst, SubVectorElts[0]);
6756 else
6757 MIRBuilder.buildMerge(Dst, SubVectorElts);
6758
6759 MI.eraseFromParent();
6760 return Legalized;
6761 }
6762 }
6763
6764 if (DstTy.isScalar() &&
6765 (SrcTy.isScalar() ||
6766 (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
6767 LLT SrcIntTy = SrcTy;
6768 if (!SrcTy.isScalar()) {
6769 SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
6770 Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0);
6771 }
6772
6773 if (Offset == 0)
6774 MIRBuilder.buildTrunc(Dst, Src);
6775 else {
6776 auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
6777 auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt);
6778 MIRBuilder.buildTrunc(Dst, Shr);
6779 }
6780
6781 MI.eraseFromParent();
6782 return Legalized;
6783 }
6784
6785 return UnableToLegalize;
6786 }
6787
6788 LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
6789 Register Dst = MI.getOperand(0).getReg();
6790 Register Src = MI.getOperand(1).getReg();
6791 Register InsertSrc = MI.getOperand(2).getReg();
6792 uint64_t Offset = MI.getOperand(3).getImm();
6793
6794 LLT DstTy = MRI.getType(Src);
6795 LLT InsertTy = MRI.getType(InsertSrc);
6796
6797 // Insert sub-vector or one element
6798 if (DstTy.isVector() && !InsertTy.isPointer()) {
6799 LLT EltTy = DstTy.getElementType();
6800 unsigned EltSize = EltTy.getSizeInBits();
6801 unsigned InsertSize = InsertTy.getSizeInBits();
6802
6803 if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
6804 (Offset + InsertSize <= DstTy.getSizeInBits())) {
6805 auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src);
6806 SmallVector<Register, 8> DstElts;
6807 unsigned Idx = 0;
6808 // Elements from Src before insert start Offset
6809 for (; Idx < Offset / EltSize; ++Idx) {
6810 DstElts.push_back(UnmergeSrc.getReg(Idx));
6811 }
6812
6813 // Replace elements in Src with elements from InsertSrc
6814 if (InsertTy.getSizeInBits() > EltSize) {
6815 auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc);
6816 for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
6817 ++Idx, ++i) {
6818 DstElts.push_back(UnmergeInsertSrc.getReg(i));
6819 }
6820 } else {
6821 DstElts.push_back(InsertSrc);
6822 ++Idx;
6823 }
6824
6825 // Remaining elements from Src after insert
6826 for (; Idx < DstTy.getNumElements(); ++Idx) {
6827 DstElts.push_back(UnmergeSrc.getReg(Idx));
6828 }
6829
6830 MIRBuilder.buildMerge(Dst, DstElts);
6831 MI.eraseFromParent();
6832 return Legalized;
6833 }
6834 }
6835
6836 if (InsertTy.isVector() ||
6837 (DstTy.isVector() && DstTy.getElementType() != InsertTy))
6838 return UnableToLegalize;
6839
6840 const DataLayout &DL = MIRBuilder.getDataLayout();
6841 if ((DstTy.isPointer() &&
6842 DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
6843 (InsertTy.isPointer() &&
6844 DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
6845 LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
6846 return UnableToLegalize;
6847 }
6848
6849 LLT IntDstTy = DstTy;
6850
6851 if (!DstTy.isScalar()) {
6852 IntDstTy = LLT::scalar(DstTy.getSizeInBits());
6853 Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
6854 }
6855
6856 if (!InsertTy.isScalar()) {
6857 const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
6858 InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
6859 }
6860
6861 Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
6862 if (Offset != 0) {
6863 auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
6864 ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
6865 }
6866
6867 APInt MaskVal = APInt::getBitsSetWithWrap(
6868 DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);
6869
6870 auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
6871 auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
6872 auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
6873
6874 MIRBuilder.buildCast(Dst, Or);
6875 MI.eraseFromParent();
6876 return Legalized;
6877 }
6878
6879 LegalizerHelper::LegalizeResult
6880 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
6881 Register Dst0 = MI.getOperand(0).getReg();
6882 Register Dst1 = MI.getOperand(1).getReg();
6883 Register LHS = MI.getOperand(2).getReg();
6884 Register RHS = MI.getOperand(3).getReg();
6885 const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
6886
6887 LLT Ty = MRI.getType(Dst0);
6888 LLT BoolTy = MRI.getType(Dst1);
6889
6890 if (IsAdd)
6891 MIRBuilder.buildAdd(Dst0, LHS, RHS);
6892 else
6893 MIRBuilder.buildSub(Dst0, LHS, RHS);
6894
6895 // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
6896
6897 auto Zero = MIRBuilder.buildConstant(Ty, 0);
6898
6899 // For an addition, the result should be less than one of the operands (LHS)
6900 // if and only if the other operand (RHS) is negative, otherwise there will
6901 // be overflow.
6902 // For a subtraction, the result should be less than one of the operands
6903 // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
6904 // otherwise there will be overflow.
6905 auto ResultLowerThanLHS =
6906 MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
6907 auto ConditionRHS = MIRBuilder.buildICmp(
6908 IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
6909
6910 MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
6911 MI.eraseFromParent();
6912 return Legalized;
6913 }
6914
6915 LegalizerHelper::LegalizeResult
6916 LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
6917 Register Res = MI.getOperand(0).getReg();
6918 Register LHS = MI.getOperand(1).getReg();
6919 Register RHS = MI.getOperand(2).getReg();
6920 LLT Ty = MRI.getType(Res);
6921 bool IsSigned;
6922 bool IsAdd;
6923 unsigned BaseOp;
6924 switch (MI.getOpcode()) {
6925 default:
6926 llvm_unreachable("unexpected addsat/subsat opcode");
6927 case TargetOpcode::G_UADDSAT:
6928 IsSigned = false;
6929 IsAdd = true;
6930 BaseOp = TargetOpcode::G_ADD;
6931 break;
6932 case TargetOpcode::G_SADDSAT:
6933 IsSigned = true;
6934 IsAdd = true;
6935 BaseOp = TargetOpcode::G_ADD;
6936 break;
6937 case TargetOpcode::G_USUBSAT:
6938 IsSigned = false;
6939 IsAdd = false;
6940 BaseOp = TargetOpcode::G_SUB;
6941 break;
6942 case TargetOpcode::G_SSUBSAT:
6943 IsSigned = true;
6944 IsAdd = false;
6945 BaseOp = TargetOpcode::G_SUB;
6946 break;
6947 }
6948
6949 if (IsSigned) {
6950 // sadd.sat(a, b) ->
6951 // hi = 0x7fffffff - smax(a, 0)
6952 // lo = 0x80000000 - smin(a, 0)
6953 // a + smin(smax(lo, b), hi)
6954 // ssub.sat(a, b) ->
6955 // lo = smax(a, -1) - 0x7fffffff
6956 // hi = smin(a, -1) - 0x80000000
6957 // a - smin(smax(lo, b), hi)
6958 // TODO: AMDGPU can use a "median of 3" instruction here:
6959 // a +/- med3(lo, b, hi)
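// For example, sadd.sat(100, 100) on s8: hi = 127 - smax(100, 0) = 27 and
// lo = -128 - smin(100, 0) = -128, so b is clamped to 27 and the result is
// 100 + 27 = 127, the saturated maximum.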
6960 uint64_t NumBits = Ty.getScalarSizeInBits();
6961 auto MaxVal =
6962 MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
6963 auto MinVal =
6964 MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
6965 MachineInstrBuilder Hi, Lo;
6966 if (IsAdd) {
6967 auto Zero = MIRBuilder.buildConstant(Ty, 0);
6968 Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
6969 Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
6970 } else {
6971 auto NegOne = MIRBuilder.buildConstant(Ty, -1);
6972 Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
6973 MaxVal);
6974 Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
6975 MinVal);
6976 }
6977 auto RHSClamped =
6978 MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
6979 MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
6980 } else {
6981 // uadd.sat(a, b) -> a + umin(~a, b)
6982 // usub.sat(a, b) -> a - umin(a, b)
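// For example, uadd.sat(200, 100) on s8: ~a = 55 and umin(55, 100) = 55, so
// the sum 200 + 55 = 255 saturates exactly at the type maximum; usub.sat
// similarly clamps 10 - 20 to 10 - umin(10, 20) = 0.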
6983 Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
6984 auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
6985 MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
6986 }
6987
6988 MI.eraseFromParent();
6989 return Legalized;
6990 }
6991
6992 LegalizerHelper::LegalizeResult
6993 LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
6994 Register Res = MI.getOperand(0).getReg();
6995 Register LHS = MI.getOperand(1).getReg();
6996 Register RHS = MI.getOperand(2).getReg();
6997 LLT Ty = MRI.getType(Res);
6998 LLT BoolTy = Ty.changeElementSize(1);
6999 bool IsSigned;
7000 bool IsAdd;
7001 unsigned OverflowOp;
7002 switch (MI.getOpcode()) {
7003 default:
7004 llvm_unreachable("unexpected addsat/subsat opcode");
7005 case TargetOpcode::G_UADDSAT:
7006 IsSigned = false;
7007 IsAdd = true;
7008 OverflowOp = TargetOpcode::G_UADDO;
7009 break;
7010 case TargetOpcode::G_SADDSAT:
7011 IsSigned = true;
7012 IsAdd = true;
7013 OverflowOp = TargetOpcode::G_SADDO;
7014 break;
7015 case TargetOpcode::G_USUBSAT:
7016 IsSigned = false;
7017 IsAdd = false;
7018 OverflowOp = TargetOpcode::G_USUBO;
7019 break;
7020 case TargetOpcode::G_SSUBSAT:
7021 IsSigned = true;
7022 IsAdd = false;
7023 OverflowOp = TargetOpcode::G_SSUBO;
7024 break;
7025 }
7026
7027 auto OverflowRes =
7028 MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
7029 Register Tmp = OverflowRes.getReg(0);
7030 Register Ov = OverflowRes.getReg(1);
7031 MachineInstrBuilder Clamp;
7032 if (IsSigned) {
7033 // sadd.sat(a, b) ->
7034 // {tmp, ov} = saddo(a, b)
7035 // ov ? (tmp >>s 31) + 0x80000000 : tmp
7036 // ssub.sat(a, b) ->
7037 // {tmp, ov} = ssubo(a, b)
7038 // ov ? (tmp >>s 31) + 0x80000000 : tmp
7039 uint64_t NumBits = Ty.getScalarSizeInBits();
7040 auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
7041 auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
7042 auto MinVal =
7043 MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7044 Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
7045 } else {
7046 // uadd.sat(a, b) ->
7047 // {tmp, ov} = uaddo(a, b)
7048 // ov ? 0xffffffff : tmp
7049 // usub.sat(a, b) ->
7050 // {tmp, ov} = usubo(a, b)
7051 // ov ? 0 : tmp
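// For example, uadd.sat(200, 100) on s8 produces {tmp = 44, ov = 1}, so the
// select below returns the clamp value 0xff; usub.sat(10, 20) produces
// {tmp = 246, ov = 1} and clamps to 0.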
7052 Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
7053 }
7054 MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);
7055
7056 MI.eraseFromParent();
7057 return Legalized;
7058 }
7059
7060 LegalizerHelper::LegalizeResult
7061 LegalizerHelper::lowerShlSat(MachineInstr &MI) {
7062 assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
7063 MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
7064 "Expected shlsat opcode!");
7065 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
7066 Register Res = MI.getOperand(0).getReg();
7067 Register LHS = MI.getOperand(1).getReg();
7068 Register RHS = MI.getOperand(2).getReg();
7069 LLT Ty = MRI.getType(Res);
7070 LLT BoolTy = Ty.changeElementSize(1);
7071
7072 unsigned BW = Ty.getScalarSizeInBits();
7073 auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
7074 auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
7075 : MIRBuilder.buildLShr(Ty, Result, RHS);
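// If shifting the result back does not reproduce LHS, bits were shifted out
// and the operation overflowed, so the saturation value chosen below is used
// instead.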
7076
7077 MachineInstrBuilder SatVal;
7078 if (IsSigned) {
7079 auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
7080 auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
7081 auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
7082 MIRBuilder.buildConstant(Ty, 0));
7083 SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
7084 } else {
7085 SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
7086 }
7087 auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
7088 MIRBuilder.buildSelect(Res, Ov, SatVal, Result);
7089
7090 MI.eraseFromParent();
7091 return Legalized;
7092 }
7093
7094 LegalizerHelper::LegalizeResult
7095 LegalizerHelper::lowerBswap(MachineInstr &MI) {
7096 Register Dst = MI.getOperand(0).getReg();
7097 Register Src = MI.getOperand(1).getReg();
7098 const LLT Ty = MRI.getType(Src);
7099 unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
7100 unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
7101
7102 // Swap most and least significant byte, set remaining bytes in Res to zero.
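// For a 4-byte type, BaseShiftAmt is 24 and this computes
// (Src << 24) | (Src >> 24), swapping bytes 0 and 3; the loop below then
// swaps the remaining inner byte pairs.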
7103 auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
7104 auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
7105 auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7106 auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
7107
7108 // Set i-th high/low byte in Res to i-th low/high byte from Src.
7109 for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
7110 // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
7111 APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
7112 auto Mask = MIRBuilder.buildConstant(Ty, APMask);
7113 auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
7114 // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7115 auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
7116 auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
7117 Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
7118 // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7119 auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7120 auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
7121 Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
7122 }
7123 Res.getInstr()->getOperand(0).setReg(Dst);
7124
7125 MI.eraseFromParent();
7126 return Legalized;
7127 }
7128
7129 // { (Src & Mask) >> N } | { (Src << N) & Mask }
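// For example, SwapN(4, ...) with the 0xF0 splat mask maps the byte 0xAB to
// ((0xAB & 0xF0) >> 4) | ((0xAB << 4) & 0xF0) = 0x0A | 0xB0 = 0xBA, swapping
// the two nibbles.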
7130 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7131 MachineInstrBuilder Src, APInt Mask) {
7132 const LLT Ty = Dst.getLLTTy(*B.getMRI());
7133 MachineInstrBuilder C_N = B.buildConstant(Ty, N);
7134 MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
7135 auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
7136 auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
7137 return B.buildOr(Dst, LHS, RHS);
7138 }
7139
7140 LegalizerHelper::LegalizeResult
7141 LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
7142 Register Dst = MI.getOperand(0).getReg();
7143 Register Src = MI.getOperand(1).getReg();
7144 const LLT Ty = MRI.getType(Src);
7145 unsigned Size = Ty.getSizeInBits();
7146
7147 MachineInstrBuilder BSWAP =
7148 MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
7149
7150 // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
7151 // [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
7152 // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
7153 MachineInstrBuilder Swap4 =
7154 SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
7155
7156 // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
7157 // [(val & 0xCCCCCCCC) >> 2] | [(val & 0x33333333) << 2]
7158 // -> [(val & 0xCCCCCCCC) >> 2] | [(val << 2) & 0xCCCCCCCC]
7159 MachineInstrBuilder Swap2 =
7160 SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
7161
7162 // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
7163 // [(val & 0xAAAAAAAA) >> 1] | [(val & 0x55555555) << 1]
7164 // -> [(val & 0xAAAAAAAA) >> 1] | [(val << 1) & 0xAAAAAAAA]
7165 SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
7166
7167 MI.eraseFromParent();
7168 return Legalized;
7169 }
7170
7171 LegalizerHelper::LegalizeResult
7172 LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
7173 MachineFunction &MF = MIRBuilder.getMF();
7174
7175 bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
7176 int NameOpIdx = IsRead ? 1 : 0;
7177 int ValRegIndex = IsRead ? 0 : 1;
7178
7179 Register ValReg = MI.getOperand(ValRegIndex).getReg();
7180 const LLT Ty = MRI.getType(ValReg);
7181 const MDString *RegStr = cast<MDString>(
7182 cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
7183
7184 Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
7185 if (!PhysReg.isValid())
7186 return UnableToLegalize;
7187
7188 if (IsRead)
7189 MIRBuilder.buildCopy(ValReg, PhysReg);
7190 else
7191 MIRBuilder.buildCopy(PhysReg, ValReg);
7192
7193 MI.eraseFromParent();
7194 return Legalized;
7195 }
7196
7197 LegalizerHelper::LegalizeResult
7198 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
7199 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7200 unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7201 Register Result = MI.getOperand(0).getReg();
7202 LLT OrigTy = MRI.getType(Result);
7203 auto SizeInBits = OrigTy.getScalarSizeInBits();
7204 LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
7205
7206 auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
7207 auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
7208 auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
7209 unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7210
7211 auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
7212 auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
7213 MIRBuilder.buildTrunc(Result, Shifted);
7214
7215 MI.eraseFromParent();
7216 return Legalized;
7217 }
7218
7219 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
7220 // Implement vector G_SELECT in terms of XOR, AND, OR.
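// Dst = (Op1 & Mask) | (Op2 & ~Mask), which requires the mask to be
// sign-extended so every selected lane is all-ones or all-zeros.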
7221 Register DstReg = MI.getOperand(0).getReg();
7222 Register MaskReg = MI.getOperand(1).getReg();
7223 Register Op1Reg = MI.getOperand(2).getReg();
7224 Register Op2Reg = MI.getOperand(3).getReg();
7225 LLT DstTy = MRI.getType(DstReg);
7226 LLT MaskTy = MRI.getType(MaskReg);
7227 if (!DstTy.isVector())
7228 return UnableToLegalize;
7229
7230 if (MaskTy.isScalar()) {
7231 // Turn the scalar condition into a vector condition mask.
7232
7233 Register MaskElt = MaskReg;
7234
7235 // The condition was potentially zero extended before, but we want a sign
7236 // extended boolean.
7237 if (MaskTy.getSizeInBits() <= DstTy.getScalarSizeInBits() &&
7238 MaskTy != LLT::scalar(1)) {
7239 MaskElt = MIRBuilder.buildSExtInReg(MaskTy, MaskElt, 1).getReg(0);
7240 }
7241
7242 // Continue the sign extension (or truncate) to match the data type.
7243 MaskElt = MIRBuilder.buildSExtOrTrunc(DstTy.getElementType(),
7244 MaskElt).getReg(0);
7245
7246 // Generate a vector splat idiom.
7247 auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
7248 MaskReg = ShufSplat.getReg(0);
7249 MaskTy = DstTy;
7250 }
7251
7252 if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) {
7253 return UnableToLegalize;
7254 }
7255
7256 auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
7257 auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
7258 auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
7259 MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
7260 MI.eraseFromParent();
7261 return Legalized;
7262 }
7263
7264 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
7265 // Split DIVREM into individual instructions.
7266 unsigned Opcode = MI.getOpcode();
7267
7268 MIRBuilder.buildInstr(
7269 Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
7270 : TargetOpcode::G_UDIV,
7271 {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7272 MIRBuilder.buildInstr(
7273 Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
7274 : TargetOpcode::G_UREM,
7275 {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7276 MI.eraseFromParent();
7277 return Legalized;
7278 }
7279
7280 LegalizerHelper::LegalizeResult
7281 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
7282 // Expand %res = G_ABS %a into:
7283 // %v1 = G_ASHR %a, scalar_size-1
7284 // %v2 = G_ADD %a, %v1
7285 // %res = G_XOR %v2, %v1
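// For example, %a = -5 (s32): %v1 = -1, %v2 = -6 and -6 ^ -1 = 5; for
// non-negative inputs %v1 is 0, making both the add and the xor no-ops.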
7286 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7287 Register OpReg = MI.getOperand(1).getReg();
7288 auto ShiftAmt =
7289 MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
7290 auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
7291 auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
7292 MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
7293 MI.eraseFromParent();
7294 return Legalized;
7295 }
7296
7297 LegalizerHelper::LegalizeResult
7298 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
7299 // Expand %res = G_ABS %a into:
7300 // %v1 = G_CONSTANT 0
7301 // %v2 = G_SUB %v1, %a
7302 // %res = G_SMAX %a, %v2
7303 Register SrcReg = MI.getOperand(1).getReg();
7304 LLT Ty = MRI.getType(SrcReg);
7305 auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
7306 auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
7307 MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
7308 MI.eraseFromParent();
7309 return Legalized;
7310 }
7311
7312 LegalizerHelper::LegalizeResult
7313 LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
7314 Register SrcReg = MI.getOperand(1).getReg();
7315 LLT SrcTy = MRI.getType(SrcReg);
7316 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7317
7318 // The source could be a scalar if the IR type was <1 x sN>.
7319 if (SrcTy.isScalar()) {
7320 if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
7321 return UnableToLegalize; // FIXME: handle extension.
7322 // This can be just a plain copy.
7323 Observer.changingInstr(MI);
7324 MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
7325 Observer.changedInstr(MI);
7326 return Legalized;
7327 }
7328 return UnableToLegalize;
7329 }
7330
7331 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
7332 // On Darwin, -Os means optimize for size without hurting performance, so
7333 // only really optimize for size when -Oz (MinSize) is used.
7334 if (MF.getTarget().getTargetTriple().isOSDarwin())
7335 return MF.getFunction().hasMinSize();
7336 return MF.getFunction().hasOptSize();
7337 }
7338
7339 // Returns a list of types to use for memory op lowering in MemOps. A partial
7340 // port of findOptimalMemOpLowering in TargetLowering.
7341 static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
7342 unsigned Limit, const MemOp &Op,
7343 unsigned DstAS, unsigned SrcAS,
7344 const AttributeList &FuncAttributes,
7345 const TargetLowering &TLI) {
7346 if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
7347 return false;
7348
7349 LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
7350
7351 if (Ty == LLT()) {
7352 // Use the largest scalar type whose alignment constraints are satisfied.
7353 // We only need to check DstAlign here as SrcAlign is always greater than or
7354 // equal to DstAlign (or zero).
7355 Ty = LLT::scalar(64);
7356 if (Op.isFixedDstAlign())
7357 while (Op.getDstAlign() < Ty.getSizeInBytes() &&
7358 !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
7359 Ty = LLT::scalar(Ty.getSizeInBytes());
7360 assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
7361 // FIXME: check for the largest legal type we can load/store to.
7362 }
7363
7364 unsigned NumMemOps = 0;
7365 uint64_t Size = Op.size();
7366 while (Size) {
7367 unsigned TySize = Ty.getSizeInBytes();
7368 while (TySize > Size) {
7369 // For now, only use non-vector loads/stores for the left-over pieces.
7370 LLT NewTy = Ty;
7371 // FIXME: check for mem op safety and legality of the types. Not all of
7372 // SDAGisms map cleanly to GISel concepts.
7373 if (NewTy.isVector())
7374 NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
7375 NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
7376 unsigned NewTySize = NewTy.getSizeInBytes();
7377 assert(NewTySize > 0 && "Could not find appropriate type");
7378
7379 // If the new LLT cannot cover all of the remaining bits, then consider
7380 // issuing a (or a pair of) unaligned and overlapping load / store.
7381 bool Fast;
7382 // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
7383 MVT VT = getMVTForLLT(Ty);
7384 if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
7385 TLI.allowsMisalignedMemoryAccesses(
7386 VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
7387 MachineMemOperand::MONone, &Fast) &&
7388 Fast)
7389 TySize = Size;
7390 else {
7391 Ty = NewTy;
7392 TySize = NewTySize;
7393 }
7394 }
7395
7396 if (++NumMemOps > Limit)
7397 return false;
7398
7399 MemOps.push_back(Ty);
7400 Size -= TySize;
7401 }
7402
7403 return true;
7404 }
7405
7406 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
7407 if (Ty.isVector())
7408 return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
7409 Ty.getNumElements());
7410 return IntegerType::get(C, Ty.getSizeInBits());
7411 }
7412
7413 // Get a vectorized representation of the memset value operand, GISel edition.
7414 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
7415 MachineRegisterInfo &MRI = *MIB.getMRI();
7416 unsigned NumBits = Ty.getScalarSizeInBits();
7417 auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7418 if (!Ty.isVector() && ValVRegAndVal) {
7419 APInt Scalar = ValVRegAndVal->Value.trunc(8);
7420 APInt SplatVal = APInt::getSplat(NumBits, Scalar);
7421 return MIB.buildConstant(Ty, SplatVal).getReg(0);
7422 }
7423
7424 // Extend the byte value to the larger type, and then multiply by a magic
7425 // value 0x010101... in order to replicate it across every byte.
7426 // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
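// For example, the byte 0xAB zero-extended to s32 and multiplied by
// 0x01010101 yields 0xABABABAB.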
7427 if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
7428 return MIB.buildConstant(Ty, 0).getReg(0);
7429 }
7430
7431 LLT ExtType = Ty.getScalarType();
7432 auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
7433 if (NumBits > 8) {
7434 APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
7435 auto MagicMI = MIB.buildConstant(ExtType, Magic);
7436 Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
7437 }
7438
7439 // For vector types create a G_BUILD_VECTOR.
7440 if (Ty.isVector())
7441 Val = MIB.buildSplatVector(Ty, Val).getReg(0);
7442
7443 return Val;
7444 }
7445
7446 LegalizerHelper::LegalizeResult
7447 LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
7448 uint64_t KnownLen, Align Alignment,
7449 bool IsVolatile) {
7450 auto &MF = *MI.getParent()->getParent();
7451 const auto &TLI = *MF.getSubtarget().getTargetLowering();
7452 auto &DL = MF.getDataLayout();
7453 LLVMContext &C = MF.getFunction().getContext();
7454
7455 assert(KnownLen != 0 && "Have a zero length memset length!");
7456
7457 bool DstAlignCanChange = false;
7458 MachineFrameInfo &MFI = MF.getFrameInfo();
7459 bool OptSize = shouldLowerMemFuncForSize(MF);
7460
7461 MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7462 if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7463 DstAlignCanChange = true;
7464
7465 unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
7466 std::vector<LLT> MemOps;
7467
7468 const auto &DstMMO = **MI.memoperands_begin();
7469 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7470
7471 auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7472 bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
7473
7474 if (!findGISelOptimalMemOpLowering(MemOps, Limit,
7475 MemOp::Set(KnownLen, DstAlignCanChange,
7476 Alignment,
7477 /*IsZeroMemset=*/IsZeroVal,
7478 /*IsVolatile=*/IsVolatile),
7479 DstPtrInfo.getAddrSpace(), ~0u,
7480 MF.getFunction().getAttributes(), TLI))
7481 return UnableToLegalize;
7482
7483 if (DstAlignCanChange) {
7484 // Get an estimate of the type from the LLT.
7485 Type *IRTy = getTypeForLLT(MemOps[0], C);
7486 Align NewAlign = DL.getABITypeAlign(IRTy);
7487 if (NewAlign > Alignment) {
7488 Alignment = NewAlign;
7489 unsigned FI = FIDef->getOperand(1).getIndex();
7490 // Give the stack frame object a larger alignment if needed.
7491 if (MFI.getObjectAlign(FI) < Alignment)
7492 MFI.setObjectAlignment(FI, Alignment);
7493 }
7494 }
7495
7496 MachineIRBuilder MIB(MI);
7497 // Find the largest store and generate the bit pattern for it.
7498 LLT LargestTy = MemOps[0];
7499 for (unsigned i = 1; i < MemOps.size(); i++)
7500 if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
7501 LargestTy = MemOps[i];
7502
7503 // The memset stored value is always defined as an s8, so in order to make it
7504 // work with larger store types we need to repeat the bit pattern across the
7505 // wider type.
7506 Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
7507
7508 if (!MemSetValue)
7509 return UnableToLegalize;
7510
7511 // Generate the stores. For each store type in the list, we generate the
7512 // matching store of that type to the destination address.
7513 LLT PtrTy = MRI.getType(Dst);
7514 unsigned DstOff = 0;
7515 unsigned Size = KnownLen;
7516 for (unsigned I = 0; I < MemOps.size(); I++) {
7517 LLT Ty = MemOps[I];
7518 unsigned TySize = Ty.getSizeInBytes();
    if (TySize > Size) {
      // Issuing an unaligned store that overlaps with the previous one.
      // Adjust the offset accordingly.
      assert(I == MemOps.size() - 1 && I != 0);
      DstOff -= TySize - Size;
    }

    // If this store is smaller than the largest store, see whether we can get
    // the smaller value for free with a truncate.
    Register Value = MemSetValue;
    if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
      MVT VT = getMVTForLLT(Ty);
      MVT LargestVT = getMVTForLLT(LargestTy);
      if (!LargestTy.isVector() && !Ty.isVector() &&
          TLI.isTruncateFree(LargestVT, VT))
        Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
      else
        Value = getMemsetValue(Val, Ty, MIB);
      if (!Value)
        return UnableToLegalize;
    }

    auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);

    Register Ptr = Dst;
    if (DstOff != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
      Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
    }

    MIB.buildStore(Value, Ptr, *StoreMMO);
    DstOff += Ty.getSizeInBytes();
    Size -= TySize;
  }

  MI.eraseFromParent();
  return Legalized;
}
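// Sketch of the expected output (illustrative; virtual register names are
// invented): lowering `G_MEMSET %dst, %val, 16` with MemOps = {s64, s64}
// produces roughly:
//   %splat:_(s64) = <getMemsetValue(%val, s64)>
//   G_STORE %splat(s64), %dst(p0) :: (store (s64))
//   %c8:_(s64) = G_CONSTANT i64 8
//   %p1:_(p0) = G_PTR_ADD %dst, %c8(s64)
//   G_STORE %splat(s64), %p1(p0) :: (store (s64) + 8)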

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Len = MI.getOperand(2).getReg();

  const auto *MMOIt = MI.memoperands_begin();
  const MachineMemOperand *MemOp = *MMOIt;
  bool IsVolatile = MemOp->isVolatile();

  // See if this is a constant length copy.
  auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
  // FIXME: support dynamically sized G_MEMCPY_INLINE
  assert(LenVRegAndVal &&
         "inline memcpy with dynamic size is not yet supported");
  uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
  if (KnownLen == 0) {
    MI.eraseFromParent();
    return Legalized;
  }

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  Align DstAlign = DstMMO.getBaseAlign();
  Align SrcAlign = SrcMMO.getBaseAlign();

  return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                           IsVolatile);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
                                   uint64_t KnownLen, Align DstAlign,
                                   Align SrcAlign, bool IsVolatile) {
  assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
  return lowerMemcpy(MI, Dst, Src, KnownLen,
                     std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
                     IsVolatile);
}
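// Because the Limit here is std::numeric_limits<uint64_t>::max(), an inline
// copy is never rejected for exceeding the target's getMaxStoresPerMemcpy
// budget: the load / store expansion is emitted no matter how many operations
// it takes.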

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
                             uint64_t KnownLen, uint64_t Limit, Align DstAlign,
                             Align SrcAlign, bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memcpy length!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  Align Alignment = std::min(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  // FIXME: infer better src pointer alignment like SelectionDAG does here.
  // FIXME: also use the equivalent of isMemSrcFromConstant and always inline
  // if the memcpy is in a tail call position.

  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
                      IsVolatile),
          DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
          MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
        NewAlign = NewAlign.previous();

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Now we need to emit a pair of loads and stores for each of the types
  // we've collected: for each type, generate a load of that width from the
  // source pointer, then a store of the loaded value to the destination
  // buffer. This can result in a sequence of loads and stores of mixed types,
  // depending on what the target specifies as good types to use.
  unsigned CurrOffset = 0;
  unsigned Size = KnownLen;
  for (auto CopyTy : MemOps) {
    // If we're issuing an unaligned load / store pair that overlaps with the
    // previous pair, adjust the offset accordingly.
    if (CopyTy.getSizeInBytes() > Size)
      CurrOffset -= CopyTy.getSizeInBytes() - Size;

    // Construct MMOs for the accesses.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    Register Offset;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Src);
      Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
                   .getReg(0);
      LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
    }
    auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);

    // Create the store.
    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Dst);
      StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LdVal, StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
    Size -= CopyTy.getSizeInBytes();
  }

  MI.eraseFromParent();
  return Legalized;
}
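// Sketch of the expected output (illustrative; register names invented): an
// 8-byte `G_MEMCPY %dst, %src, 8` with MemOps = {s64} becomes roughly:
//   %v:_(s64) = G_LOAD %src(p0) :: (load (s64))
//   G_STORE %v(s64), %dst(p0) :: (store (s64))
// Larger copies repeat the pattern at increasing offsets, possibly ending
// with an overlapping access as described for findGISelOptimalMemOpLowering.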

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
                              uint64_t KnownLen, Align DstAlign, Align SrcAlign,
                              bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memmove length!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);
  Align Alignment = std::min(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in its findOptimalMemOpLowering implementation. For now do the
  // same thing here, passing /*IsVolatile=*/true to disable overlapping
  // accesses.
  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
                      /*IsVolatile*/ true),
          DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
          MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
        NewAlign = NewAlign.previous();

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Memmove requires that we perform all of the loads before issuing the
  // stores, since the source and destination may overlap. Apart from that,
  // this loop does much the same thing as the memcpy codegen function.
  unsigned CurrOffset = 0;
  SmallVector<Register, 16> LoadVals;
  for (auto CopyTy : MemOps) {
    // Construct MMO for the load.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Src);
      auto Offset =
          MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
      LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
    }
    LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
    CurrOffset += CopyTy.getSizeInBytes();
  }

  CurrOffset = 0;
  for (unsigned I = 0; I < MemOps.size(); ++I) {
    LLT CopyTy = MemOps[I];
    // Now store the values loaded.
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Dst);
      auto Offset =
          MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
      StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
  }
  MI.eraseFromParent();
  return Legalized;
}
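// Sketch (illustrative; register names invented): a 16-byte
// `G_MEMMOVE %dst, %src, 16` with MemOps = {s64, s64} emits both loads before
// either store:
//   %v0:_(s64) = G_LOAD %src(p0) :: (load (s64))
//   %v1:_(s64) = G_LOAD %ps8(p0) :: (load (s64) + 8)
//   G_STORE %v0(s64), %dst(p0) :: (store (s64))
//   G_STORE %v1(s64), %pd8(p0) :: (store (s64) + 8)
// where %ps8 / %pd8 are G_PTR_ADD results at offset 8. This ordering is what
// keeps the expansion correct when the two ranges overlap.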

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
  const unsigned Opc = MI.getOpcode();
  // This combine is fairly complex so it's not written with a separate
  // matcher function.
  assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
          Opc == TargetOpcode::G_MEMSET) &&
         "Expected memcpy-like instruction");

  auto MMOIt = MI.memoperands_begin();
  const MachineMemOperand *MemOp = *MMOIt;

  Align DstAlign = MemOp->getBaseAlign();
  Align SrcAlign;
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Len = MI.getOperand(2).getReg();

  if (Opc != TargetOpcode::G_MEMSET) {
    assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
    MemOp = *(++MMOIt);
    SrcAlign = MemOp->getBaseAlign();
  }

  // See if this is a constant length copy.
  auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
  if (!LenVRegAndVal)
    return UnableToLegalize;
  uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();

  if (KnownLen == 0) {
    MI.eraseFromParent();
    return Legalized;
  }

  bool IsVolatile = MemOp->isVolatile();
  if (Opc == TargetOpcode::G_MEMCPY_INLINE)
    return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                             IsVolatile);

  // Don't try to optimize volatile.
  if (IsVolatile)
    return UnableToLegalize;

  if (MaxLen && KnownLen > MaxLen)
    return UnableToLegalize;

  if (Opc == TargetOpcode::G_MEMCPY) {
    auto &MF = *MI.getParent()->getParent();
    const auto &TLI = *MF.getSubtarget().getTargetLowering();
    bool OptSize = shouldLowerMemFuncForSize(MF);
    uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
    return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
                       IsVolatile);
  }
  if (Opc == TargetOpcode::G_MEMMOVE)
    return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
  if (Opc == TargetOpcode::G_MEMSET)
    return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
  return UnableToLegalize;
}
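// Note that every path above requires the length operand to be a known
// constant: a dynamically sized G_MEMCPY / G_MEMMOVE / G_MEMSET returns
// UnableToLegalize, leaving the caller to fall back to, e.g., a libcall.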