//===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file implements the LegalizerHelper class to legalize
/// individual instructions and the LegalizeMachineIR wrapper pass for the
/// primary legalization.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "legalizer"

using namespace llvm;
using namespace LegalizeActions;
using namespace MIPatternMatch;

/// Try to break down \p OrigTy into \p NarrowTy sized pieces.
///
/// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
/// with any leftover piece as type \p LeftoverTy.
///
/// Returns -1 in the first element of the pair if the breakdown is not
/// satisfiable.
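///
/// E.g. breaking an s88 value into s32 pieces yields {2, 1}, with
/// \p LeftoverTy set to s24 (two s32 parts plus one s24 leftover piece).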
static std::pair<int, int>
getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned Size = OrigTy.getSizeInBits();
  unsigned NarrowSize = NarrowTy.getSizeInBits();
  unsigned NumParts = Size / NarrowSize;
  unsigned LeftoverSize = Size - NumParts * NarrowSize;
  assert(Size > NarrowSize);

  if (LeftoverSize == 0)
    return {NumParts, 0};

  if (NarrowTy.isVector()) {
    unsigned EltSize = OrigTy.getScalarSizeInBits();
    if (LeftoverSize % EltSize != 0)
      return {-1, -1};
    LeftoverTy = LLT::scalarOrVector(
        ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
  } else {
    LeftoverTy = LLT::scalar(LeftoverSize);
  }

  int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
  return std::make_pair(NumParts, NumLeftover);
}

static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
  if (!Ty.isScalar())
    return nullptr;

  switch (Ty.getSizeInBits()) {
  case 16:
    return Type::getHalfTy(Ctx);
  case 32:
    return Type::getFloatTy(Ctx);
  case 64:
    return Type::getDoubleTy(Ctx);
  case 80:
    return Type::getX86_FP80Ty(Ctx);
  case 128:
    return Type::getFP128Ty(Ctx);
  default:
    return nullptr;
  }
}

LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()) {}

LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B)
    : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
      TLI(*MF.getSubtarget().getTargetLowering()) {}

LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "Legalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
      MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
  default:
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}

void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
                                   SmallVectorImpl<Register> &VRegs) {
  for (int i = 0; i < NumParts; ++i)
    VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
  MIRBuilder.buildUnmerge(VRegs, Reg);
}

bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
                                   LLT MainTy, LLT &LeftoverTy,
                                   SmallVectorImpl<Register> &VRegs,
                                   SmallVectorImpl<Register> &LeftoverRegs) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned RegSize = RegTy.getSizeInBits();
  unsigned MainSize = MainTy.getSizeInBits();
  unsigned NumParts = RegSize / MainSize;
  unsigned LeftoverSize = RegSize - NumParts * MainSize;

  // Use an unmerge when possible.
  if (LeftoverSize == 0) {
    for (unsigned I = 0; I < NumParts; ++I)
      VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
    MIRBuilder.buildUnmerge(VRegs, Reg);
    return true;
  }

  // Perform irregular split. Leftover is last element of RegPieces.
  if (MainTy.isVector()) {
    SmallVector<Register, 8> RegPieces;
    extractVectorParts(Reg, MainTy.getNumElements(), RegPieces);
    for (unsigned i = 0; i < RegPieces.size() - 1; ++i)
      VRegs.push_back(RegPieces[i]);
    LeftoverRegs.push_back(RegPieces[RegPieces.size() - 1]);
    LeftoverTy = MRI.getType(LeftoverRegs[0]);
    return true;
  }

  LeftoverTy = LLT::scalar(LeftoverSize);
  // For irregular sizes, extract the individual parts.
  for (unsigned I = 0; I != NumParts; ++I) {
    Register NewReg = MRI.createGenericVirtualRegister(MainTy);
    VRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
  }

  for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
       Offset += LeftoverSize) {
    Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
    LeftoverRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, Offset);
  }

  return true;
}

void LegalizerHelper::extractVectorParts(Register Reg, unsigned NumElts,
                                         SmallVectorImpl<Register> &VRegs) {
  LLT RegTy = MRI.getType(Reg);
  assert(RegTy.isVector() && "Expected a vector type");

  LLT EltTy = RegTy.getElementType();
  LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
  unsigned RegNumElts = RegTy.getNumElements();
  unsigned LeftoverNumElts = RegNumElts % NumElts;
  unsigned NumNarrowTyPieces = RegNumElts / NumElts;

  // Perfect split without leftover
  if (LeftoverNumElts == 0)
    return extractParts(Reg, NarrowTy, NumNarrowTyPieces, VRegs);

  // Irregular split. Provide direct access to all elements for artifact
  // combiner using unmerge to elements. Then build vectors with NumElts
  // elements. The remaining element(s) will be used to build the leftover
  // vector.
  SmallVector<Register, 8> Elts;
  extractParts(Reg, EltTy, RegNumElts, Elts);

  unsigned Offset = 0;
  // Requested sub-vectors of NarrowTy.
  for (unsigned i = 0; i < NumNarrowTyPieces; ++i, Offset += NumElts) {
    ArrayRef<Register> Pieces(&Elts[Offset], NumElts);
    VRegs.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0));
  }
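
  // E.g. with NumElts = 2 and Reg = <7 x s32>, the loop above produces three
  // <2 x s32> sub-vectors; the remaining s32 element becomes the leftover.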
  // Leftover element(s).
  if (LeftoverNumElts == 1) {
    VRegs.push_back(Elts[Offset]);
  } else {
    LLT LeftoverTy = LLT::fixed_vector(LeftoverNumElts, EltTy);
    ArrayRef<Register> Pieces(&Elts[Offset], LeftoverNumElts);
    VRegs.push_back(MIRBuilder.buildMerge(LeftoverTy, Pieces).getReg(0));
  }
}

void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    if (!ResultTy.isVector()) {
      MIRBuilder.buildMerge(DstReg, PartRegs);
      return;
    }

    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(DstReg, PartRegs);
    else
      MIRBuilder.buildBuildVector(DstReg, PartRegs);
    return;
  }

  // Merge sub-vectors with different number of elements and insert into DstReg.
  if (ResultTy.isVector()) {
    assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
    SmallVector<Register, 8> AllRegs;
    for (auto Reg : concat<const Register>(PartRegs, LeftoverRegs))
      AllRegs.push_back(Reg);
    return mergeMixedSubvectors(DstReg, AllRegs);
  }

  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
  for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
    extractGCDType(GCDRegs, GCDTy, PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
  buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
}

void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
                                       Register Reg) {
  LLT Ty = MRI.getType(Reg);
  SmallVector<Register, 8> RegElts;
  extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts);
  Elts.append(RegElts);
}

/// Merge \p PartRegs with different types into \p DstReg.
void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
                                           ArrayRef<Register> PartRegs) {
  SmallVector<Register, 8> AllElts;
  for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
    appendVectorElts(AllElts, PartRegs[i]);

  Register Leftover = PartRegs[PartRegs.size() - 1];
  if (MRI.getType(Leftover).isScalar())
    AllElts.push_back(Leftover);
  else
    appendVectorElts(AllElts, Leftover);

  MIRBuilder.buildMerge(DstReg, AllElts);
}

/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
                              const MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);

  const int StartIdx = Regs.size();
  const int NumResults = MI.getNumOperands() - 1;
  Regs.resize(Regs.size() + NumResults);
  for (int I = 0; I != NumResults; ++I)
    Regs[StartIdx + I] = MI.getOperand(I).getReg();
}
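
// E.g. extractGCDType(Parts, s16, %src:s48) unmerges %src into three s16
// pieces and appends them to Parts; an s16 source is appended unchanged.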
void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
                                     LLT GCDTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy == GCDTy) {
    // If the source already evenly divides the result type, we don't need to
    // do anything.
    Parts.push_back(SrcReg);
  } else {
    // Need to split into common type sized pieces.
    auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
    getUnmergeResults(Parts, *Unmerge);
  }
}

LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
                                    LLT NarrowTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
  extractGCDType(Parts, GCDTy, SrcReg);
  return GCDTy;
}

LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
          MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can
  // reuse the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      if (Idx >= NumOrigSrc) {
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMerge(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}

void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
                                               ArrayRef<Register> RemergeRegs) {
  LLT DstTy = MRI.getType(DstReg);

  // Create the merge to the widened source, and extract the relevant bits into
  // the result.

  if (DstTy == LCMTy) {
    MIRBuilder.buildMerge(DstReg, RemergeRegs);
    return;
  }

  auto Remerge = MIRBuilder.buildMerge(LCMTy, RemergeRegs);
  if (DstTy.isScalar() && LCMTy.isScalar()) {
    MIRBuilder.buildTrunc(DstReg, Remerge);
    return;
  }

  if (LCMTy.isVector()) {
    unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
    SmallVector<Register, 8> UnmergeDefs(NumDefs);
    UnmergeDefs[0] = DstReg;
    for (unsigned I = 1; I != NumDefs; ++I)
      UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);

    MIRBuilder.buildUnmerge(UnmergeDefs,
                            MIRBuilder.buildMerge(LCMTy, RemergeRegs));
    return;
  }

  llvm_unreachable("unhandled case");
}

static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  }
  llvm_unreachable("Unknown libcall function");
}
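
// E.g. getRTLibDesc(TargetOpcode::G_FSIN, 64) yields RTLIB::SIN_F64, which
// TargetLowering maps to the "sin" runtime function.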

/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    switch (MI.getOpcode()) {
    default:
      llvm_unreachable("unsupported opcode");
    case TargetOpcode::G_BZERO:
      return false;
    case TargetOpcode::G_MEMCPY:
    case TargetOpcode::G_MEMMOVE:
    case TargetOpcode::G_MEMSET:
      break;
    }

    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    const CallingConv::ID CC) {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = Result;
  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args) {
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  const char *Name = TLI.getLibcallName(Libcall);
  const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
  return createLibcall(MIRBuilder, Name, Result, Args, CC);
}
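
// E.g. a 32-bit G_FREM can be lowered to a call to fmodf along these lines,
// where Dst, Src0 and Src1 stand for the instruction's registers and FloatTy
// for the IR float type:
//   createLibcall(MIRBuilder, RTLIB::REM_F32, {Dst, FloatTy, 0},
//                 {{Src0, FloatTy, 0}, {Src1, FloatTy, 0}});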

// Useful for libcalls where all operands have the same type.
static LegalizerHelper::LegalizeResult
simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
              Type *OpType) {
  auto Libcall = getRTLibDesc(MI.getOpcode(), Size);

  // FIXME: What does the original arg index mean here?
  SmallVector<CallLowering::ArgInfo, 3> Args;
  for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
    Args.push_back({MO.getReg(), OpType, 0});
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), OpType, 0}, Args);
}

LegalizerHelper::LegalizeResult
llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                       MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() &&
                    isLibCallInTailPosition(MI, MIRBuilder.getTII(), MRI);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}

static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
                                       Type *FromType) {
  auto ToMVT = MVT::getVT(ToType);
  auto FromMVT = MVT::getVT(FromType);

  switch (Opcode) {
  case TargetOpcode::G_FPEXT:
    return RTLIB::getFPEXT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTRUNC:
    return RTLIB::getFPROUND(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOSI:
    return RTLIB::getFPTOSINT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOUI:
    return RTLIB::getFPTOUINT(FromMVT, ToMVT);
  case TargetOpcode::G_SITOFP:
    return RTLIB::getSINTTOFP(FromMVT, ToMVT);
  case TargetOpcode::G_UITOFP:
    return RTLIB::getUINTTOFP(FromMVT, ToMVT);
  }
  llvm_unreachable("Unsupported libcall function");
}

static LegalizerHelper::LegalizeResult
conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
                  Type *FromType) {
  RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), ToType, 0},
                       {{MI.getOperand(1).getReg(), FromType, 0}});
}

LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
  unsigned Size = LLTy.getSizeInBits();
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
    Type *ToTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy);
    if (Status != Legalized)
      return Status;
    break;
  }
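  // E.g. an s32 G_FPTOUI of an s64 (double) source selects
  // RTLIB::FPTOUINT_F64_I32, i.e. a call to __fixunsdfsi.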
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
        FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
        FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
                                                              unsigned TypeIdx,
                                                              LLT NarrowTy) {
  uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_IMPLICIT_DEF: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
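
    // E.g. narrowing an s64 G_IMPLICIT_DEF to s32 yields two s32
    // G_IMPLICIT_DEFs merged back into an s64 value.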
    // If SizeOp0 is not an exact multiple of NarrowSize, emit
    // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
    // FIXME: Although this would also be legal for the general case, it causes
    // a lot of regressions in the emitted code (superfluous COPYs, artifact
    // combines not being hit). This seems to be a problem related to the
    // artifact combiner.
    if (SizeOp0 % NarrowSize != 0) {
      LLT ImplicitTy = NarrowTy;
      if (DstTy.isVector())
        ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);

      Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
      MIRBuilder.buildAnyExt(DstReg, ImplicitReg);

      MI.eraseFromParent();
      return Legalized;
    }

    int NumParts = SizeOp0 / NarrowSize;

    SmallVector<Register, 2> DstRegs;
    for (int i = 0; i < NumParts; ++i)
      DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));

    if (DstTy.isVector())
      MIRBuilder.buildBuildVector(DstReg, DstRegs);
    else
      MIRBuilder.buildMerge(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CONSTANT: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    const APInt &Val = MI.getOperand(1).getCImm()->getValue();
    unsigned TotalSize = Ty.getSizeInBits();
    unsigned NarrowSize = NarrowTy.getSizeInBits();
    int NumParts = TotalSize / NarrowSize;

    SmallVector<Register, 4> PartRegs;
    for (int I = 0; I != NumParts; ++I) {
      unsigned Offset = I * NarrowSize;
      auto K = MIRBuilder.buildConstant(NarrowTy,
                                        Val.lshr(Offset).trunc(NarrowSize));
      PartRegs.push_back(K.getReg(0));
    }

    LLT LeftoverTy;
    unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
    SmallVector<Register, 1> LeftoverRegs;
    if (LeftoverBits != 0) {
      LeftoverTy = LLT::scalar(LeftoverBits);
      auto K = MIRBuilder.buildConstant(
          LeftoverTy,
          Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
      LeftoverRegs.push_back(K.getReg(0));
    }

    insertParts(MI.getOperand(0).getReg(),
                Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    return narrowScalarExt(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_TRUNC: {
    if (TypeIdx != 1)
      return UnableToLegalize;

    uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
      LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
      return UnableToLegalize;
    }

    auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
    MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
    MI.eraseFromParent();
    return Legalized;
  }

  case TargetOpcode::G_FREEZE: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    // Should widen scalar first
    if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
      return UnableToLegalize;

    auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
    SmallVector<Register, 8> Parts;
    for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
      Parts.push_back(
          MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0));
    }

    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), Parts);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_UMULH:
    return narrowScalarMul(MI, NarrowTy);
  case TargetOpcode::G_EXTRACT:
    return narrowScalarExtract(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_INSERT:
    return narrowScalarInsert(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_LOAD: {
    auto &LoadMI = cast<GLoad>(MI);
    Register DstReg = LoadMI.getDstReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isVector())
      return UnableToLegalize;

    if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
      MIRBuilder.buildAnyExt(DstReg, TmpReg);
      LoadMI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    auto &LoadMI = cast<GExtLoad>(MI);
    Register DstReg = LoadMI.getDstReg();
    Register PtrReg = LoadMI.getPointerReg();

    Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
    auto &MMO = LoadMI.getMMO();
    unsigned MemSize = MMO.getSizeInBits();

    if (MemSize == NarrowSize) {
      MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
    } else if (MemSize < NarrowSize) {
      MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
    } else if (MemSize > NarrowSize) {
      // FIXME: Need to split the load.
      return UnableToLegalize;
    }

    if (isa<GZExtLoad>(LoadMI))
      MIRBuilder.buildZExt(DstReg, TmpReg);
    else
      MIRBuilder.buildSExt(DstReg, TmpReg);

    LoadMI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_STORE: {
    auto &StoreMI = cast<GStore>(MI);

    Register SrcReg = StoreMI.getValueReg();
    LLT SrcTy = MRI.getType(SrcReg);
    if (SrcTy.isVector())
      return UnableToLegalize;

    int NumParts = SizeOp0 / NarrowSize;
    unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
    unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
    if (SrcTy.isVector() && LeftoverBits != 0)
      return UnableToLegalize;

    if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildTrunc(TmpReg, SrcReg);
      MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
      StoreMI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
  }
  case TargetOpcode::G_SELECT:
    return narrowScalarSelect(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    // Legalize bitwise operation:
    // A = BinOp<Ty> B, C
    // into:
    // B1, ..., BN = G_UNMERGE_VALUES B
    // C1, ..., CN = G_UNMERGE_VALUES C
    // A1 = BinOp<Ty/N> B1, C1
    // ...
    // AN = BinOp<Ty/N> BN, CN
    // A = G_MERGE_VALUES A1, ..., AN
    return narrowScalarBasic(MI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    return narrowScalarShift(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTPOP:
    if (TypeIdx == 1)
      switch (MI.getOpcode()) {
      case TargetOpcode::G_CTLZ:
      case TargetOpcode::G_CTLZ_ZERO_UNDEF:
        return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTTZ:
      case TargetOpcode::G_CTTZ_ZERO_UNDEF:
        return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTPOP:
        return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
      default:
        return UnableToLegalize;
      }

    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_INTTOPTR:
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRTOINT:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PHI: {
    // FIXME: add support for when SizeOp0 isn't an exact multiple of
    // NarrowSize.
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    unsigned NumParts = SizeOp0 / NarrowSize;
    SmallVector<Register, 2> DstRegs(NumParts);
    SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
    Observer.changingInstr(MI);
    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
      MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
      extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
                   SrcRegs[i / 2]);
    }
    MachineBasicBlock &MBB = *MI.getParent();
    MIRBuilder.setInsertPt(MBB, MI);
    for (unsigned i = 0; i < NumParts; ++i) {
      DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
      MachineInstrBuilder MIB =
          MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
      for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
        MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
    }
    MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
    MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
    Observer.changedInstr(MI);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT: {
    if (TypeIdx != 2)
      return UnableToLegalize;

    int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, OpIdx);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_ICMP: {
    Register LHS = MI.getOperand(2).getReg();
    LLT SrcTy = MRI.getType(LHS);
    uint64_t SrcSize = SrcTy.getSizeInBits();
    CmpInst::Predicate Pred =
        static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());

    // TODO: Handle the non-equality case for weird sizes.
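    // E.g. for an equality compare of s64 operands narrowed to s32: XOR the
    // low halves and the high halves, OR the results together, and compare
    // that against zero.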
    if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
      return UnableToLegalize;

    LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
    SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
    if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
                      LHSLeftoverRegs))
      return UnableToLegalize;

    LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
    SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
    if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
                      RHSPartRegs, RHSLeftoverRegs))
      return UnableToLegalize;

    // We now have the LHS and RHS of the compare split into narrow-type
    // registers, plus potentially some leftover type.
    Register Dst = MI.getOperand(0).getReg();
    LLT ResTy = MRI.getType(Dst);
    if (ICmpInst::isEquality(Pred)) {
      // For each part on the LHS and RHS, keep track of the result of XOR-ing
      // them together. For each equal part, the result should be all 0s. For
      // each non-equal part, we'll get at least one 1.
      auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
      SmallVector<Register, 4> Xors;
      for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
        Xors.push_back(Xor);
      }

      // Build a G_XOR for each leftover register. Each G_XOR must be widened
      // to the desired narrow type so that we can OR them together later.
      SmallVector<Register, 4> WidenedXors;
      for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
        LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
        buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
                            /* PadStrategy = */ TargetOpcode::G_ZEXT);
        Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
      }

      // Now, for each part we broke up, we know if they are equal/not equal
      // based off the G_XOR. We can OR these all together and compare against
      // 0 to get the result.
      assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
      auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
      for (unsigned I = 2, E = Xors.size(); I < E; ++I)
        Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
      MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
    } else {
      // TODO: Handle non-power-of-two types.
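      // E.g. for a signed less-than of s64 operands narrowed to s32: compare
      // the high halves with the original predicate, the low halves with its
      // unsigned form, and select the low-half result when the high halves
      // are equal.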
      assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
      assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
      Register LHSL = LHSPartRegs[0];
      Register LHSH = LHSPartRegs[1];
      Register RHSL = RHSPartRegs[0];
      Register RHSH = RHSPartRegs[1];
      MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpHEQ =
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
          ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
      MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
    }
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT_INREG: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    int64_t SizeInBits = MI.getOperand(2).getImm();

    // So long as the new type has more bits than the bits we're extending we
    // don't need to break it apart.
    if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
      Observer.changingInstr(MI);
      // We don't lose any non-extension bits by truncating the src and
      // sign-extending the dst.
      MachineOperand &MO1 = MI.getOperand(1);
      auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
      MO1.setReg(TruncMIB.getReg(0));

      MachineOperand &MO2 = MI.getOperand(0);
      Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
      MIRBuilder.buildSExt(MO2, DstExt);
      MO2.setReg(DstExt);
      Observer.changedInstr(MI);
      return Legalized;
    }

    // Break it apart. Components below the extension point are unmodified. The
    // component containing the extension point becomes a narrower SEXT_INREG.
    // Components above it are ashr'd from the component containing the
    // extension point.
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;
    int NumParts = SizeOp0 / NarrowSize;

    // List the registers where the destination will be scattered.
    SmallVector<Register, 2> DstRegs;
    // List the registers where the source will be split.
    SmallVector<Register, 2> SrcRegs;

    // Create all the temporary registers.
    for (int i = 0; i < NumParts; ++i) {
      Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);

      SrcRegs.push_back(SrcReg);
    }

    // Explode the big arguments into smaller chunks.
    MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));

    Register AshrCstReg =
        MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
            .getReg(0);
    Register FullExtensionReg = 0;
    Register PartialExtensionReg = 0;
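
    // E.g. narrowing an s64 G_SEXT_INREG of width 8 to s32: the low part
    // becomes G_SEXT_INREG %lo, 8; the high part is that result ashr'd by 31.
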
    // Do the operation on each small part.
    for (int i = 0; i < NumParts; ++i) {
      if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
        DstRegs.push_back(SrcRegs[i]);
      else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
        assert(PartialExtensionReg &&
               "Expected to visit partial extension before full");
        if (FullExtensionReg) {
          DstRegs.push_back(FullExtensionReg);
          continue;
        }
        DstRegs.push_back(
            MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
                .getReg(0));
        FullExtensionReg = DstRegs.back();
      } else {
        DstRegs.push_back(
            MIRBuilder
                .buildInstr(
                    TargetOpcode::G_SEXT_INREG, {NarrowTy},
                    {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
                .getReg(0));
        PartialExtensionReg = DstRegs.back();
      }
    }

    // Gather the destination registers into the final destination.
    Register DstReg = MI.getOperand(0).getReg();
    MIRBuilder.buildMerge(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_BSWAP:
  case TargetOpcode::G_BITREVERSE: {
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    SmallVector<Register, 2> SrcRegs, DstRegs;
    unsigned NumParts = SizeOp0 / NarrowSize;
    extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);

    for (unsigned i = 0; i < NumParts; ++i) {
      auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
                                           {SrcRegs[NumParts - 1 - i]});
      DstRegs.push_back(DstPart.getReg(0));
    }

    MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);

    Observer.changedInstr(MI);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_PTR_ADD:
  case TargetOpcode::G_PTRMASK: {
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, 2);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FPTOSI:
    return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_FPEXT:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }
}

Register LegalizerHelper::coerceToScalar(Register Val) {
  LLT Ty = MRI.getType(Val);
  if (Ty.isScalar())
    return Val;

  const DataLayout &DL = MIRBuilder.getDataLayout();
  LLT NewTy = LLT::scalar(Ty.getSizeInBits());
  if (Ty.isPointer()) {
    if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
      return Register();
    return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
  }

  Register NewVal = Val;

  assert(Ty.isVector());
  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
  return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
}

void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
                                     unsigned OpIdx, unsigned ExtOpcode) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
  MO.setReg(ExtB.getReg(0));
}
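
// E.g. widenScalarSrc(MI, s32, 1, TargetOpcode::G_ANYEXT) rewrites
//   %dst = G_FOO %src:s16
// into
//   %ext:s32 = G_ANYEXT %src:s16
//   %dst = G_FOO %ext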

void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
                                      unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
  MO.setReg(ExtB.getReg(0));
}

void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
                                     unsigned OpIdx, unsigned TruncOpcode) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  Register DstExt = MRI.createGenericVirtualRegister(WideTy);
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
  MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
  MO.setReg(DstExt);
}

void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
                                      unsigned OpIdx, unsigned ExtOpcode) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
  MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
  MO.setReg(DstTrunc);
}

void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
                                            unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
  Register Dst = MO.getReg();
  Register DstExt = MRI.createGenericVirtualRegister(WideTy);
  MO.setReg(DstExt);
  MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt);
}

void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
                                            unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  SmallVector<Register, 8> Regs;
  MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0));
}

void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
  MachineOperand &Op = MI.getOperand(OpIdx);
  Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
}

void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  Register CastDst = MRI.createGenericVirtualRegister(CastTy);
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
  MIRBuilder.buildBitcast(MO, CastDst);
  MO.setReg(CastDst);
}
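
// E.g. widening %d:s12 = G_MERGE_VALUES %a:s4, %b:s4, %c:s4 to s16
// zero-extends each source to s16, shifts it into position with G_SHL/G_OR,
// and truncates the packed result back to s12.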

LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
                                        LLT WideTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  Register Src1 = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(Src1);
  const int DstSize = DstTy.getSizeInBits();
  const int SrcSize = SrcTy.getSizeInBits();
  const int WideSize = WideTy.getSizeInBits();
  const int NumMerge = (DstSize + WideSize - 1) / WideSize;

  unsigned NumOps = MI.getNumOperands();
  unsigned NumSrc = MI.getNumOperands() - 1;
  unsigned PartSize = DstTy.getSizeInBits() / NumSrc;

  if (WideSize >= DstSize) {
    // Directly pack the bits in the target type.
    Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1).getReg(0);

    for (unsigned I = 2; I != NumOps; ++I) {
      const unsigned Offset = (I - 1) * PartSize;

      Register SrcReg = MI.getOperand(I).getReg();
      assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));

      auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);

      Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
        MRI.createGenericVirtualRegister(WideTy);

      auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
      auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
      MIRBuilder.buildOr(NextResult, ResultReg, Shl);
      ResultReg = NextResult;
    }

    if (WideSize > DstSize)
      MIRBuilder.buildTrunc(DstReg, ResultReg);
    else if (DstTy.isPointer())
      MIRBuilder.buildIntToPtr(DstReg, ResultReg);

    MI.eraseFromParent();
    return Legalized;
  }

  // Unmerge the original values to the GCD type, and recombine to the next
  // multiple greater than the original type.
  //
  // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
  // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
  // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
  // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
  // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
  // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
  // %12:_(s12) = G_MERGE_VALUES %10, %11
  //
  // Padding with undef if necessary:
  //
  // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
  // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
  // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
  // %7:_(s2) = G_IMPLICIT_DEF
  // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
  // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
  // %10:_(s12) = G_MERGE_VALUES %8, %9

  const int GCD = greatestCommonDivisor(SrcSize, WideSize);
  LLT GCDTy = LLT::scalar(GCD);

  SmallVector<Register, 8> Parts;
  SmallVector<Register, 8> NewMergeRegs;
  SmallVector<Register, 8> Unmerges;
  LLT WideDstTy = LLT::scalar(NumMerge * WideSize);

  // Decompose the original operands if they don't evenly divide.
  for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
    Register SrcReg = MO.getReg();
    if (GCD == SrcSize) {
      Unmerges.push_back(SrcReg);
    } else {
      auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
      for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
        Unmerges.push_back(Unmerge.getReg(J));
    }
  }

  // Pad with undef to the next size that is a multiple of the requested size.
  if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
    Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
      Unmerges.push_back(UndefReg);
  }

  const int PartsPerGCD = WideSize / GCD;

  // Build merges of each piece.
  ArrayRef<Register> Slicer(Unmerges);
  for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
    auto Merge = MIRBuilder.buildMerge(WideTy, Slicer.take_front(PartsPerGCD));
    NewMergeRegs.push_back(Merge.getReg(0));
  }
1607 if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) { 1608 MIRBuilder.buildMerge(DstReg, NewMergeRegs); 1609 } else { 1610 auto FinalMerge = MIRBuilder.buildMerge(WideDstTy, NewMergeRegs); 1611 MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0)); 1612 } 1613 1614 MI.eraseFromParent(); 1615 return Legalized; 1616 } 1617 1618 Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) { 1619 Register WideReg = MRI.createGenericVirtualRegister(WideTy); 1620 LLT OrigTy = MRI.getType(OrigReg); 1621 LLT LCMTy = getLCMType(WideTy, OrigTy); 1622 1623 const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits(); 1624 const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits(); 1625 1626 Register UnmergeSrc = WideReg; 1627 1628 // Create a merge to the LCM type, padding with undef 1629 // %0:_(<3 x s32>) = G_FOO => <4 x s32> 1630 // => 1631 // %1:_(<4 x s32>) = G_FOO 1632 // %2:_(<4 x s32>) = G_IMPLICIT_DEF 1633 // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2 1634 // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3 1635 if (NumMergeParts > 1) { 1636 Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0); 1637 SmallVector<Register, 8> MergeParts(NumMergeParts, Undef); 1638 MergeParts[0] = WideReg; 1639 UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0); 1640 } 1641 1642 // Unmerge to the original register and pad with dead defs. 1643 SmallVector<Register, 8> UnmergeResults(NumUnmergeParts); 1644 UnmergeResults[0] = OrigReg; 1645 for (int I = 1; I != NumUnmergeParts; ++I) 1646 UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy); 1647 1648 MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc); 1649 return WideReg; 1650 } 1651 1652 LegalizerHelper::LegalizeResult 1653 LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx, 1654 LLT WideTy) { 1655 if (TypeIdx != 0) 1656 return UnableToLegalize; 1657 1658 int NumDst = MI.getNumOperands() - 1; 1659 Register SrcReg = MI.getOperand(NumDst).getReg(); 1660 LLT SrcTy = MRI.getType(SrcReg); 1661 if (SrcTy.isVector()) 1662 return UnableToLegalize; 1663 1664 Register Dst0Reg = MI.getOperand(0).getReg(); 1665 LLT DstTy = MRI.getType(Dst0Reg); 1666 if (!DstTy.isScalar()) 1667 return UnableToLegalize; 1668 1669 if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) { 1670 if (SrcTy.isPointer()) { 1671 const DataLayout &DL = MIRBuilder.getDataLayout(); 1672 if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) { 1673 LLVM_DEBUG( 1674 dbgs() << "Not casting non-integral address space integer\n"); 1675 return UnableToLegalize; 1676 } 1677 1678 SrcTy = LLT::scalar(SrcTy.getSizeInBits()); 1679 SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0); 1680 } 1681 1682 // Widen SrcTy to WideTy. This does not affect the result, but since the 1683 // user requested this size, it is probably better handled than SrcTy and 1684 // should reduce the total number of legalization artifacts. 1685 if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) { 1686 SrcTy = WideTy; 1687 SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0); 1688 } 1689 1690 // There's no unmerge type to target.
Directly extract the bits from the 1691 // source type. 1692 unsigned DstSize = DstTy.getSizeInBits(); 1693 1694 MIRBuilder.buildTrunc(Dst0Reg, SrcReg); 1695 for (int I = 1; I != NumDst; ++I) { 1696 auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I); 1697 auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt); 1698 MIRBuilder.buildTrunc(MI.getOperand(I), Shr); 1699 } 1700 1701 MI.eraseFromParent(); 1702 return Legalized; 1703 } 1704 1705 // Extend the source to a wider type. 1706 LLT LCMTy = getLCMType(SrcTy, WideTy); 1707 1708 Register WideSrc = SrcReg; 1709 if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) { 1710 // TODO: If this is an integral address space, cast to integer and anyext. 1711 if (SrcTy.isPointer()) { 1712 LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n"); 1713 return UnableToLegalize; 1714 } 1715 1716 WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0); 1717 } 1718 1719 auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc); 1720 1721 // Create a sequence of unmerges and merges to the original results. Since we 1722 // may have widened the source, we will need to pad the results with dead defs 1723 // to cover the source register. 1724 // e.g. widen s48 to s64: 1725 // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96) 1726 // 1727 // => 1728 // %4:_(s192) = G_ANYEXT %0:_(s96) 1729 // %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge 1730 // ; unpack to GCD type, with extra dead defs 1731 // %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64) 1732 // %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64) 1733 // dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64) 1734 // %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10 ; Remerge to destination 1735 // %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination 1736 const LLT GCDTy = getGCDType(WideTy, DstTy); 1737 const int NumUnmerge = Unmerge->getNumOperands() - 1; 1738 const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits(); 1739 1740 // Directly unmerge to the destination without going through a GCD type 1741 // if possible 1742 if (PartsPerRemerge == 1) { 1743 const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits(); 1744 1745 for (int I = 0; I != NumUnmerge; ++I) { 1746 auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES); 1747 1748 for (int J = 0; J != PartsPerUnmerge; ++J) { 1749 int Idx = I * PartsPerUnmerge + J; 1750 if (Idx < NumDst) 1751 MIB.addDef(MI.getOperand(Idx).getReg()); 1752 else { 1753 // Create dead def for excess components.
1754 MIB.addDef(MRI.createGenericVirtualRegister(DstTy)); 1755 } 1756 } 1757 1758 MIB.addUse(Unmerge.getReg(I)); 1759 } 1760 } else { 1761 SmallVector<Register, 16> Parts; 1762 for (int J = 0; J != NumUnmerge; ++J) 1763 extractGCDType(Parts, GCDTy, Unmerge.getReg(J)); 1764 1765 SmallVector<Register, 8> RemergeParts; 1766 for (int I = 0; I != NumDst; ++I) { 1767 for (int J = 0; J < PartsPerRemerge; ++J) { 1768 const int Idx = I * PartsPerRemerge + J; 1769 RemergeParts.emplace_back(Parts[Idx]); 1770 } 1771 1772 MIRBuilder.buildMerge(MI.getOperand(I).getReg(), RemergeParts); 1773 RemergeParts.clear(); 1774 } 1775 } 1776 1777 MI.eraseFromParent(); 1778 return Legalized; 1779 } 1780 1781 LegalizerHelper::LegalizeResult 1782 LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx, 1783 LLT WideTy) { 1784 Register DstReg = MI.getOperand(0).getReg(); 1785 Register SrcReg = MI.getOperand(1).getReg(); 1786 LLT SrcTy = MRI.getType(SrcReg); 1787 1788 LLT DstTy = MRI.getType(DstReg); 1789 unsigned Offset = MI.getOperand(2).getImm(); 1790 1791 if (TypeIdx == 0) { 1792 if (SrcTy.isVector() || DstTy.isVector()) 1793 return UnableToLegalize; 1794 1795 SrcOp Src(SrcReg); 1796 if (SrcTy.isPointer()) { 1797 // Extracts from pointers can be handled only if they are really just 1798 // simple integers. 1799 const DataLayout &DL = MIRBuilder.getDataLayout(); 1800 if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) 1801 return UnableToLegalize; 1802 1803 LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits()); 1804 Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src); 1805 SrcTy = SrcAsIntTy; 1806 } 1807 1808 if (DstTy.isPointer()) 1809 return UnableToLegalize; 1810 1811 if (Offset == 0) { 1812 // Avoid a shift in the degenerate case. 1813 MIRBuilder.buildTrunc(DstReg, 1814 MIRBuilder.buildAnyExtOrTrunc(WideTy, Src)); 1815 MI.eraseFromParent(); 1816 return Legalized; 1817 } 1818 1819 // Do a shift in the source type. 
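// For example (an illustrative sketch; register names are arbitrary),
// widening %dst:_(s8) = G_EXTRACT %src:_(s16), 8 to WideTy = s32 gives:
//
// %ext:_(s32) = G_ANYEXT %src
// %shamt:_(s32) = G_CONSTANT i32 8
// %shr:_(s32) = G_LSHR %ext, %shamt
// %dst:_(s8) = G_TRUNC %shr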
1820 LLT ShiftTy = SrcTy; 1821 if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) { 1822 Src = MIRBuilder.buildAnyExt(WideTy, Src); 1823 ShiftTy = WideTy; 1824 } 1825 1826 auto LShr = MIRBuilder.buildLShr( 1827 ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset)); 1828 MIRBuilder.buildTrunc(DstReg, LShr); 1829 MI.eraseFromParent(); 1830 return Legalized; 1831 } 1832 1833 if (SrcTy.isScalar()) { 1834 Observer.changingInstr(MI); 1835 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 1836 Observer.changedInstr(MI); 1837 return Legalized; 1838 } 1839 1840 if (!SrcTy.isVector()) 1841 return UnableToLegalize; 1842 1843 if (DstTy != SrcTy.getElementType()) 1844 return UnableToLegalize; 1845 1846 if (Offset % SrcTy.getScalarSizeInBits() != 0) 1847 return UnableToLegalize; 1848 1849 Observer.changingInstr(MI); 1850 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 1851 1852 MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) * 1853 Offset); 1854 widenScalarDst(MI, WideTy.getScalarType(), 0); 1855 Observer.changedInstr(MI); 1856 return Legalized; 1857 } 1858 1859 LegalizerHelper::LegalizeResult 1860 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx, 1861 LLT WideTy) { 1862 if (TypeIdx != 0 || WideTy.isVector()) 1863 return UnableToLegalize; 1864 Observer.changingInstr(MI); 1865 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 1866 widenScalarDst(MI, WideTy); 1867 Observer.changedInstr(MI); 1868 return Legalized; 1869 } 1870 1871 LegalizerHelper::LegalizeResult 1872 LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx, 1873 LLT WideTy) { 1874 if (TypeIdx == 1) 1875 return UnableToLegalize; // TODO 1876 1877 unsigned Opcode; 1878 unsigned ExtOpcode; 1879 Optional<Register> CarryIn = None; 1880 switch (MI.getOpcode()) { 1881 default: 1882 llvm_unreachable("Unexpected opcode!"); 1883 case TargetOpcode::G_SADDO: 1884 Opcode = TargetOpcode::G_ADD; 1885 ExtOpcode = TargetOpcode::G_SEXT; 1886 break; 1887 case TargetOpcode::G_SSUBO: 1888 Opcode = TargetOpcode::G_SUB; 1889 ExtOpcode = TargetOpcode::G_SEXT; 1890 break; 1891 case TargetOpcode::G_UADDO: 1892 Opcode = TargetOpcode::G_ADD; 1893 ExtOpcode = TargetOpcode::G_ZEXT; 1894 break; 1895 case TargetOpcode::G_USUBO: 1896 Opcode = TargetOpcode::G_SUB; 1897 ExtOpcode = TargetOpcode::G_ZEXT; 1898 break; 1899 case TargetOpcode::G_SADDE: 1900 Opcode = TargetOpcode::G_UADDE; 1901 ExtOpcode = TargetOpcode::G_SEXT; 1902 CarryIn = MI.getOperand(4).getReg(); 1903 break; 1904 case TargetOpcode::G_SSUBE: 1905 Opcode = TargetOpcode::G_USUBE; 1906 ExtOpcode = TargetOpcode::G_SEXT; 1907 CarryIn = MI.getOperand(4).getReg(); 1908 break; 1909 case TargetOpcode::G_UADDE: 1910 Opcode = TargetOpcode::G_UADDE; 1911 ExtOpcode = TargetOpcode::G_ZEXT; 1912 CarryIn = MI.getOperand(4).getReg(); 1913 break; 1914 case TargetOpcode::G_USUBE: 1915 Opcode = TargetOpcode::G_USUBE; 1916 ExtOpcode = TargetOpcode::G_ZEXT; 1917 CarryIn = MI.getOperand(4).getReg(); 1918 break; 1919 } 1920 1921 auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)}); 1922 auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)}); 1923 // Do the arithmetic in the larger type. 
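// For example (an illustrative sketch; register names are arbitrary),
// widening %res:_(s8), %ovf:_(s1) = G_UADDO %a:_(s8), %b:_(s8) to s32
// expands to:
//
// %lhs:_(s32) = G_ZEXT %a
// %rhs:_(s32) = G_ZEXT %b
// %add:_(s32) = G_ADD %lhs, %rhs
// %low:_(s8) = G_TRUNC %add
// %rext:_(s32) = G_ZEXT %low
// %ovf:_(s1) = G_ICMP intpred(ne), %add, %rext
// %res:_(s8) = G_TRUNC %add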
1924 Register NewOp; 1925 if (CarryIn) { 1926 LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg()); 1927 NewOp = MIRBuilder 1928 .buildInstr(Opcode, {WideTy, CarryOutTy}, 1929 {LHSExt, RHSExt, *CarryIn}) 1930 .getReg(0); 1931 } else { 1932 NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0); 1933 } 1934 LLT OrigTy = MRI.getType(MI.getOperand(0).getReg()); 1935 auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp); 1936 auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp}); 1937 // There is no overflow if the ExtOp is the same as NewOp. 1938 MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp); 1939 // Now trunc the NewOp to the original result. 1940 MIRBuilder.buildTrunc(MI.getOperand(0), NewOp); 1941 MI.eraseFromParent(); 1942 return Legalized; 1943 } 1944 1945 LegalizerHelper::LegalizeResult 1946 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx, 1947 LLT WideTy) { 1948 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT || 1949 MI.getOpcode() == TargetOpcode::G_SSUBSAT || 1950 MI.getOpcode() == TargetOpcode::G_SSHLSAT; 1951 bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT || 1952 MI.getOpcode() == TargetOpcode::G_USHLSAT; 1953 // We can convert this to: 1954 // 1. Any extend iN to iM 1955 // 2. SHL by M-N 1956 // 3. [US][ADD|SUB|SHL]SAT 1957 // 4. L/ASHR by M-N 1958 // 1959 // It may be more efficient to lower this to a min and a max operation in 1960 // the higher precision arithmetic if the promoted operation isn't legal, 1961 // but this decision is up to the target's lowering request. 1962 Register DstReg = MI.getOperand(0).getReg(); 1963 1964 unsigned NewBits = WideTy.getScalarSizeInBits(); 1965 unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits(); 1966 1967 // Shifts must zero-extend the RHS to preserve the unsigned quantity, and 1968 // must not left shift the RHS to preserve the shift amount. 1969 auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1)); 1970 auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2)) 1971 : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2)); 1972 auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount); 1973 auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK); 1974 auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK); 1975 1976 auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy}, 1977 {ShiftL, ShiftR}, MI.getFlags()); 1978 1979 // Use a shift that will preserve the number of sign bits when the trunc is 1980 // folded away. 1981 auto Result = IsSigned ? 
MIRBuilder.buildAShr(WideTy, WideInst, ShiftK) 1982 : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK); 1983 1984 MIRBuilder.buildTrunc(DstReg, Result); 1985 MI.eraseFromParent(); 1986 return Legalized; 1987 } 1988 1989 LegalizerHelper::LegalizeResult 1990 LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx, 1991 LLT WideTy) { 1992 if (TypeIdx == 1) 1993 return UnableToLegalize; 1994 1995 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO; 1996 Register Result = MI.getOperand(0).getReg(); 1997 Register OriginalOverflow = MI.getOperand(1).getReg(); 1998 Register LHS = MI.getOperand(2).getReg(); 1999 Register RHS = MI.getOperand(3).getReg(); 2000 LLT SrcTy = MRI.getType(LHS); 2001 LLT OverflowTy = MRI.getType(OriginalOverflow); 2002 unsigned SrcBitWidth = SrcTy.getScalarSizeInBits(); 2003 2004 // To determine if the result overflowed in the larger type, we extend the 2005 // input to the larger type, do the multiply (checking if it overflows), 2006 // then also check the high bits of the result to see if overflow happened 2007 // there. 2008 unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT; 2009 auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS}); 2010 auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS}); 2011 2012 auto Mulo = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy, OverflowTy}, 2013 {LeftOperand, RightOperand}); 2014 auto Mul = Mulo->getOperand(0); 2015 MIRBuilder.buildTrunc(Result, Mul); 2016 2017 MachineInstrBuilder ExtResult; 2018 // Overflow occurred if it occurred in the larger type, or if the high part 2019 // of the result does not zero/sign-extend the low part. Check this second 2020 // possibility first. 2021 if (IsSigned) { 2022 // For signed, overflow occurred when the high part does not sign-extend 2023 // the low part. 2024 ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth); 2025 } else { 2026 // Unsigned overflow occurred when the high part does not zero-extend the 2027 // low part. 2028 ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth); 2029 } 2030 2031 // Multiplication cannot overflow if WideTy is at least twice the original 2032 // width, so in that case we don't need to check the overflow result of the 2033 // larger-type mulo. if (WideTy.getScalarSizeInBits() < 2 * SrcBitWidth) { 2034 auto Overflow = 2035 MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult); 2036 // Finally check if the multiplication in the larger type itself overflowed.
2037 MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow); 2038 } else { 2039 MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult); 2040 } 2041 MI.eraseFromParent(); 2042 return Legalized; 2043 } 2044 2045 LegalizerHelper::LegalizeResult 2046 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { 2047 switch (MI.getOpcode()) { 2048 default: 2049 return UnableToLegalize; 2050 case TargetOpcode::G_ATOMICRMW_XCHG: 2051 case TargetOpcode::G_ATOMICRMW_ADD: 2052 case TargetOpcode::G_ATOMICRMW_SUB: 2053 case TargetOpcode::G_ATOMICRMW_AND: 2054 case TargetOpcode::G_ATOMICRMW_OR: 2055 case TargetOpcode::G_ATOMICRMW_XOR: 2056 case TargetOpcode::G_ATOMICRMW_MIN: 2057 case TargetOpcode::G_ATOMICRMW_MAX: 2058 case TargetOpcode::G_ATOMICRMW_UMIN: 2059 case TargetOpcode::G_ATOMICRMW_UMAX: 2060 assert(TypeIdx == 0 && "atomicrmw with second scalar type"); 2061 Observer.changingInstr(MI); 2062 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT); 2063 widenScalarDst(MI, WideTy, 0); 2064 Observer.changedInstr(MI); 2065 return Legalized; 2066 case TargetOpcode::G_ATOMIC_CMPXCHG: 2067 assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type"); 2068 Observer.changingInstr(MI); 2069 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT); 2070 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT); 2071 widenScalarDst(MI, WideTy, 0); 2072 Observer.changedInstr(MI); 2073 return Legalized; 2074 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: 2075 if (TypeIdx == 0) { 2076 Observer.changingInstr(MI); 2077 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT); 2078 widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT); 2079 widenScalarDst(MI, WideTy, 0); 2080 Observer.changedInstr(MI); 2081 return Legalized; 2082 } 2083 assert(TypeIdx == 1 && 2084 "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type"); 2085 Observer.changingInstr(MI); 2086 widenScalarDst(MI, WideTy, 1); 2087 Observer.changedInstr(MI); 2088 return Legalized; 2089 case TargetOpcode::G_EXTRACT: 2090 return widenScalarExtract(MI, TypeIdx, WideTy); 2091 case TargetOpcode::G_INSERT: 2092 return widenScalarInsert(MI, TypeIdx, WideTy); 2093 case TargetOpcode::G_MERGE_VALUES: 2094 return widenScalarMergeValues(MI, TypeIdx, WideTy); 2095 case TargetOpcode::G_UNMERGE_VALUES: 2096 return widenScalarUnmergeValues(MI, TypeIdx, WideTy); 2097 case TargetOpcode::G_SADDO: 2098 case TargetOpcode::G_SSUBO: 2099 case TargetOpcode::G_UADDO: 2100 case TargetOpcode::G_USUBO: 2101 case TargetOpcode::G_SADDE: 2102 case TargetOpcode::G_SSUBE: 2103 case TargetOpcode::G_UADDE: 2104 case TargetOpcode::G_USUBE: 2105 return widenScalarAddSubOverflow(MI, TypeIdx, WideTy); 2106 case TargetOpcode::G_UMULO: 2107 case TargetOpcode::G_SMULO: 2108 return widenScalarMulo(MI, TypeIdx, WideTy); 2109 case TargetOpcode::G_SADDSAT: 2110 case TargetOpcode::G_SSUBSAT: 2111 case TargetOpcode::G_SSHLSAT: 2112 case TargetOpcode::G_UADDSAT: 2113 case TargetOpcode::G_USUBSAT: 2114 case TargetOpcode::G_USHLSAT: 2115 return widenScalarAddSubShlSat(MI, TypeIdx, WideTy); 2116 case TargetOpcode::G_CTTZ: 2117 case TargetOpcode::G_CTTZ_ZERO_UNDEF: 2118 case TargetOpcode::G_CTLZ: 2119 case TargetOpcode::G_CTLZ_ZERO_UNDEF: 2120 case TargetOpcode::G_CTPOP: { 2121 if (TypeIdx == 0) { 2122 Observer.changingInstr(MI); 2123 widenScalarDst(MI, WideTy, 0); 2124 Observer.changedInstr(MI); 2125 return Legalized; 2126 } 2127 2128 Register SrcReg = MI.getOperand(1).getReg(); 2129 2130 // First extend the input. 
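// The whole expansion, e.g. for %dst:_(s8) = G_CTLZ %src:_(s8) widened to
// s32, looks like (illustrative; register names are arbitrary):
//
// %ext:_(s32) = G_ZEXT %src
// %ctlz:_(s32) = G_CTLZ %ext
// %k:_(s32) = G_CONSTANT i32 24 ; 32 - 8 leading zeros added by the zext
// %sub:_(s32) = G_SUB %ctlz, %k
// %dst:_(s8) = G_TRUNC %sub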
2131 unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ || 2132 MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF 2133 ? TargetOpcode::G_ANYEXT 2134 : TargetOpcode::G_ZEXT; 2135 auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg}); 2136 LLT CurTy = MRI.getType(SrcReg); 2137 unsigned NewOpc = MI.getOpcode(); 2138 if (NewOpc == TargetOpcode::G_CTTZ) { 2139 // The count is the same in the larger type except if the original 2140 // value was zero. This can be handled by setting the bit just off 2141 // the top of the original type. 2142 auto TopBit = 2143 APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits()); 2144 MIBSrc = MIRBuilder.buildOr( 2145 WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit)); 2146 // Now we know the operand is non-zero, use the more relaxed opcode. 2147 NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF; 2148 } 2149 2150 // Perform the operation at the larger size. 2151 auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc}); 2152 // This is already the correct result for CTPOP and CTTZs 2153 if (MI.getOpcode() == TargetOpcode::G_CTLZ || 2154 MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) { 2155 // The correct result is NewOp - (Difference in widety and current ty). 2156 unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits(); 2157 MIBNewOp = MIRBuilder.buildSub( 2158 WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff)); 2159 } 2160 2161 MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp); 2162 MI.eraseFromParent(); 2163 return Legalized; 2164 } 2165 case TargetOpcode::G_BSWAP: { 2166 Observer.changingInstr(MI); 2167 Register DstReg = MI.getOperand(0).getReg(); 2168 2169 Register ShrReg = MRI.createGenericVirtualRegister(WideTy); 2170 Register DstExt = MRI.createGenericVirtualRegister(WideTy); 2171 Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy); 2172 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 2173 2174 MI.getOperand(0).setReg(DstExt); 2175 2176 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); 2177 2178 LLT Ty = MRI.getType(DstReg); 2179 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits(); 2180 MIRBuilder.buildConstant(ShiftAmtReg, DiffBits); 2181 MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg); 2182 2183 MIRBuilder.buildTrunc(DstReg, ShrReg); 2184 Observer.changedInstr(MI); 2185 return Legalized; 2186 } 2187 case TargetOpcode::G_BITREVERSE: { 2188 Observer.changingInstr(MI); 2189 2190 Register DstReg = MI.getOperand(0).getReg(); 2191 LLT Ty = MRI.getType(DstReg); 2192 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits(); 2193 2194 Register DstExt = MRI.createGenericVirtualRegister(WideTy); 2195 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 2196 MI.getOperand(0).setReg(DstExt); 2197 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); 2198 2199 auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits); 2200 auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt); 2201 MIRBuilder.buildTrunc(DstReg, Shift); 2202 Observer.changedInstr(MI); 2203 return Legalized; 2204 } 2205 case TargetOpcode::G_FREEZE: 2206 Observer.changingInstr(MI); 2207 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 2208 widenScalarDst(MI, WideTy); 2209 Observer.changedInstr(MI); 2210 return Legalized; 2211 2212 case TargetOpcode::G_ABS: 2213 Observer.changingInstr(MI); 2214 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT); 2215 widenScalarDst(MI, WideTy); 2216 Observer.changedInstr(MI); 2217 
return Legalized; 2218 2219 case TargetOpcode::G_ADD: 2220 case TargetOpcode::G_AND: 2221 case TargetOpcode::G_MUL: 2222 case TargetOpcode::G_OR: 2223 case TargetOpcode::G_XOR: 2224 case TargetOpcode::G_SUB: 2225 // Perform operation at larger width (any extension is fine here, high bits 2226 // don't affect the result) and then truncate the result back to the 2227 // original type. 2228 Observer.changingInstr(MI); 2229 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 2230 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT); 2231 widenScalarDst(MI, WideTy); 2232 Observer.changedInstr(MI); 2233 return Legalized; 2234 2235 case TargetOpcode::G_SBFX: 2236 case TargetOpcode::G_UBFX: 2237 Observer.changingInstr(MI); 2238 2239 if (TypeIdx == 0) { 2240 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 2241 widenScalarDst(MI, WideTy); 2242 } else { 2243 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); 2244 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT); 2245 } 2246 2247 Observer.changedInstr(MI); 2248 return Legalized; 2249 2250 case TargetOpcode::G_SHL: 2251 Observer.changingInstr(MI); 2252 2253 if (TypeIdx == 0) { 2254 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 2255 widenScalarDst(MI, WideTy); 2256 } else { 2257 assert(TypeIdx == 1); 2258 // The "number of bits to shift" operand must preserve its value as an 2259 // unsigned integer: 2260 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); 2261 } 2262 2263 Observer.changedInstr(MI); 2264 return Legalized; 2265 2266 case TargetOpcode::G_SDIV: 2267 case TargetOpcode::G_SREM: 2268 case TargetOpcode::G_SMIN: 2269 case TargetOpcode::G_SMAX: 2270 Observer.changingInstr(MI); 2271 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT); 2272 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT); 2273 widenScalarDst(MI, WideTy); 2274 Observer.changedInstr(MI); 2275 return Legalized; 2276 2277 case TargetOpcode::G_SDIVREM: 2278 Observer.changingInstr(MI); 2279 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT); 2280 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT); 2281 widenScalarDst(MI, WideTy); 2282 widenScalarDst(MI, WideTy, 1); 2283 Observer.changedInstr(MI); 2284 return Legalized; 2285 2286 case TargetOpcode::G_ASHR: 2287 case TargetOpcode::G_LSHR: 2288 Observer.changingInstr(MI); 2289 2290 if (TypeIdx == 0) { 2291 unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2292 TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT; 2293 2294 widenScalarSrc(MI, WideTy, 1, CvtOp); 2295 widenScalarDst(MI, WideTy); 2296 } else { 2297 assert(TypeIdx == 1); 2298 // The "number of bits to shift" operand must preserve its value as an 2299 // unsigned integer: 2300 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); 2301 } 2302 2303 Observer.changedInstr(MI); 2304 return Legalized; 2305 case TargetOpcode::G_UDIV: 2306 case TargetOpcode::G_UREM: 2307 case TargetOpcode::G_UMIN: 2308 case TargetOpcode::G_UMAX: 2309 Observer.changingInstr(MI); 2310 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT); 2311 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); 2312 widenScalarDst(MI, WideTy); 2313 Observer.changedInstr(MI); 2314 return Legalized; 2315 2316 case TargetOpcode::G_UDIVREM: 2317 Observer.changingInstr(MI); 2318 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); 2319 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT); 2320 widenScalarDst(MI, WideTy); 2321 widenScalarDst(MI, WideTy, 1); 2322 Observer.changedInstr(MI); 2323 return Legalized; 2324 2325 case TargetOpcode::G_SELECT: 2326 Observer.changingInstr(MI); 2327 if (TypeIdx == 0) { 2328 // Perform operation at larger width (any extension is fine here, high 2329 // bits don't affect the result) and then truncate the result back to the 2330 // original type. 2331 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT); 2332 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT); 2333 widenScalarDst(MI, WideTy); 2334 } else { 2335 bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector(); 2336 // Explicit extension is required here since high bits affect the result. 2337 widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false)); 2338 } 2339 Observer.changedInstr(MI); 2340 return Legalized; 2341 2342 case TargetOpcode::G_FPTOSI: 2343 case TargetOpcode::G_FPTOUI: 2344 Observer.changingInstr(MI); 2345 2346 if (TypeIdx == 0) 2347 widenScalarDst(MI, WideTy); 2348 else 2349 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT); 2350 2351 Observer.changedInstr(MI); 2352 return Legalized; 2353 case TargetOpcode::G_SITOFP: 2354 Observer.changingInstr(MI); 2355 2356 if (TypeIdx == 0) 2357 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); 2358 else 2359 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT); 2360 2361 Observer.changedInstr(MI); 2362 return Legalized; 2363 case TargetOpcode::G_UITOFP: 2364 Observer.changingInstr(MI); 2365 2366 if (TypeIdx == 0) 2367 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); 2368 else 2369 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT); 2370 2371 Observer.changedInstr(MI); 2372 return Legalized; 2373 case TargetOpcode::G_LOAD: 2374 case TargetOpcode::G_SEXTLOAD: 2375 case TargetOpcode::G_ZEXTLOAD: 2376 Observer.changingInstr(MI); 2377 widenScalarDst(MI, WideTy); 2378 Observer.changedInstr(MI); 2379 return Legalized; 2380 2381 case TargetOpcode::G_STORE: { 2382 if (TypeIdx != 0) 2383 return UnableToLegalize; 2384 2385 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2386 if (!Ty.isScalar()) 2387 return UnableToLegalize; 2388 2389 Observer.changingInstr(MI); 2390 2391 unsigned ExtType = Ty.getScalarSizeInBits() == 1 ? 
2392 TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT; 2393 widenScalarSrc(MI, WideTy, 0, ExtType); 2394 2395 Observer.changedInstr(MI); 2396 return Legalized; 2397 } 2398 case TargetOpcode::G_CONSTANT: { 2399 MachineOperand &SrcMO = MI.getOperand(1); 2400 LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext(); 2401 unsigned ExtOpc = LI.getExtOpcodeForWideningConstant( 2402 MRI.getType(MI.getOperand(0).getReg())); 2403 assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT || 2404 ExtOpc == TargetOpcode::G_ANYEXT) && 2405 "Illegal Extend"); 2406 const APInt &SrcVal = SrcMO.getCImm()->getValue(); 2407 const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT) 2408 ? SrcVal.sext(WideTy.getSizeInBits()) 2409 : SrcVal.zext(WideTy.getSizeInBits()); 2410 Observer.changingInstr(MI); 2411 SrcMO.setCImm(ConstantInt::get(Ctx, Val)); 2412 2413 widenScalarDst(MI, WideTy); 2414 Observer.changedInstr(MI); 2415 return Legalized; 2416 } 2417 case TargetOpcode::G_FCONSTANT: { 2418 MachineOperand &SrcMO = MI.getOperand(1); 2419 LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext(); 2420 APFloat Val = SrcMO.getFPImm()->getValueAPF(); 2421 bool LosesInfo; 2422 switch (WideTy.getSizeInBits()) { 2423 case 32: 2424 Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, 2425 &LosesInfo); 2426 break; 2427 case 64: 2428 Val.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, 2429 &LosesInfo); 2430 break; 2431 default: 2432 return UnableToLegalize; 2433 } 2434 2435 assert(!LosesInfo && "extend should always be lossless"); 2436 2437 Observer.changingInstr(MI); 2438 SrcMO.setFPImm(ConstantFP::get(Ctx, Val)); 2439 2440 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); 2441 Observer.changedInstr(MI); 2442 return Legalized; 2443 } 2444 case TargetOpcode::G_IMPLICIT_DEF: { 2445 Observer.changingInstr(MI); 2446 widenScalarDst(MI, WideTy); 2447 Observer.changedInstr(MI); 2448 return Legalized; 2449 } 2450 case TargetOpcode::G_BRCOND: 2451 Observer.changingInstr(MI); 2452 widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false)); 2453 Observer.changedInstr(MI); 2454 return Legalized; 2455 2456 case TargetOpcode::G_FCMP: 2457 Observer.changingInstr(MI); 2458 if (TypeIdx == 0) 2459 widenScalarDst(MI, WideTy); 2460 else { 2461 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT); 2462 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT); 2463 } 2464 Observer.changedInstr(MI); 2465 return Legalized; 2466 2467 case TargetOpcode::G_ICMP: 2468 Observer.changingInstr(MI); 2469 if (TypeIdx == 0) 2470 widenScalarDst(MI, WideTy); 2471 else { 2472 unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>( 2473 MI.getOperand(1).getPredicate())) 2474 ? 
TargetOpcode::G_SEXT 2475 : TargetOpcode::G_ZEXT; 2476 widenScalarSrc(MI, WideTy, 2, ExtOpcode); 2477 widenScalarSrc(MI, WideTy, 3, ExtOpcode); 2478 } 2479 Observer.changedInstr(MI); 2480 return Legalized; 2481 2482 case TargetOpcode::G_PTR_ADD: 2483 assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD"); 2484 Observer.changingInstr(MI); 2485 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT); 2486 Observer.changedInstr(MI); 2487 return Legalized; 2488 2489 case TargetOpcode::G_PHI: { 2490 assert(TypeIdx == 0 && "Expecting only Idx 0"); 2491 2492 Observer.changingInstr(MI); 2493 for (unsigned I = 1; I < MI.getNumOperands(); I += 2) { 2494 MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB(); 2495 MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator()); 2496 widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT); 2497 } 2498 2499 MachineBasicBlock &MBB = *MI.getParent(); 2500 MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI()); 2501 widenScalarDst(MI, WideTy); 2502 Observer.changedInstr(MI); 2503 return Legalized; 2504 } 2505 case TargetOpcode::G_EXTRACT_VECTOR_ELT: { 2506 if (TypeIdx == 0) { 2507 Register VecReg = MI.getOperand(1).getReg(); 2508 LLT VecTy = MRI.getType(VecReg); 2509 Observer.changingInstr(MI); 2510 2511 widenScalarSrc( 2512 MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1, 2513 TargetOpcode::G_ANYEXT); 2514 2515 widenScalarDst(MI, WideTy, 0); 2516 Observer.changedInstr(MI); 2517 return Legalized; 2518 } 2519 2520 if (TypeIdx != 2) 2521 return UnableToLegalize; 2522 Observer.changingInstr(MI); 2523 // TODO: Probably should be zext 2524 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT); 2525 Observer.changedInstr(MI); 2526 return Legalized; 2527 } 2528 case TargetOpcode::G_INSERT_VECTOR_ELT: { 2529 if (TypeIdx == 1) { 2530 Observer.changingInstr(MI); 2531 2532 Register VecReg = MI.getOperand(1).getReg(); 2533 LLT VecTy = MRI.getType(VecReg); 2534 LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy); 2535 2536 widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT); 2537 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT); 2538 widenScalarDst(MI, WideVecTy, 0); 2539 Observer.changedInstr(MI); 2540 return Legalized; 2541 } 2542 2543 if (TypeIdx == 2) { 2544 Observer.changingInstr(MI); 2545 // TODO: Probably should be zext 2546 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT); 2547 Observer.changedInstr(MI); 2548 return Legalized; 2549 } 2550 2551 return UnableToLegalize; 2552 } 2553 case TargetOpcode::G_FADD: 2554 case TargetOpcode::G_FMUL: 2555 case TargetOpcode::G_FSUB: 2556 case TargetOpcode::G_FMA: 2557 case TargetOpcode::G_FMAD: 2558 case TargetOpcode::G_FNEG: 2559 case TargetOpcode::G_FABS: 2560 case TargetOpcode::G_FCANONICALIZE: 2561 case TargetOpcode::G_FMINNUM: 2562 case TargetOpcode::G_FMAXNUM: 2563 case TargetOpcode::G_FMINNUM_IEEE: 2564 case TargetOpcode::G_FMAXNUM_IEEE: 2565 case TargetOpcode::G_FMINIMUM: 2566 case TargetOpcode::G_FMAXIMUM: 2567 case TargetOpcode::G_FDIV: 2568 case TargetOpcode::G_FREM: 2569 case TargetOpcode::G_FCEIL: 2570 case TargetOpcode::G_FFLOOR: 2571 case TargetOpcode::G_FCOS: 2572 case TargetOpcode::G_FSIN: 2573 case TargetOpcode::G_FLOG10: 2574 case TargetOpcode::G_FLOG: 2575 case TargetOpcode::G_FLOG2: 2576 case TargetOpcode::G_FRINT: 2577 case TargetOpcode::G_FNEARBYINT: 2578 case TargetOpcode::G_FSQRT: 2579 case TargetOpcode::G_FEXP: 2580 case TargetOpcode::G_FEXP2: 2581 case TargetOpcode::G_FPOW: 2582 case TargetOpcode::G_INTRINSIC_TRUNC: 2583 case TargetOpcode::G_INTRINSIC_ROUND: 
2584 case TargetOpcode::G_INTRINSIC_ROUNDEVEN: 2585 assert(TypeIdx == 0); 2586 Observer.changingInstr(MI); 2587 2588 for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) 2589 widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT); 2590 2591 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); 2592 Observer.changedInstr(MI); 2593 return Legalized; 2594 case TargetOpcode::G_FPOWI: { 2595 if (TypeIdx != 0) 2596 return UnableToLegalize; 2597 Observer.changingInstr(MI); 2598 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT); 2599 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); 2600 Observer.changedInstr(MI); 2601 return Legalized; 2602 } 2603 case TargetOpcode::G_INTTOPTR: 2604 if (TypeIdx != 1) 2605 return UnableToLegalize; 2606 2607 Observer.changingInstr(MI); 2608 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT); 2609 Observer.changedInstr(MI); 2610 return Legalized; 2611 case TargetOpcode::G_PTRTOINT: 2612 if (TypeIdx != 0) 2613 return UnableToLegalize; 2614 2615 Observer.changingInstr(MI); 2616 widenScalarDst(MI, WideTy, 0); 2617 Observer.changedInstr(MI); 2618 return Legalized; 2619 case TargetOpcode::G_BUILD_VECTOR: { 2620 Observer.changingInstr(MI); 2621 2622 const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType(); 2623 for (int I = 1, E = MI.getNumOperands(); I != E; ++I) 2624 widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT); 2625 2626 // Avoid changing the result vector type if the source element type was 2627 // requested. 2628 if (TypeIdx == 1) { 2629 MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC)); 2630 } else { 2631 widenScalarDst(MI, WideTy, 0); 2632 } 2633 2634 Observer.changedInstr(MI); 2635 return Legalized; 2636 } 2637 case TargetOpcode::G_SEXT_INREG: 2638 if (TypeIdx != 0) 2639 return UnableToLegalize; 2640 2641 Observer.changingInstr(MI); 2642 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT); 2643 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC); 2644 Observer.changedInstr(MI); 2645 return Legalized; 2646 case TargetOpcode::G_PTRMASK: { 2647 if (TypeIdx != 1) 2648 return UnableToLegalize; 2649 Observer.changingInstr(MI); 2650 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); 2651 Observer.changedInstr(MI); 2652 return Legalized; 2653 } 2654 } 2655 } 2656 2657 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces, 2658 MachineIRBuilder &B, Register Src, LLT Ty) { 2659 auto Unmerge = B.buildUnmerge(Ty, Src); 2660 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2661 Pieces.push_back(Unmerge.getReg(I)); 2662 } 2663 2664 LegalizerHelper::LegalizeResult 2665 LegalizerHelper::lowerBitcast(MachineInstr &MI) { 2666 Register Dst = MI.getOperand(0).getReg(); 2667 Register Src = MI.getOperand(1).getReg(); 2668 LLT DstTy = MRI.getType(Dst); 2669 LLT SrcTy = MRI.getType(Src); 2670 2671 if (SrcTy.isVector()) { 2672 LLT SrcEltTy = SrcTy.getElementType(); 2673 SmallVector<Register, 8> SrcRegs; 2674 2675 if (DstTy.isVector()) { 2676 int NumDstElt = DstTy.getNumElements(); 2677 int NumSrcElt = SrcTy.getNumElements(); 2678 2679 LLT DstEltTy = DstTy.getElementType(); 2680 LLT DstCastTy = DstEltTy; // Intermediate bitcast result type 2681 LLT SrcPartTy = SrcEltTy; // Original unmerge result type. 2682 2683 // If there's an element size mismatch, insert intermediate casts to match 2684 // the result element type. 2685 if (NumSrcElt < NumDstElt) { // Source element type is larger. 
// %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>) 2687 // 2688 // => 2689 // 2690 // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0 2691 // %4:_(<2 x s8>) = G_BITCAST %2 2692 // %5:_(<2 x s8>) = G_BITCAST %3 2693 // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5 2694 DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy); 2695 SrcPartTy = SrcEltTy; 2696 } else if (NumSrcElt > NumDstElt) { // Source element type is smaller. 2697 // 2698 // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>) 2699 // 2700 // => 2701 // 2702 // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0 2703 // %4:_(s16) = G_BITCAST %2 2704 // %5:_(s16) = G_BITCAST %3 2705 // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5 2706 SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy); 2707 DstCastTy = DstEltTy; 2708 } 2709 2710 getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy); 2711 for (Register &SrcReg : SrcRegs) 2712 SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0); 2713 } else 2714 getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy); 2715 2716 MIRBuilder.buildMerge(Dst, SrcRegs); 2717 MI.eraseFromParent(); 2718 return Legalized; 2719 } 2720 2721 if (DstTy.isVector()) { 2722 SmallVector<Register, 8> SrcRegs; 2723 getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType()); 2724 MIRBuilder.buildMerge(Dst, SrcRegs); 2725 MI.eraseFromParent(); 2726 return Legalized; 2727 } 2728 2729 return UnableToLegalize; 2730 } 2731 2732 /// Figure out the bit offset into a register when coercing a vector index for 2733 /// the wide element type. This is only for the case when promoting a vector to 2734 /// one with larger elements. 2735 /// 2736 /// 2737 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize)) 2738 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize) 2739 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B, 2740 Register Idx, 2741 unsigned NewEltSize, 2742 unsigned OldEltSize) { 2743 const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize); 2744 LLT IdxTy = B.getMRI()->getType(Idx); 2745 2746 // Now figure out the amount we need to shift to get the target bits. 2747 auto OffsetMask = B.buildConstant( 2748 IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio)); 2749 auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask); 2750 return B.buildShl(IdxTy, OffsetIdx, 2751 B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0); 2752 } 2753 2754 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this 2755 /// is casting to a vector with a smaller element size, perform multiple element 2756 /// extracts and merge the results. If this is coercing to a vector with larger 2757 /// elements, index the bitcasted vector and extract the target element with bit 2758 /// operations. This is intended to force the indexing in the native register 2759 /// size for architectures that can dynamically index the register file. 2760 LegalizerHelper::LegalizeResult 2761 LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx, 2762 LLT CastTy) { 2763 if (TypeIdx != 1) 2764 return UnableToLegalize; 2765 2766 Register Dst = MI.getOperand(0).getReg(); 2767 Register SrcVec = MI.getOperand(1).getReg(); 2768 Register Idx = MI.getOperand(2).getReg(); 2769 LLT SrcVecTy = MRI.getType(SrcVec); 2770 LLT IdxTy = MRI.getType(Idx); 2771 2772 LLT SrcEltTy = SrcVecTy.getElementType(); 2773 unsigned NewNumElts = CastTy.isVector() ?
CastTy.getNumElements() : 1; 2774 unsigned OldNumElts = SrcVecTy.getNumElements(); 2775 2776 LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy; 2777 Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0); 2778 2779 const unsigned NewEltSize = NewEltTy.getSizeInBits(); 2780 const unsigned OldEltSize = SrcEltTy.getSizeInBits(); 2781 if (NewNumElts > OldNumElts) { 2782 // Decreasing the vector element size 2783 // 2784 // e.g. i64 = extract_vector_elt x:v2i64, y:i32 2785 // => 2786 // v4i32:castx = bitcast x:v2i64 2787 // 2788 // i64 = bitcast 2789 // (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))), 2790 // (i32 (extract_vector_elt castx, (2 * y + 1))) 2791 // 2792 if (NewNumElts % OldNumElts != 0) 2793 return UnableToLegalize; 2794 2795 // Type of the intermediate result vector. 2796 const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts; 2797 LLT MidTy = 2798 LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy); 2799 2800 auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt); 2801 2802 SmallVector<Register, 8> NewOps(NewEltsPerOldElt); 2803 auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK); 2804 2805 for (unsigned I = 0; I < NewEltsPerOldElt; ++I) { 2806 auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I); 2807 auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset); 2808 auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx); 2809 NewOps[I] = Elt.getReg(0); 2810 } 2811 2812 auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps); 2813 MIRBuilder.buildBitcast(Dst, NewVec); 2814 MI.eraseFromParent(); 2815 return Legalized; 2816 } 2817 2818 if (NewNumElts < OldNumElts) { 2819 if (NewEltSize % OldEltSize != 0) 2820 return UnableToLegalize; 2821 2822 // This only depends on powers of 2 because we use bit tricks to figure out 2823 // the bit offset we need to shift to get the target element. A general 2824 // expansion could emit division/multiply. 2825 if (!isPowerOf2_32(NewEltSize / OldEltSize)) 2826 return UnableToLegalize; 2827 2828 // Increasing the vector element size. 2829 // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx 2830 // 2831 // => 2832 // 2833 // %cast = G_BITCAST %vec 2834 // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize) 2835 // %wide_elt = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx 2836 // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize)) 2837 // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize) 2838 // %elt_bits = G_LSHR %wide_elt, %offset_bits 2839 // %elt = G_TRUNC %elt_bits 2840 2841 const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize); 2842 auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio); 2843 2844 // Divide to get the index in the wider element type. 2845 auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio); 2846 2847 Register WideElt = CastVec; 2848 if (CastTy.isVector()) { 2849 WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, 2850 ScaledIdx).getReg(0); 2851 } 2852 2853 // Compute the bit offset into the register of the target element. 2854 Register OffsetBits = getBitcastWiderVectorElementOffset( 2855 MIRBuilder, Idx, NewEltSize, OldEltSize); 2856 2857 // Shift the wide element to get the target element. 
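// For instance (illustrative; assumes IdxTy is s32, register names are
// arbitrary), extracting %elt:_(s8) = G_EXTRACT_VECTOR_ELT %vec:_(<4 x s8>),
// %idx with CastTy = <2 x s16> becomes:
//
// %cast:_(<2 x s16>) = G_BITCAST %vec
// %one:_(s32) = G_CONSTANT i32 1
// %scaled:_(s32) = G_LSHR %idx, %one ; wide element index
// %wide:_(s16) = G_EXTRACT_VECTOR_ELT %cast, %scaled
// %sub:_(s32) = G_AND %idx, %one ; sub-element index
// %three:_(s32) = G_CONSTANT i32 3
// %offset:_(s32) = G_SHL %sub, %three ; bit offset = sub * 8
// %bits:_(s16) = G_LSHR %wide, %offset
// %elt:_(s8) = G_TRUNC %bits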
auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits); 2859 MIRBuilder.buildTrunc(Dst, ExtractedBits); 2860 MI.eraseFromParent(); 2861 return Legalized; 2862 } 2863 2864 return UnableToLegalize; 2865 } 2866 2867 /// Emit code to insert \p InsertReg into \p TargetReg at \p OffsetBits, 2868 /// while preserving the other bits in \p TargetReg. 2869 /// 2870 /// (InsertReg << Offset) | (TargetReg & ~(Mask(InsertReg.size()) << Offset)), 2871 /// where Mask(N) sets the low N bits. static Register buildBitFieldInsert(MachineIRBuilder &B, 2872 Register TargetReg, Register InsertReg, 2873 Register OffsetBits) { 2874 LLT TargetTy = B.getMRI()->getType(TargetReg); 2875 LLT InsertTy = B.getMRI()->getType(InsertReg); 2876 auto ZextVal = B.buildZExt(TargetTy, InsertReg); 2877 auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits); 2878 2879 // Produce a bitmask of the value to insert 2880 auto EltMask = B.buildConstant( 2881 TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(), 2882 InsertTy.getSizeInBits())); 2883 // Shift it into position 2884 auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits); 2885 auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask); 2886 2887 // Clear out the bits in the wide element 2888 auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask); 2889 2890 // The value to insert is already zero-extended, so OR it into the masked 2891 // wide element. 2892 return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0); 2893 } 2894 2895 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this 2896 /// is increasing the element size, perform the indexing in the target element 2897 /// type, and use bit operations to insert at the element position. This is 2898 /// intended for architectures that can dynamically index the register file and 2899 /// want to force indexing in the native register size. 2900 LegalizerHelper::LegalizeResult 2901 LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx, 2902 LLT CastTy) { 2903 if (TypeIdx != 0) 2904 return UnableToLegalize; 2905 2906 Register Dst = MI.getOperand(0).getReg(); 2907 Register SrcVec = MI.getOperand(1).getReg(); 2908 Register Val = MI.getOperand(2).getReg(); 2909 Register Idx = MI.getOperand(3).getReg(); 2910 2911 LLT VecTy = MRI.getType(Dst); 2912 LLT IdxTy = MRI.getType(Idx); 2913 2914 LLT VecEltTy = VecTy.getElementType(); 2915 LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy; 2916 const unsigned NewEltSize = NewEltTy.getSizeInBits(); 2917 const unsigned OldEltSize = VecEltTy.getSizeInBits(); 2918 2919 unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1; 2920 unsigned OldNumElts = VecTy.getNumElements(); 2921 2922 Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0); 2923 if (NewNumElts < OldNumElts) { 2924 if (NewEltSize % OldEltSize != 0) 2925 return UnableToLegalize; 2926 2927 // This only depends on powers of 2 because we use bit tricks to figure out 2928 // the bit offset we need to shift to get the target element. A general 2929 // expansion could emit division/multiply. 2930 if (!isPowerOf2_32(NewEltSize / OldEltSize)) 2931 return UnableToLegalize; 2932 2933 const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize); 2934 auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio); 2935 2936 // Divide to get the index in the wider element type.
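// The structure mirrors the extract case above (illustrative; register
// names are arbitrary): for %dst:_(<4 x s8>) = G_INSERT_VECTOR_ELT %vec,
// %val:_(s8), %idx with CastTy = <2 x s16>:
//
// %cast:_(<2 x s16>) = G_BITCAST %vec
// %scaled:_(s32) = G_LSHR %idx, 1
// %wide:_(s16) = G_EXTRACT_VECTOR_ELT %cast, %scaled
// %new:_(s16) = <%wide with %val inserted at the computed bit offset>
// %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %cast, %new, %scaled
// %dst:_(<4 x s8>) = G_BITCAST %ins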
2937 auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio); 2938 2939 Register ExtractedElt = CastVec; 2940 if (CastTy.isVector()) { 2941 ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, 2942 ScaledIdx).getReg(0); 2943 } 2944 2945 // Compute the bit offset into the register of the target element. 2946 Register OffsetBits = getBitcastWiderVectorElementOffset( 2947 MIRBuilder, Idx, NewEltSize, OldEltSize); 2948 2949 Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt, 2950 Val, OffsetBits); 2951 if (CastTy.isVector()) { 2952 InsertedElt = MIRBuilder.buildInsertVectorElement( 2953 CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0); 2954 } 2955 2956 MIRBuilder.buildBitcast(Dst, InsertedElt); 2957 MI.eraseFromParent(); 2958 return Legalized; 2959 } 2960 2961 return UnableToLegalize; 2962 } 2963 2964 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) { 2965 // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT 2966 Register DstReg = LoadMI.getDstReg(); 2967 Register PtrReg = LoadMI.getPointerReg(); 2968 LLT DstTy = MRI.getType(DstReg); 2969 MachineMemOperand &MMO = LoadMI.getMMO(); 2970 LLT MemTy = MMO.getMemoryType(); 2971 MachineFunction &MF = MIRBuilder.getMF(); 2972 2973 unsigned MemSizeInBits = MemTy.getSizeInBits(); 2974 unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes(); 2975 2976 if (MemSizeInBits != MemStoreSizeInBits) { 2977 if (MemTy.isVector()) 2978 return UnableToLegalize; 2979 2980 // Promote to a byte-sized load if not loading an integral number of 2981 // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24. 2982 LLT WideMemTy = LLT::scalar(MemStoreSizeInBits); 2983 MachineMemOperand *NewMMO = 2984 MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy); 2985 2986 Register LoadReg = DstReg; 2987 LLT LoadTy = DstTy; 2988 2989 // If this wasn't already an extending load, we need to widen the result 2990 // register to avoid creating a load with a narrower result than the source. 2991 if (MemStoreSizeInBits > DstTy.getSizeInBits()) { 2992 LoadTy = WideMemTy; 2993 LoadReg = MRI.createGenericVirtualRegister(WideMemTy); 2994 } 2995 2996 if (isa<GSExtLoad>(LoadMI)) { 2997 auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO); 2998 MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits); 2999 } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == DstTy) { 3000 auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO); 3001 // The extra bits are guaranteed to be zero, since we stored them that 3002 // way. A zext load from Wide thus automatically gives zext from MemVT. 3003 MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits); 3004 } else { 3005 MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO); 3006 } 3007 3008 if (DstTy != LoadTy) 3009 MIRBuilder.buildTrunc(DstReg, LoadReg); 3010 3011 LoadMI.eraseFromParent(); 3012 return Legalized; 3013 } 3014 3015 // Big endian lowering not implemented. 3016 if (MIRBuilder.getDataLayout().isBigEndian()) 3017 return UnableToLegalize; 3018 3019 // This load needs splitting into power of 2 sized loads. 3020 // 3021 // Our strategy here is to generate anyextending loads for the smaller 3022 // types up to next power-2 result type, and then combine the two larger 3023 // result values together, before truncating back down to the non-pow-2 3024 // type. 3025 // E.g. 
v1 = i24 load => 3026 // v2 = i32 zextload (2 byte) 3027 // v3 = i32 load (1 byte) 3028 // v4 = i32 shl v3, 16 3029 // v5 = i32 or v4, v2 3030 // v1 = i24 trunc v5 3031 // By doing this we generate the correct truncate which should get 3032 // combined away as an artifact with a matching extend. 3033 3034 uint64_t LargeSplitSize, SmallSplitSize; 3035 3036 if (!isPowerOf2_32(MemSizeInBits)) { 3037 // This load needs splitting into power of 2 sized loads. 3038 LargeSplitSize = PowerOf2Floor(MemSizeInBits); 3039 SmallSplitSize = MemSizeInBits - LargeSplitSize; 3040 } else { 3041 // This is already a power of 2, but we still need to split this in half. 3042 // 3043 // Assume we're being asked to decompose an unaligned load. 3044 // TODO: If this requires multiple splits, handle them all at once. 3045 auto &Ctx = MF.getFunction().getContext(); 3046 if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO)) 3047 return UnableToLegalize; 3048 3049 SmallSplitSize = LargeSplitSize = MemSizeInBits / 2; 3050 } 3051 3052 if (MemTy.isVector()) { 3053 // TODO: Handle vector extloads 3054 if (MemTy != DstTy) 3055 return UnableToLegalize; 3056 3057 // TODO: We can do better than scalarizing the vector and at least split it 3058 // in half. 3059 return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType()); 3060 } 3061 3062 MachineMemOperand *LargeMMO = 3063 MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8); 3064 MachineMemOperand *SmallMMO = 3065 MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8); 3066 3067 LLT PtrTy = MRI.getType(PtrReg); 3068 unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits()); 3069 LLT AnyExtTy = LLT::scalar(AnyExtSize); 3070 auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy, 3071 PtrReg, *LargeMMO); 3072 3073 auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), 3074 LargeSplitSize / 8); 3075 Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy); 3076 auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst); 3077 auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy, 3078 SmallPtr, *SmallMMO); 3079 3080 auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize); 3081 auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt); 3082 3083 if (AnyExtTy == DstTy) 3084 MIRBuilder.buildOr(DstReg, Shift, LargeLoad); 3085 else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) { 3086 auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad); 3087 MIRBuilder.buildTrunc(DstReg, {Or}); 3088 } else { 3089 assert(DstTy.isPointer() && "expected pointer"); 3090 auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad); 3091 3092 // FIXME: We currently consider this to be illegal for non-integral address 3093 // spaces, but we still need a way to reinterpret the bits. 3094 MIRBuilder.buildIntToPtr(DstReg, Or); 3095 } 3096 3097 LoadMI.eraseFromParent(); 3098 return Legalized; 3099 } 3100 3101 LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) { 3102 // Lower a non-power of 2 store into multiple pow-2 stores. 3103 // E.g. split an i24 store into an i16 store + i8 store. 3104 // We do this by first extending the stored value to the next largest power 3105 // of 2 type, and then using truncating stores to store the components. 3106 // By doing this, as with G_LOAD, we generate an extend that can be 3107 // artifact-combined away instead of leaving behind extracts.
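// A concrete sketch (illustrative; assumes a 64-bit little-endian target,
// register names are arbitrary): G_STORE %val:_(s24), %ptr:_(p0) :: (store (s24))
// becomes:
//
// %ext:_(s32) = G_ANYEXT %val
// %c16:_(s32) = G_CONSTANT i32 16
// %hi:_(s32) = G_LSHR %ext, %c16
// G_STORE %ext, %ptr :: (store (s16)) ; truncating store of the low half
// %c2:_(s64) = G_CONSTANT i64 2
// %ptr2:_(p0) = G_PTR_ADD %ptr, %c2
// G_STORE %hi, %ptr2 :: (store (s8)) ; truncating store of the high byte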
3108 Register SrcReg = StoreMI.getValueReg(); 3109 Register PtrReg = StoreMI.getPointerReg(); 3110 LLT SrcTy = MRI.getType(SrcReg); 3111 MachineFunction &MF = MIRBuilder.getMF(); 3112 MachineMemOperand &MMO = **StoreMI.memoperands_begin(); 3113 LLT MemTy = MMO.getMemoryType(); 3114 3115 unsigned StoreWidth = MemTy.getSizeInBits(); 3116 unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes(); 3117 3118 if (StoreWidth != StoreSizeInBits) { 3119 if (SrcTy.isVector()) 3120 return UnableToLegalize; 3121 3122 // Promote to a byte-sized store with upper bits zero if not 3123 // storing an integral number of bytes. For example, promote 3124 // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1) 3125 LLT WideTy = LLT::scalar(StoreSizeInBits); 3126 3127 if (StoreSizeInBits > SrcTy.getSizeInBits()) { 3128 // Avoid creating a store with a narrower source than result. 3129 SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0); 3130 SrcTy = WideTy; 3131 } 3132 3133 auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth); 3134 3135 MachineMemOperand *NewMMO = 3136 MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy); 3137 MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO); 3138 StoreMI.eraseFromParent(); 3139 return Legalized; 3140 } 3141 3142 if (MemTy.isVector()) { 3143 // TODO: Handle vector trunc stores 3144 if (MemTy != SrcTy) 3145 return UnableToLegalize; 3146 3147 // TODO: We can do better than scalarizing the vector and at least split it 3148 // in half. 3149 return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType()); 3150 } 3151 3152 unsigned MemSizeInBits = MemTy.getSizeInBits(); 3153 uint64_t LargeSplitSize, SmallSplitSize; 3154 3155 if (!isPowerOf2_32(MemSizeInBits)) { 3156 LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits()); 3157 SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize; 3158 } else { 3159 auto &Ctx = MF.getFunction().getContext(); 3160 if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO)) 3161 return UnableToLegalize; // Don't know what we're being asked to do. 3162 3163 SmallSplitSize = LargeSplitSize = MemSizeInBits / 2; 3164 } 3165 3166 // Extend to the next pow-2. If this store was itself the result of lowering, 3167 // e.g. an s56 store being broken into s32 + s24, we might have a stored type 3168 // that's wider than the stored size. 3169 unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits()); 3170 const LLT NewSrcTy = LLT::scalar(AnyExtSize); 3171 3172 if (SrcTy.isPointer()) { 3173 const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits()); 3174 SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0); 3175 } 3176 3177 auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg); 3178 3179 // Obtain the smaller value by shifting away the larger value. 3180 auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize); 3181 auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt); 3182 3183 // Generate the PtrAdd and truncating stores. 
3184 LLT PtrTy = MRI.getType(PtrReg); 3185 auto OffsetCst = MIRBuilder.buildConstant( 3186 LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8); 3187 auto SmallPtr = 3188 MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst); 3189 3190 MachineMemOperand *LargeMMO = 3191 MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8); 3192 MachineMemOperand *SmallMMO = 3193 MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8); 3194 MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO); 3195 MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO); 3196 StoreMI.eraseFromParent(); 3197 return Legalized; 3198 } 3199 3200 LegalizerHelper::LegalizeResult 3201 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) { 3202 switch (MI.getOpcode()) { 3203 case TargetOpcode::G_LOAD: { 3204 if (TypeIdx != 0) 3205 return UnableToLegalize; 3206 MachineMemOperand &MMO = **MI.memoperands_begin(); 3207 3208 // Not sure how to interpret a bitcast of an extending load. 3209 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits()) 3210 return UnableToLegalize; 3211 3212 Observer.changingInstr(MI); 3213 bitcastDst(MI, CastTy, 0); 3214 MMO.setType(CastTy); 3215 Observer.changedInstr(MI); 3216 return Legalized; 3217 } 3218 case TargetOpcode::G_STORE: { 3219 if (TypeIdx != 0) 3220 return UnableToLegalize; 3221 3222 MachineMemOperand &MMO = **MI.memoperands_begin(); 3223 3224 // Not sure how to interpret a bitcast of a truncating store. 3225 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits()) 3226 return UnableToLegalize; 3227 3228 Observer.changingInstr(MI); 3229 bitcastSrc(MI, CastTy, 0); 3230 MMO.setType(CastTy); 3231 Observer.changedInstr(MI); 3232 return Legalized; 3233 } 3234 case TargetOpcode::G_SELECT: { 3235 if (TypeIdx != 0) 3236 return UnableToLegalize; 3237 3238 if (MRI.getType(MI.getOperand(1).getReg()).isVector()) { 3239 LLVM_DEBUG( 3240 dbgs() << "bitcast action not implemented for vector select\n"); 3241 return UnableToLegalize; 3242 } 3243 3244 Observer.changingInstr(MI); 3245 bitcastSrc(MI, CastTy, 2); 3246 bitcastSrc(MI, CastTy, 3); 3247 bitcastDst(MI, CastTy, 0); 3248 Observer.changedInstr(MI); 3249 return Legalized; 3250 } 3251 case TargetOpcode::G_AND: 3252 case TargetOpcode::G_OR: 3253 case TargetOpcode::G_XOR: { 3254 Observer.changingInstr(MI); 3255 bitcastSrc(MI, CastTy, 1); 3256 bitcastSrc(MI, CastTy, 2); 3257 bitcastDst(MI, CastTy, 0); 3258 Observer.changedInstr(MI); 3259 return Legalized; 3260 } 3261 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 3262 return bitcastExtractVectorElt(MI, TypeIdx, CastTy); 3263 case TargetOpcode::G_INSERT_VECTOR_ELT: 3264 return bitcastInsertVectorElt(MI, TypeIdx, CastTy); 3265 default: 3266 return UnableToLegalize; 3267 } 3268 } 3269 3270 // Legalize an instruction by changing the opcode in place. 3271 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) { 3272 Observer.changingInstr(MI); 3273 MI.setDesc(MIRBuilder.getTII().get(NewOpcode)); 3274 Observer.changedInstr(MI); 3275 } 3276 3277 LegalizerHelper::LegalizeResult 3278 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { 3279 using namespace TargetOpcode; 3280 3281 switch(MI.getOpcode()) { 3282 default: 3283 return UnableToLegalize; 3284 case TargetOpcode::G_BITCAST: 3285 return lowerBitcast(MI); 3286 case TargetOpcode::G_SREM: 3287 case TargetOpcode::G_UREM: { 3288 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3289 auto Quot = 3290 MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? 
G_SDIV : G_UDIV, {Ty}, 3291 {MI.getOperand(1), MI.getOperand(2)}); 3292 3293 auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2)); 3294 MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod); 3295 MI.eraseFromParent(); 3296 return Legalized; 3297 } 3298 case TargetOpcode::G_SADDO: 3299 case TargetOpcode::G_SSUBO: 3300 return lowerSADDO_SSUBO(MI); 3301 case TargetOpcode::G_UMULH: 3302 case TargetOpcode::G_SMULH: 3303 return lowerSMULH_UMULH(MI); 3304 case TargetOpcode::G_SMULO: 3305 case TargetOpcode::G_UMULO: { 3306 // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the 3307 // result. 3308 Register Res = MI.getOperand(0).getReg(); 3309 Register Overflow = MI.getOperand(1).getReg(); 3310 Register LHS = MI.getOperand(2).getReg(); 3311 Register RHS = MI.getOperand(3).getReg(); 3312 LLT Ty = MRI.getType(Res); 3313 3314 unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO 3315 ? TargetOpcode::G_SMULH 3316 : TargetOpcode::G_UMULH; 3317 3318 Observer.changingInstr(MI); 3319 const auto &TII = MIRBuilder.getTII(); 3320 MI.setDesc(TII.get(TargetOpcode::G_MUL)); 3321 MI.RemoveOperand(1); 3322 Observer.changedInstr(MI); 3323 3324 auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS}); 3325 auto Zero = MIRBuilder.buildConstant(Ty, 0); 3326 3327 // Move insert point forward so we can use the Res register if needed. 3328 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); 3329 3330 // For *signed* multiply, overflow is detected by checking: 3331 // (hi != (lo >> bitwidth-1)) 3332 if (Opcode == TargetOpcode::G_SMULH) { 3333 auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1); 3334 auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt); 3335 MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted); 3336 } else { 3337 MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero); 3338 } 3339 return Legalized; 3340 } 3341 case TargetOpcode::G_FNEG: { 3342 Register Res = MI.getOperand(0).getReg(); 3343 LLT Ty = MRI.getType(Res); 3344 3345 // TODO: Handle vector types once we are able to 3346 // represent them. 3347 if (Ty.isVector()) 3348 return UnableToLegalize; 3349 auto SignMask = 3350 MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits())); 3351 Register SubByReg = MI.getOperand(1).getReg(); 3352 MIRBuilder.buildXor(Res, SubByReg, SignMask); 3353 MI.eraseFromParent(); 3354 return Legalized; 3355 } 3356 case TargetOpcode::G_FSUB: { 3357 Register Res = MI.getOperand(0).getReg(); 3358 LLT Ty = MRI.getType(Res); 3359 3360 // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)). 3361 // First, check if G_FNEG is marked as Lower. If so, we may 3362 // end up with an infinite loop as G_FSUB is used to legalize G_FNEG. 3363 if (LI.getAction({G_FNEG, {Ty}}).Action == Lower) 3364 return UnableToLegalize; 3365 Register LHS = MI.getOperand(1).getReg(); 3366 Register RHS = MI.getOperand(2).getReg(); 3367 Register Neg = MRI.createGenericVirtualRegister(Ty); 3368 MIRBuilder.buildFNeg(Neg, RHS); 3369 MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags()); 3370 MI.eraseFromParent(); 3371 return Legalized; 3372 } 3373 case TargetOpcode::G_FMAD: 3374 return lowerFMad(MI); 3375 case TargetOpcode::G_FFLOOR: 3376 return lowerFFloor(MI); 3377 case TargetOpcode::G_INTRINSIC_ROUND: 3378 return lowerIntrinsicRound(MI); 3379 case TargetOpcode::G_INTRINSIC_ROUNDEVEN: { 3380 // Since round even is the assumed rounding mode for unconstrained FP 3381 // operations, rint and roundeven are the same operation. 
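    // e.g. both G_INTRINSIC_ROUNDEVEN and G_FRINT fold 2.5 to 2.0 when the
    // environment rounds to nearest with ties to even.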
3382 changeOpcode(MI, TargetOpcode::G_FRINT); 3383 return Legalized; 3384 } 3385 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: { 3386 Register OldValRes = MI.getOperand(0).getReg(); 3387 Register SuccessRes = MI.getOperand(1).getReg(); 3388 Register Addr = MI.getOperand(2).getReg(); 3389 Register CmpVal = MI.getOperand(3).getReg(); 3390 Register NewVal = MI.getOperand(4).getReg(); 3391 MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal, 3392 **MI.memoperands_begin()); 3393 MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal); 3394 MI.eraseFromParent(); 3395 return Legalized; 3396 } 3397 case TargetOpcode::G_LOAD: 3398 case TargetOpcode::G_SEXTLOAD: 3399 case TargetOpcode::G_ZEXTLOAD: 3400 return lowerLoad(cast<GAnyLoad>(MI)); 3401 case TargetOpcode::G_STORE: 3402 return lowerStore(cast<GStore>(MI)); 3403 case TargetOpcode::G_CTLZ_ZERO_UNDEF: 3404 case TargetOpcode::G_CTTZ_ZERO_UNDEF: 3405 case TargetOpcode::G_CTLZ: 3406 case TargetOpcode::G_CTTZ: 3407 case TargetOpcode::G_CTPOP: 3408 return lowerBitCount(MI); 3409 case G_UADDO: { 3410 Register Res = MI.getOperand(0).getReg(); 3411 Register CarryOut = MI.getOperand(1).getReg(); 3412 Register LHS = MI.getOperand(2).getReg(); 3413 Register RHS = MI.getOperand(3).getReg(); 3414 3415 MIRBuilder.buildAdd(Res, LHS, RHS); 3416 MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS); 3417 3418 MI.eraseFromParent(); 3419 return Legalized; 3420 } 3421 case G_UADDE: { 3422 Register Res = MI.getOperand(0).getReg(); 3423 Register CarryOut = MI.getOperand(1).getReg(); 3424 Register LHS = MI.getOperand(2).getReg(); 3425 Register RHS = MI.getOperand(3).getReg(); 3426 Register CarryIn = MI.getOperand(4).getReg(); 3427 LLT Ty = MRI.getType(Res); 3428 3429 auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS); 3430 auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn); 3431 MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn); 3432 MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, LHS); 3433 3434 MI.eraseFromParent(); 3435 return Legalized; 3436 } 3437 case G_USUBO: { 3438 Register Res = MI.getOperand(0).getReg(); 3439 Register BorrowOut = MI.getOperand(1).getReg(); 3440 Register LHS = MI.getOperand(2).getReg(); 3441 Register RHS = MI.getOperand(3).getReg(); 3442 3443 MIRBuilder.buildSub(Res, LHS, RHS); 3444 MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS); 3445 3446 MI.eraseFromParent(); 3447 return Legalized; 3448 } 3449 case G_USUBE: { 3450 Register Res = MI.getOperand(0).getReg(); 3451 Register BorrowOut = MI.getOperand(1).getReg(); 3452 Register LHS = MI.getOperand(2).getReg(); 3453 Register RHS = MI.getOperand(3).getReg(); 3454 Register BorrowIn = MI.getOperand(4).getReg(); 3455 const LLT CondTy = MRI.getType(BorrowOut); 3456 const LLT Ty = MRI.getType(Res); 3457 3458 auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS); 3459 auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn); 3460 MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn); 3461 3462 auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS); 3463 auto LHS_ULT_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS); 3464 MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS); 3465 3466 MI.eraseFromParent(); 3467 return Legalized; 3468 } 3469 case G_UITOFP: 3470 return lowerUITOFP(MI); 3471 case G_SITOFP: 3472 return lowerSITOFP(MI); 3473 case G_FPTOUI: 3474 return lowerFPTOUI(MI); 3475 case G_FPTOSI: 3476 return lowerFPTOSI(MI); 3477 case G_FPTRUNC: 3478 return lowerFPTRUNC(MI); 3479 case G_FPOWI: 3480 return 
lowerFPOWI(MI); 3481 case G_SMIN: 3482 case G_SMAX: 3483 case G_UMIN: 3484 case G_UMAX: 3485 return lowerMinMax(MI); 3486 case G_FCOPYSIGN: 3487 return lowerFCopySign(MI); 3488 case G_FMINNUM: 3489 case G_FMAXNUM: 3490 return lowerFMinNumMaxNum(MI); 3491 case G_MERGE_VALUES: 3492 return lowerMergeValues(MI); 3493 case G_UNMERGE_VALUES: 3494 return lowerUnmergeValues(MI); 3495 case TargetOpcode::G_SEXT_INREG: { 3496 assert(MI.getOperand(2).isImm() && "Expected immediate"); 3497 int64_t SizeInBits = MI.getOperand(2).getImm(); 3498 3499 Register DstReg = MI.getOperand(0).getReg(); 3500 Register SrcReg = MI.getOperand(1).getReg(); 3501 LLT DstTy = MRI.getType(DstReg); 3502 Register TmpRes = MRI.createGenericVirtualRegister(DstTy); 3503 3504 auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits); 3505 MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0)); 3506 MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0)); 3507 MI.eraseFromParent(); 3508 return Legalized; 3509 } 3510 case G_EXTRACT_VECTOR_ELT: 3511 case G_INSERT_VECTOR_ELT: 3512 return lowerExtractInsertVectorElt(MI); 3513 case G_SHUFFLE_VECTOR: 3514 return lowerShuffleVector(MI); 3515 case G_DYN_STACKALLOC: 3516 return lowerDynStackAlloc(MI); 3517 case G_EXTRACT: 3518 return lowerExtract(MI); 3519 case G_INSERT: 3520 return lowerInsert(MI); 3521 case G_BSWAP: 3522 return lowerBswap(MI); 3523 case G_BITREVERSE: 3524 return lowerBitreverse(MI); 3525 case G_READ_REGISTER: 3526 case G_WRITE_REGISTER: 3527 return lowerReadWriteRegister(MI); 3528 case G_UADDSAT: 3529 case G_USUBSAT: { 3530 // Try to make a reasonable guess about which lowering strategy to use. The 3531 // target can override this with custom lowering and calling the 3532 // implementation functions. 3533 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3534 if (LI.isLegalOrCustom({G_UMIN, Ty})) 3535 return lowerAddSubSatToMinMax(MI); 3536 return lowerAddSubSatToAddoSubo(MI); 3537 } 3538 case G_SADDSAT: 3539 case G_SSUBSAT: { 3540 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3541 3542 // FIXME: It would probably make more sense to see if G_SADDO is preferred, 3543 // since it's a shorter expansion. However, we would need to figure out the 3544 // preferred boolean type for the carry out for the query. 3545 if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty})) 3546 return lowerAddSubSatToMinMax(MI); 3547 return lowerAddSubSatToAddoSubo(MI); 3548 } 3549 case G_SSHLSAT: 3550 case G_USHLSAT: 3551 return lowerShlSat(MI); 3552 case G_ABS: 3553 return lowerAbsToAddXor(MI); 3554 case G_SELECT: 3555 return lowerSelect(MI); 3556 case G_SDIVREM: 3557 case G_UDIVREM: 3558 return lowerDIVREM(MI); 3559 case G_FSHL: 3560 case G_FSHR: 3561 return lowerFunnelShift(MI); 3562 case G_ROTL: 3563 case G_ROTR: 3564 return lowerRotate(MI); 3565 case G_MEMSET: 3566 case G_MEMCPY: 3567 case G_MEMMOVE: 3568 return lowerMemCpyFamily(MI); 3569 case G_MEMCPY_INLINE: 3570 return lowerMemcpyInline(MI); 3571 GISEL_VECREDUCE_CASES_NONSEQ 3572 return lowerVectorReduction(MI); 3573 } 3574 } 3575 3576 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty, 3577 Align MinAlign) const { 3578 // FIXME: We're missing a way to go back from LLT to llvm::Type to query the 3579 // datalayout for the preferred alignment. Also there should be a target hook 3580 // for this to allow targets to reduce the alignment and ignore the 3581 // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of 3582 // the type. 
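  // For illustration: an s64 temporary yields Align(8), while an s96 one
  // (12 bytes) rounds up to Align(16), possibly over-aligned relative to what
  // the datalayout would prefer.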
  return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
}

MachineInstrBuilder
LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
                                      MachinePointerInfo &PtrInfo) {
  MachineFunction &MF = MIRBuilder.getMF();
  const DataLayout &DL = MIRBuilder.getDataLayout();
  int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);

  unsigned AddrSpace = DL.getAllocaAddrSpace();
  LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));

  PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
  return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
}

static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
                                        LLT VecTy) {
  int64_t IdxVal;
  if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
    return IdxReg;

  LLT IdxTy = B.getMRI()->getType(IdxReg);
  unsigned NElts = VecTy.getNumElements();
  if (isPowerOf2_32(NElts)) {
    APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
    return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
  }

  return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
      .getReg(0);
}

Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
                                                  Register Index) {
  LLT EltTy = VecTy.getElementType();

  // Calculate the element offset and add it to the pointer.
  unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
  assert(EltSize * 8 == EltTy.getSizeInBits() &&
         "Converting bits to bytes lost precision");

  Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);

  LLT IdxTy = MRI.getType(Index);
  auto Mul = MIRBuilder.buildMul(IdxTy, Index,
                                 MIRBuilder.buildConstant(IdxTy, EltSize));

  LLT PtrTy = MRI.getType(VecPtr);
  return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
}

#ifndef NDEBUG
/// Check that all vector operands have the same number of elements. Other
/// operands should be listed in \p NonVecOpIndices.
static bool hasSameNumEltsOnAllVectorOperands(
    GenericMachineInstr &MI, MachineRegisterInfo &MRI,
    std::initializer_list<unsigned> NonVecOpIndices) {
  if (MI.getNumMemOperands() != 0)
    return false;

  LLT VecTy = MRI.getType(MI.getReg(0));
  if (!VecTy.isVector())
    return false;
  unsigned NumElts = VecTy.getNumElements();

  for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
    MachineOperand &Op = MI.getOperand(OpIdx);
    if (!Op.isReg()) {
      if (!is_contained(NonVecOpIndices, OpIdx))
        return false;
      continue;
    }

    LLT Ty = MRI.getType(Op.getReg());
    if (!Ty.isVector()) {
      if (!is_contained(NonVecOpIndices, OpIdx))
        return false;
      continue;
    }

    if (Ty.getNumElements() != NumElts)
      return false;
  }

  return true;
}
#endif

/// Fill \p DstOps with DstOps that, combined, cover the same number of
/// elements as \p Ty. These DstOps are either of scalar type when
/// \p NumElts = 1, or are vectors with \p NumElts elements. When
/// Ty.getNumElements() is not a multiple of \p NumElts, the last DstOp
/// (leftover) has fewer than \p NumElts elements.
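///
/// e.g. for Ty = <7 x s8> and NumElts = 2 (an illustrative sketch), DstOps
/// becomes { <2 x s8>, <2 x s8>, <2 x s8>, s8 }: three full pieces plus an
/// s8 leftover.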
static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
                       unsigned NumElts) {
  LLT LeftoverTy;
  assert(Ty.isVector() && "Expected vector type");
  LLT EltTy = Ty.getElementType();
  LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
  int NumParts, NumLeftover;
  std::tie(NumParts, NumLeftover) =
      getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);

  assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
  for (int i = 0; i < NumParts; ++i) {
    DstOps.push_back(NarrowTy);
  }

  if (LeftoverTy.isValid()) {
    assert(NumLeftover == 1 && "expected exactly one leftover");
    DstOps.push_back(LeftoverTy);
  }
}

/// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N SrcOps
/// made from \p Op depending on operand type.
static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
                           MachineOperand &Op) {
  for (unsigned i = 0; i < N; ++i) {
    if (Op.isReg())
      Ops.push_back(Op.getReg());
    else if (Op.isImm())
      Ops.push_back(Op.getImm());
    else if (Op.isPredicate())
      Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
    else
      llvm_unreachable("Unsupported type");
  }
}

// Handle splitting vector operations which need to have the same number of
// elements in each type index, but each type index may have a different
// element type.
//
// e.g. <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
//      <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//      <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//
// Also handles some irregular breakdown cases,
// e.g. <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
//      <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//             s64 = G_SHL s64, s32
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMultiEltType(
    GenericMachineInstr &MI, unsigned NumElts,
    std::initializer_list<unsigned> NonVecOpIndices) {
  assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
         "Non-compatible opcode or not specified non-vector operands");
  unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();

  unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
  unsigned NumDefs = MI.getNumDefs();

  // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
  // Build instructions with DstOps so that an instruction found by CSE can be
  // used directly; CSE copies the found instruction into the given vreg when
  // building with a vreg destination.
  SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
  // Output registers will be taken from created instructions.
  SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
  for (unsigned i = 0; i < NumDefs; ++i) {
    makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts);
  }

  // Split vector input operands into sub-vectors with NumElts elts + Leftover.
  // Operands listed in NonVecOpIndices will be used as is without splitting;
  // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
  // scalar condition (op 1), immediate in sext_inreg (op 2).
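  //
  // e.g. for G_ICMP (a sketch; register names are hypothetical):
  //   %d:_(<4 x s1>) = G_ICMP intpred(eq), %a:_(<4 x s32>), %b
  // splits with the predicate broadcast to both pieces:
  //   %d0:_(<2 x s1>) = G_ICMP intpred(eq), %a0:_(<2 x s32>), %b0
  //   %d1:_(<2 x s1>) = G_ICMP intpred(eq), %a1:_(<2 x s32>), %b1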
3751 SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs); 3752 for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands(); 3753 ++UseIdx, ++UseNo) { 3754 if (is_contained(NonVecOpIndices, UseIdx)) { 3755 broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(), 3756 MI.getOperand(UseIdx)); 3757 } else { 3758 SmallVector<Register, 8> SplitPieces; 3759 extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces); 3760 for (auto Reg : SplitPieces) 3761 InputOpsPieces[UseNo].push_back(Reg); 3762 } 3763 } 3764 3765 unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0; 3766 3767 // Take i-th piece of each input operand split and build sub-vector/scalar 3768 // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s). 3769 for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) { 3770 SmallVector<DstOp, 2> Defs; 3771 for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo) 3772 Defs.push_back(OutputOpsPieces[DstNo][i]); 3773 3774 SmallVector<SrcOp, 3> Uses; 3775 for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo) 3776 Uses.push_back(InputOpsPieces[InputNo][i]); 3777 3778 auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags()); 3779 for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo) 3780 OutputRegs[DstNo].push_back(I.getReg(DstNo)); 3781 } 3782 3783 // Merge small outputs into MI's output for each def operand. 3784 if (NumLeftovers) { 3785 for (unsigned i = 0; i < NumDefs; ++i) 3786 mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]); 3787 } else { 3788 for (unsigned i = 0; i < NumDefs; ++i) 3789 MIRBuilder.buildMerge(MI.getReg(i), OutputRegs[i]); 3790 } 3791 3792 MI.eraseFromParent(); 3793 return Legalized; 3794 } 3795 3796 LegalizerHelper::LegalizeResult 3797 LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI, 3798 unsigned NumElts) { 3799 unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements(); 3800 3801 unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs(); 3802 unsigned NumDefs = MI.getNumDefs(); 3803 3804 SmallVector<DstOp, 8> OutputOpsPieces; 3805 SmallVector<Register, 8> OutputRegs; 3806 makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts); 3807 3808 // Instructions that perform register split will be inserted in basic block 3809 // where register is defined (basic block is in the next operand). 3810 SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2); 3811 for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands(); 3812 UseIdx += 2, ++UseNo) { 3813 MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB(); 3814 MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator()); 3815 extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo]); 3816 } 3817 3818 // Build PHIs with fewer elements. 3819 unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0; 3820 MIRBuilder.setInsertPt(*MI.getParent(), MI); 3821 for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) { 3822 auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI); 3823 Phi.addDef( 3824 MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI))); 3825 OutputRegs.push_back(Phi.getReg(0)); 3826 3827 for (unsigned j = 0; j < NumInputs / 2; ++j) { 3828 Phi.addUse(InputOpsPieces[j][i]); 3829 Phi.add(MI.getOperand(1 + j * 2 + 1)); 3830 } 3831 } 3832 3833 // Merge small outputs into MI's def. 
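  // e.g. a <7 x s8> PHI split with NumElts = 2 produces three <2 x s8> PHIs
  // plus an s8 leftover PHI (a sketch); the leftover path below remerges them
  // with mergeMixedSubvectors, while an even breakdown uses a single merge.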
  if (NumLeftovers) {
    mergeMixedSubvectors(MI.getReg(0), OutputRegs);
  } else {
    MIRBuilder.buildMerge(MI.getReg(0), OutputRegs);
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
                                                  unsigned TypeIdx,
                                                  LLT NarrowTy) {
  const int NumDst = MI.getNumOperands() - 1;
  const Register SrcReg = MI.getOperand(NumDst).getReg();
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  LLT SrcTy = MRI.getType(SrcReg);

  if (TypeIdx != 1 || NarrowTy == DstTy)
    return UnableToLegalize;

  // Requires compatible types. Otherwise SrcReg should have been defined by a
  // merge-like instruction that would get artifact-combined. Most likely the
  // instruction that defines SrcReg has to perform more/fewer elements
  // legalization compatible with NarrowTy.
  assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
  assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");

  if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
      (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
    return UnableToLegalize;

  // This is most likely DstTy (smaller than register size) packed in SrcTy
  // (larger than register size) and since the unmerge was not combined it will
  // be lowered to bit sequence extracts from a register. Unpack SrcTy to
  // NarrowTy (register size) pieces first. Then unpack each of the NarrowTy
  // pieces to DstTy.

  // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
  //
  // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
  // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
  // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
  auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerUnmerge = NumDst / NumUnmerge;

  for (int I = 0; I != NumUnmerge; ++I) {
    auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);

    for (int J = 0; J != PartsPerUnmerge; ++J)
      MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
    MIB.addUse(Unmerge.getReg(I));
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
                                          LLT NarrowTy) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
  // Requires compatible types. Otherwise the user of DstReg did not perform
  // the unmerge that should have been artifact-combined. Most likely the
  // instruction that uses DstReg has to do more/fewer elements legalization
  // compatible with NarrowTy.
  assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
  assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
  if (NarrowTy == SrcTy)
    return UnableToLegalize;

  // This attempts to lower part of an LCMTy merge/unmerge sequence. The
  // intended use is for old mir tests. Since the changes to more/fewer
  // elements legalization, it should no longer be possible to generate MIR
  // like this when starting from llvm-ir, because the LCMTy approach was
  // replaced with merge/unmerge to vector elements.
  if (TypeIdx == 1) {
    assert(SrcTy.isVector() && "Expected vector types");
    assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
    if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
        (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
      return UnableToLegalize;
    // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
    //
    // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
    // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
    // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
    // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
    // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
    // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11

    SmallVector<Register, 8> Elts;
    LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType();
    for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
      auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg());
      for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
        Elts.push_back(Unmerge.getReg(j));
    }

    SmallVector<Register, 8> NarrowTyElts;
    unsigned NumNarrowTyElts = NarrowTy.getNumElements();
    unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
    for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
         ++i, Offset += NumNarrowTyElts) {
      ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
      NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0));
    }

    MIRBuilder.buildMerge(DstReg, NarrowTyElts);
    MI.eraseFromParent();
    return Legalized;
  }

  assert(TypeIdx == 0 && "Bad type index");
  if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
      (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
    return UnableToLegalize;

  // This is most likely SrcTy (smaller than register size) packed in DstTy
  // (larger than register size) and since the merge was not combined it will
  // be lowered to bit sequence packing into a register. Merge SrcTy to
  // NarrowTy (register size) pieces first. Then merge each of the NarrowTy
  // pieces to DstTy.

  // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
  //
  // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
  // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
  // %0:_(DstTy) = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
  SmallVector<Register, 8> NarrowTyElts;
  unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
  unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
  unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
  for (unsigned i = 0; i < NumParts; ++i) {
    SmallVector<Register, 8> Sources;
    for (unsigned j = 0; j < NumElts; ++j)
      Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg());
    NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Sources).getReg(0));
  }

  MIRBuilder.buildMerge(DstReg, NarrowTyElts);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
                                                           unsigned TypeIdx,
                                                           LLT NarrowVecTy) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register InsertVal;
  bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;

  assert((IsInsert ?
TypeIdx == 0 : TypeIdx == 1) && "not a vector type index"); 3989 if (IsInsert) 3990 InsertVal = MI.getOperand(2).getReg(); 3991 3992 Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg(); 3993 3994 // TODO: Handle total scalarization case. 3995 if (!NarrowVecTy.isVector()) 3996 return UnableToLegalize; 3997 3998 LLT VecTy = MRI.getType(SrcVec); 3999 4000 // If the index is a constant, we can really break this down as you would 4001 // expect, and index into the target size pieces. 4002 int64_t IdxVal; 4003 auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI); 4004 if (MaybeCst) { 4005 IdxVal = MaybeCst->Value.getSExtValue(); 4006 // Avoid out of bounds indexing the pieces. 4007 if (IdxVal >= VecTy.getNumElements()) { 4008 MIRBuilder.buildUndef(DstReg); 4009 MI.eraseFromParent(); 4010 return Legalized; 4011 } 4012 4013 SmallVector<Register, 8> VecParts; 4014 LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec); 4015 4016 // Build a sequence of NarrowTy pieces in VecParts for this operand. 4017 LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts, 4018 TargetOpcode::G_ANYEXT); 4019 4020 unsigned NewNumElts = NarrowVecTy.getNumElements(); 4021 4022 LLT IdxTy = MRI.getType(Idx); 4023 int64_t PartIdx = IdxVal / NewNumElts; 4024 auto NewIdx = 4025 MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx); 4026 4027 if (IsInsert) { 4028 LLT PartTy = MRI.getType(VecParts[PartIdx]); 4029 4030 // Use the adjusted index to insert into one of the subvectors. 4031 auto InsertPart = MIRBuilder.buildInsertVectorElement( 4032 PartTy, VecParts[PartIdx], InsertVal, NewIdx); 4033 VecParts[PartIdx] = InsertPart.getReg(0); 4034 4035 // Recombine the inserted subvector with the others to reform the result 4036 // vector. 4037 buildWidenedRemergeToDst(DstReg, LCMTy, VecParts); 4038 } else { 4039 MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx); 4040 } 4041 4042 MI.eraseFromParent(); 4043 return Legalized; 4044 } 4045 4046 // With a variable index, we can't perform the operation in a smaller type, so 4047 // we're forced to expand this. 4048 // 4049 // TODO: We could emit a chain of compare/select to figure out which piece to 4050 // index. 4051 return lowerExtractInsertVectorElt(MI); 4052 } 4053 4054 LegalizerHelper::LegalizeResult 4055 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx, 4056 LLT NarrowTy) { 4057 // FIXME: Don't know how to handle secondary types yet. 4058 if (TypeIdx != 0) 4059 return UnableToLegalize; 4060 4061 // This implementation doesn't work for atomics. Give up instead of doing 4062 // something invalid. 4063 if (LdStMI.isAtomic()) 4064 return UnableToLegalize; 4065 4066 bool IsLoad = isa<GLoad>(LdStMI); 4067 Register ValReg = LdStMI.getReg(0); 4068 Register AddrReg = LdStMI.getPointerReg(); 4069 LLT ValTy = MRI.getType(ValReg); 4070 4071 // FIXME: Do we need a distinct NarrowMemory legalize action? 
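  // e.g. a G_ZEXTLOAD of 1 byte into s32 has a 32-bit value type but an
  // 8-bit memory size, so it is rejected just below rather than narrowed.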
  if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
    LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
    return UnableToLegalize;
  }

  int NumParts = -1;
  int NumLeftover = -1;
  LLT LeftoverTy;
  SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
  if (IsLoad) {
    std::tie(NumParts, NumLeftover) =
        getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
  } else {
    if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
                     NarrowLeftoverRegs)) {
      NumParts = NarrowRegs.size();
      NumLeftover = NarrowLeftoverRegs.size();
    }
  }

  if (NumParts == -1)
    return UnableToLegalize;

  LLT PtrTy = MRI.getType(AddrReg);
  const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());

  unsigned TotalSize = ValTy.getSizeInBits();

  // Split the load/store into PartTy sized pieces starting at Offset. If this
  // is a load, return the new registers in ValRegs. For a store, each element
  // of ValRegs should be PartTy. Returns the next offset that needs to be
  // handled.
  bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
  auto MMO = LdStMI.getMMO();
  auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
                             unsigned NumParts, unsigned Offset) -> unsigned {
    MachineFunction &MF = MIRBuilder.getMF();
    unsigned PartSize = PartTy.getSizeInBits();
    for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
         ++Idx) {
      unsigned ByteOffset = Offset / 8;
      Register NewAddrReg;

      MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);

      MachineMemOperand *NewMMO =
          MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);

      if (IsLoad) {
        Register Dst = MRI.createGenericVirtualRegister(PartTy);
        ValRegs.push_back(Dst);
        MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
      } else {
        MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
      }
      Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
    }

    return Offset;
  };

  unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
  unsigned HandledOffset =
      splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);

  // Handle the rest of the register if this isn't an even type breakdown.
  if (LeftoverTy.isValid())
    splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);

  if (IsLoad) {
    insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
                LeftoverTy, NarrowLeftoverRegs);
  }

  LdStMI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  using namespace TargetOpcode;
  GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI);
  unsigned NumElts = NarrowTy.isVector() ?
      NarrowTy.getNumElements() : 1;

  switch (MI.getOpcode()) {
  case G_IMPLICIT_DEF:
  case G_TRUNC:
  case G_AND:
  case G_OR:
  case G_XOR:
  case G_ADD:
  case G_SUB:
  case G_MUL:
  case G_PTR_ADD:
  case G_SMULH:
  case G_UMULH:
  case G_FADD:
  case G_FMUL:
  case G_FSUB:
  case G_FNEG:
  case G_FABS:
  case G_FCANONICALIZE:
  case G_FDIV:
  case G_FREM:
  case G_FMA:
  case G_FMAD:
  case G_FPOW:
  case G_FEXP:
  case G_FEXP2:
  case G_FLOG:
  case G_FLOG2:
  case G_FLOG10:
  case G_FNEARBYINT:
  case G_FCEIL:
  case G_FFLOOR:
  case G_FRINT:
  case G_INTRINSIC_ROUND:
  case G_INTRINSIC_ROUNDEVEN:
  case G_INTRINSIC_TRUNC:
  case G_FCOS:
  case G_FSIN:
  case G_FSQRT:
  case G_BSWAP:
  case G_BITREVERSE:
  case G_SDIV:
  case G_UDIV:
  case G_SREM:
  case G_UREM:
  case G_SDIVREM:
  case G_UDIVREM:
  case G_SMIN:
  case G_SMAX:
  case G_UMIN:
  case G_UMAX:
  case G_ABS:
  case G_FMINNUM:
  case G_FMAXNUM:
  case G_FMINNUM_IEEE:
  case G_FMAXNUM_IEEE:
  case G_FMINIMUM:
  case G_FMAXIMUM:
  case G_FSHL:
  case G_FSHR:
  case G_ROTL:
  case G_ROTR:
  case G_FREEZE:
  case G_SADDSAT:
  case G_SSUBSAT:
  case G_UADDSAT:
  case G_USUBSAT:
  case G_UMULO:
  case G_SMULO:
  case G_SHL:
  case G_LSHR:
  case G_ASHR:
  case G_SSHLSAT:
  case G_USHLSAT:
  case G_CTLZ:
  case G_CTLZ_ZERO_UNDEF:
  case G_CTTZ:
  case G_CTTZ_ZERO_UNDEF:
  case G_CTPOP:
  case G_FCOPYSIGN:
  case G_ZEXT:
  case G_SEXT:
  case G_ANYEXT:
  case G_FPEXT:
  case G_FPTRUNC:
  case G_SITOFP:
  case G_UITOFP:
  case G_FPTOSI:
  case G_FPTOUI:
  case G_INTTOPTR:
  case G_PTRTOINT:
  case G_ADDRSPACE_CAST:
    return fewerElementsVectorMultiEltType(GMI, NumElts);
  case G_ICMP:
  case G_FCMP:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cmp predicate*/});
  case G_SELECT:
    if (MRI.getType(MI.getOperand(1).getReg()).isVector())
      return fewerElementsVectorMultiEltType(GMI, NumElts);
    return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/});
  case G_PHI:
    return fewerElementsVectorPhi(GMI, NumElts);
  case G_UNMERGE_VALUES:
    return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
  case G_BUILD_VECTOR:
    assert(TypeIdx == 0 && "not a vector type index");
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_CONCAT_VECTORS:
    if (TypeIdx != 1) // TODO: This probably does work as expected already.
      return UnableToLegalize;
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_EXTRACT_VECTOR_ELT:
  case G_INSERT_VECTOR_ELT:
    return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
  case G_LOAD:
  case G_STORE:
    return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
  case G_SEXT_INREG:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/});
  GISEL_VECREDUCE_CASES_NONSEQ
    return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
  case G_SHUFFLE_VECTOR:
    return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
  default:
    return UnableToLegalize;
  }
}

LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
    MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register Src1Reg = MI.getOperand(1).getReg();
  Register Src2Reg = MI.getOperand(2).getReg();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  LLT DstTy = MRI.getType(DstReg);
  LLT Src1Ty = MRI.getType(Src1Reg);
  LLT Src2Ty = MRI.getType(Src2Reg);
  // The shuffle should be canonicalized by now.
  if (DstTy != Src1Ty)
    return UnableToLegalize;
  if (DstTy != Src2Ty)
    return UnableToLegalize;

  if (!isPowerOf2_32(DstTy.getNumElements()))
    return UnableToLegalize;

  // We only support splitting a shuffle into 2, so adjust NarrowTy
  // accordingly. Further legalization attempts will be needed to split it
  // further.
  NarrowTy =
      DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
  unsigned NewElts = NarrowTy.getNumElements();

  SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
  extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
  extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
  Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
                        SplitSrc2Regs[1]};

  Register Hi, Lo;

  // If Lo or Hi uses elements from at most two of the four input vectors, then
  // express it as a vector shuffle of those two inputs. Otherwise extract the
  // input elements by hand and construct the Lo/Hi output using a
  // BUILD_VECTOR.
  SmallVector<int, 16> Ops;
  for (unsigned High = 0; High < 2; ++High) {
    Register &Output = High ? Hi : Lo;

    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with UseBuildVector set.
    unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
    unsigned FirstMaskIdx = High * NewElts;
    bool UseBuildVector = false;
    for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
      // The mask element. This indexes into the input.
      int Idx = Mask[FirstMaskIdx + MaskOffset];

      // The input vector this mask element indexes into.
      unsigned Input = (unsigned)Idx / NewElts;

      if (Input >= array_lengthof(Inputs)) {
        // The mask element does not index into any input vector.
        Ops.push_back(-1);
        continue;
      }

      // Turn the index into an offset from the start of the input vector.
4347 Idx -= Input * NewElts; 4348 4349 // Find or create a shuffle vector operand to hold this input. 4350 unsigned OpNo; 4351 for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { 4352 if (InputUsed[OpNo] == Input) { 4353 // This input vector is already an operand. 4354 break; 4355 } else if (InputUsed[OpNo] == -1U) { 4356 // Create a new operand for this input vector. 4357 InputUsed[OpNo] = Input; 4358 break; 4359 } 4360 } 4361 4362 if (OpNo >= array_lengthof(InputUsed)) { 4363 // More than two input vectors used! Give up on trying to create a 4364 // shuffle vector. Insert all elements into a BUILD_VECTOR instead. 4365 UseBuildVector = true; 4366 break; 4367 } 4368 4369 // Add the mask index for the new shuffle vector. 4370 Ops.push_back(Idx + OpNo * NewElts); 4371 } 4372 4373 if (UseBuildVector) { 4374 LLT EltTy = NarrowTy.getElementType(); 4375 SmallVector<Register, 16> SVOps; 4376 4377 // Extract the input elements by hand. 4378 for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) { 4379 // The mask element. This indexes into the input. 4380 int Idx = Mask[FirstMaskIdx + MaskOffset]; 4381 4382 // The input vector this mask element indexes into. 4383 unsigned Input = (unsigned)Idx / NewElts; 4384 4385 if (Input >= array_lengthof(Inputs)) { 4386 // The mask element is "undef" or indexes off the end of the input. 4387 SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0)); 4388 continue; 4389 } 4390 4391 // Turn the index into an offset from the start of the input vector. 4392 Idx -= Input * NewElts; 4393 4394 // Extract the vector element by hand. 4395 SVOps.push_back(MIRBuilder 4396 .buildExtractVectorElement( 4397 EltTy, Inputs[Input], 4398 MIRBuilder.buildConstant(LLT::scalar(32), Idx)) 4399 .getReg(0)); 4400 } 4401 4402 // Construct the Lo/Hi output using a G_BUILD_VECTOR. 4403 Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0); 4404 } else if (InputUsed[0] == -1U) { 4405 // No input vectors were used! The result is undefined. 4406 Output = MIRBuilder.buildUndef(NarrowTy).getReg(0); 4407 } else { 4408 Register Op0 = Inputs[InputUsed[0]]; 4409 // If only one input was used, use an undefined vector for the other. 4410 Register Op1 = InputUsed[1] == -1U 4411 ? MIRBuilder.buildUndef(NarrowTy).getReg(0) 4412 : Inputs[InputUsed[1]]; 4413 // At least one input vector was used. Create a new shuffle vector. 
4414 Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0); 4415 } 4416 4417 Ops.clear(); 4418 } 4419 4420 MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi}); 4421 MI.eraseFromParent(); 4422 return Legalized; 4423 } 4424 4425 static unsigned getScalarOpcForReduction(unsigned Opc) { 4426 unsigned ScalarOpc; 4427 switch (Opc) { 4428 case TargetOpcode::G_VECREDUCE_FADD: 4429 ScalarOpc = TargetOpcode::G_FADD; 4430 break; 4431 case TargetOpcode::G_VECREDUCE_FMUL: 4432 ScalarOpc = TargetOpcode::G_FMUL; 4433 break; 4434 case TargetOpcode::G_VECREDUCE_FMAX: 4435 ScalarOpc = TargetOpcode::G_FMAXNUM; 4436 break; 4437 case TargetOpcode::G_VECREDUCE_FMIN: 4438 ScalarOpc = TargetOpcode::G_FMINNUM; 4439 break; 4440 case TargetOpcode::G_VECREDUCE_ADD: 4441 ScalarOpc = TargetOpcode::G_ADD; 4442 break; 4443 case TargetOpcode::G_VECREDUCE_MUL: 4444 ScalarOpc = TargetOpcode::G_MUL; 4445 break; 4446 case TargetOpcode::G_VECREDUCE_AND: 4447 ScalarOpc = TargetOpcode::G_AND; 4448 break; 4449 case TargetOpcode::G_VECREDUCE_OR: 4450 ScalarOpc = TargetOpcode::G_OR; 4451 break; 4452 case TargetOpcode::G_VECREDUCE_XOR: 4453 ScalarOpc = TargetOpcode::G_XOR; 4454 break; 4455 case TargetOpcode::G_VECREDUCE_SMAX: 4456 ScalarOpc = TargetOpcode::G_SMAX; 4457 break; 4458 case TargetOpcode::G_VECREDUCE_SMIN: 4459 ScalarOpc = TargetOpcode::G_SMIN; 4460 break; 4461 case TargetOpcode::G_VECREDUCE_UMAX: 4462 ScalarOpc = TargetOpcode::G_UMAX; 4463 break; 4464 case TargetOpcode::G_VECREDUCE_UMIN: 4465 ScalarOpc = TargetOpcode::G_UMIN; 4466 break; 4467 default: 4468 llvm_unreachable("Unhandled reduction"); 4469 } 4470 return ScalarOpc; 4471 } 4472 4473 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions( 4474 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) { 4475 unsigned Opc = MI.getOpcode(); 4476 assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD && 4477 Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL && 4478 "Sequential reductions not expected"); 4479 4480 if (TypeIdx != 1) 4481 return UnableToLegalize; 4482 4483 // The semantics of the normal non-sequential reductions allow us to freely 4484 // re-associate the operation. 4485 Register SrcReg = MI.getOperand(1).getReg(); 4486 LLT SrcTy = MRI.getType(SrcReg); 4487 Register DstReg = MI.getOperand(0).getReg(); 4488 LLT DstTy = MRI.getType(DstReg); 4489 4490 if (NarrowTy.isVector() && 4491 (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0)) 4492 return UnableToLegalize; 4493 4494 unsigned ScalarOpc = getScalarOpcForReduction(Opc); 4495 SmallVector<Register> SplitSrcs; 4496 // If NarrowTy is a scalar then we're being asked to scalarize. 4497 const unsigned NumParts = 4498 NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements() 4499 : SrcTy.getNumElements(); 4500 4501 extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs); 4502 if (NarrowTy.isScalar()) { 4503 if (DstTy != NarrowTy) 4504 return UnableToLegalize; // FIXME: handle implicit extensions. 4505 4506 if (isPowerOf2_32(NumParts)) { 4507 // Generate a tree of scalar operations to reduce the critical path. 
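      // e.g. with four pieces (a sketch): t0 = op s0, s1; t1 = op s2, s3;
      // result = op t0, t1, giving a depth of log2(NumParts) instead of the
      // NumParts - 1 steps of a sequential chain.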
4508 SmallVector<Register> PartialResults; 4509 unsigned NumPartsLeft = NumParts; 4510 while (NumPartsLeft > 1) { 4511 for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) { 4512 PartialResults.emplace_back( 4513 MIRBuilder 4514 .buildInstr(ScalarOpc, {NarrowTy}, 4515 {SplitSrcs[Idx], SplitSrcs[Idx + 1]}) 4516 .getReg(0)); 4517 } 4518 SplitSrcs = PartialResults; 4519 PartialResults.clear(); 4520 NumPartsLeft = SplitSrcs.size(); 4521 } 4522 assert(SplitSrcs.size() == 1); 4523 MIRBuilder.buildCopy(DstReg, SplitSrcs[0]); 4524 MI.eraseFromParent(); 4525 return Legalized; 4526 } 4527 // If we can't generate a tree, then just do sequential operations. 4528 Register Acc = SplitSrcs[0]; 4529 for (unsigned Idx = 1; Idx < NumParts; ++Idx) 4530 Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]}) 4531 .getReg(0); 4532 MIRBuilder.buildCopy(DstReg, Acc); 4533 MI.eraseFromParent(); 4534 return Legalized; 4535 } 4536 SmallVector<Register> PartialReductions; 4537 for (unsigned Part = 0; Part < NumParts; ++Part) { 4538 PartialReductions.push_back( 4539 MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0)); 4540 } 4541 4542 4543 // If the types involved are powers of 2, we can generate intermediate vector 4544 // ops, before generating a final reduction operation. 4545 if (isPowerOf2_32(SrcTy.getNumElements()) && 4546 isPowerOf2_32(NarrowTy.getNumElements())) { 4547 return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc); 4548 } 4549 4550 Register Acc = PartialReductions[0]; 4551 for (unsigned Part = 1; Part < NumParts; ++Part) { 4552 if (Part == NumParts - 1) { 4553 MIRBuilder.buildInstr(ScalarOpc, {DstReg}, 4554 {Acc, PartialReductions[Part]}); 4555 } else { 4556 Acc = MIRBuilder 4557 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]}) 4558 .getReg(0); 4559 } 4560 } 4561 MI.eraseFromParent(); 4562 return Legalized; 4563 } 4564 4565 LegalizerHelper::LegalizeResult 4566 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg, 4567 LLT SrcTy, LLT NarrowTy, 4568 unsigned ScalarOpc) { 4569 SmallVector<Register> SplitSrcs; 4570 // Split the sources into NarrowTy size pieces. 4571 extractParts(SrcReg, NarrowTy, 4572 SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs); 4573 // We're going to do a tree reduction using vector operations until we have 4574 // one NarrowTy size value left. 4575 while (SplitSrcs.size() > 1) { 4576 SmallVector<Register> PartialRdxs; 4577 for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) { 4578 Register LHS = SplitSrcs[Idx]; 4579 Register RHS = SplitSrcs[Idx + 1]; 4580 // Create the intermediate vector op. 4581 Register Res = 4582 MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0); 4583 PartialRdxs.push_back(Res); 4584 } 4585 SplitSrcs = std::move(PartialRdxs); 4586 } 4587 // Finally generate the requested NarrowTy based reduction. 
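  // e.g. reducing <8 x s32> with NarrowTy = <4 x s32> (an illustrative
  // sketch): one vector op combines the two halves, and the original
  // G_VECREDUCE_* then runs on the remaining <4 x s32>.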
4588 Observer.changingInstr(MI); 4589 MI.getOperand(1).setReg(SplitSrcs[0]); 4590 Observer.changedInstr(MI); 4591 return Legalized; 4592 } 4593 4594 LegalizerHelper::LegalizeResult 4595 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt, 4596 const LLT HalfTy, const LLT AmtTy) { 4597 4598 Register InL = MRI.createGenericVirtualRegister(HalfTy); 4599 Register InH = MRI.createGenericVirtualRegister(HalfTy); 4600 MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1)); 4601 4602 if (Amt.isZero()) { 4603 MIRBuilder.buildMerge(MI.getOperand(0), {InL, InH}); 4604 MI.eraseFromParent(); 4605 return Legalized; 4606 } 4607 4608 LLT NVT = HalfTy; 4609 unsigned NVTBits = HalfTy.getSizeInBits(); 4610 unsigned VTBits = 2 * NVTBits; 4611 4612 SrcOp Lo(Register(0)), Hi(Register(0)); 4613 if (MI.getOpcode() == TargetOpcode::G_SHL) { 4614 if (Amt.ugt(VTBits)) { 4615 Lo = Hi = MIRBuilder.buildConstant(NVT, 0); 4616 } else if (Amt.ugt(NVTBits)) { 4617 Lo = MIRBuilder.buildConstant(NVT, 0); 4618 Hi = MIRBuilder.buildShl(NVT, InL, 4619 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits)); 4620 } else if (Amt == NVTBits) { 4621 Lo = MIRBuilder.buildConstant(NVT, 0); 4622 Hi = InL; 4623 } else { 4624 Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt)); 4625 auto OrLHS = 4626 MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt)); 4627 auto OrRHS = MIRBuilder.buildLShr( 4628 NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits)); 4629 Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS); 4630 } 4631 } else if (MI.getOpcode() == TargetOpcode::G_LSHR) { 4632 if (Amt.ugt(VTBits)) { 4633 Lo = Hi = MIRBuilder.buildConstant(NVT, 0); 4634 } else if (Amt.ugt(NVTBits)) { 4635 Lo = MIRBuilder.buildLShr(NVT, InH, 4636 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits)); 4637 Hi = MIRBuilder.buildConstant(NVT, 0); 4638 } else if (Amt == NVTBits) { 4639 Lo = InH; 4640 Hi = MIRBuilder.buildConstant(NVT, 0); 4641 } else { 4642 auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt); 4643 4644 auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst); 4645 auto OrRHS = MIRBuilder.buildShl( 4646 NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits)); 4647 4648 Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS); 4649 Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst); 4650 } 4651 } else { 4652 if (Amt.ugt(VTBits)) { 4653 Hi = Lo = MIRBuilder.buildAShr( 4654 NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1)); 4655 } else if (Amt.ugt(NVTBits)) { 4656 Lo = MIRBuilder.buildAShr(NVT, InH, 4657 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits)); 4658 Hi = MIRBuilder.buildAShr(NVT, InH, 4659 MIRBuilder.buildConstant(AmtTy, NVTBits - 1)); 4660 } else if (Amt == NVTBits) { 4661 Lo = InH; 4662 Hi = MIRBuilder.buildAShr(NVT, InH, 4663 MIRBuilder.buildConstant(AmtTy, NVTBits - 1)); 4664 } else { 4665 auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt); 4666 4667 auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst); 4668 auto OrRHS = MIRBuilder.buildShl( 4669 NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits)); 4670 4671 Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS); 4672 Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst); 4673 } 4674 } 4675 4676 MIRBuilder.buildMerge(MI.getOperand(0), {Lo, Hi}); 4677 MI.eraseFromParent(); 4678 4679 return Legalized; 4680 } 4681 4682 // TODO: Optimize if constant shift amount. 
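// For a constant amount the split folds to direct shifts; e.g. narrowing an
// s64 G_SHL by 40 into s32 halves gives Lo = 0 and Hi = InL << 8
// (illustrative; see narrowScalarShiftByConstant above).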
4683 LegalizerHelper::LegalizeResult 4684 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx, 4685 LLT RequestedTy) { 4686 if (TypeIdx == 1) { 4687 Observer.changingInstr(MI); 4688 narrowScalarSrc(MI, RequestedTy, 2); 4689 Observer.changedInstr(MI); 4690 return Legalized; 4691 } 4692 4693 Register DstReg = MI.getOperand(0).getReg(); 4694 LLT DstTy = MRI.getType(DstReg); 4695 if (DstTy.isVector()) 4696 return UnableToLegalize; 4697 4698 Register Amt = MI.getOperand(2).getReg(); 4699 LLT ShiftAmtTy = MRI.getType(Amt); 4700 const unsigned DstEltSize = DstTy.getScalarSizeInBits(); 4701 if (DstEltSize % 2 != 0) 4702 return UnableToLegalize; 4703 4704 // Ignore the input type. We can only go to exactly half the size of the 4705 // input. If that isn't small enough, the resulting pieces will be further 4706 // legalized. 4707 const unsigned NewBitSize = DstEltSize / 2; 4708 const LLT HalfTy = LLT::scalar(NewBitSize); 4709 const LLT CondTy = LLT::scalar(1); 4710 4711 if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) { 4712 return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy, 4713 ShiftAmtTy); 4714 } 4715 4716 // TODO: Expand with known bits. 4717 4718 // Handle the fully general expansion by an unknown amount. 4719 auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize); 4720 4721 Register InL = MRI.createGenericVirtualRegister(HalfTy); 4722 Register InH = MRI.createGenericVirtualRegister(HalfTy); 4723 MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1)); 4724 4725 auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits); 4726 auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt); 4727 4728 auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0); 4729 auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits); 4730 auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero); 4731 4732 Register ResultRegs[2]; 4733 switch (MI.getOpcode()) { 4734 case TargetOpcode::G_SHL: { 4735 // Short: ShAmt < NewBitSize 4736 auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt); 4737 4738 auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack); 4739 auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt); 4740 auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr); 4741 4742 // Long: ShAmt >= NewBitSize 4743 auto LoL = MIRBuilder.buildConstant(HalfTy, 0); // Lo part is zero. 4744 auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part. 4745 4746 auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL); 4747 auto Hi = MIRBuilder.buildSelect( 4748 HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL)); 4749 4750 ResultRegs[0] = Lo.getReg(0); 4751 ResultRegs[1] = Hi.getReg(0); 4752 break; 4753 } 4754 case TargetOpcode::G_LSHR: 4755 case TargetOpcode::G_ASHR: { 4756 // Short: ShAmt < NewBitSize 4757 auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt}); 4758 4759 auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt); 4760 auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack); 4761 auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr); 4762 4763 // Long: ShAmt >= NewBitSize 4764 MachineInstrBuilder HiL; 4765 if (MI.getOpcode() == TargetOpcode::G_LSHR) { 4766 HiL = MIRBuilder.buildConstant(HalfTy, 0); // Hi part is zero. 4767 } else { 4768 auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1); 4769 HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt); // Sign of Hi part. 
    }
    auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
                                     {InH, AmtExcess});   // Lo from Hi part.

    auto Lo = MIRBuilder.buildSelect(
        HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));

    auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);

    ResultRegs[0] = Lo.getReg(0);
    ResultRegs[1] = Hi.getReg(0);
    break;
  }
  default:
    llvm_unreachable("not a shift");
  }

  MIRBuilder.buildMerge(DstReg, ResultRegs);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
                                       LLT MoreTy) {
  assert(TypeIdx == 0 && "Expecting only Idx 0");

  Observer.changingInstr(MI);
  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
    MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
    moreElementsVectorSrc(MI, MoreTy, I);
  }

  MachineBasicBlock &MBB = *MI.getParent();
  MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
  moreElementsVectorDst(MI, MoreTy, 0);
  Observer.changedInstr(MI);
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                    LLT MoreTy) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_IMPLICIT_DEF:
  case TargetOpcode::G_LOAD: {
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_UADDSAT:
  case TargetOpcode::G_USUBSAT:
  case TargetOpcode::G_SADDSAT:
  case TargetOpcode::G_SSUBSAT:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_FMINIMUM:
  case TargetOpcode::G_FMAXIMUM: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FSHR:
  case TargetOpcode::G_FSHL: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT:
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_INSERT:
  case TargetOpcode::G_FREEZE:
  case TargetOpcode::G_FNEG:
  case TargetOpcode::G_FABS:
  case TargetOpcode::G_BSWAP:
  case TargetOpcode::G_FCANONICALIZE:
  case TargetOpcode::G_SEXT_INREG:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_SELECT:
    if (TypeIdx != 0)
      return UnableToLegalize;
    if (MRI.getType(MI.getOperand(1).getReg()).isVector())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_UNMERGE_VALUES:
    return UnableToLegalize;
  case TargetOpcode::G_PHI:
    return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_BUILD_VECTOR: {
    SmallVector<SrcOp, 8> Elts;
    for (auto Op : MI.uses()) {
      Elts.push_back(Op.getReg());
    }

    for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
      Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType()));
    }

    MIRBuilder.buildDeleteTrailingVectorElements(
        MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts));
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_TRUNC: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  default:
    return UnableToLegalize;
  }
}

LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
                                           unsigned int TypeIdx, LLT MoreTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register Src1Reg = MI.getOperand(1).getReg();
  Register Src2Reg = MI.getOperand(2).getReg();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  LLT DstTy = MRI.getType(DstReg);
  LLT Src1Ty = MRI.getType(Src1Reg);
  LLT Src2Ty = MRI.getType(Src2Reg);
  unsigned NumElts = DstTy.getNumElements();
  unsigned WidenNumElts = MoreTy.getNumElements();

  // Expect a canonicalized shuffle.
  if (DstTy != Src1Ty || DstTy != Src2Ty)
    return UnableToLegalize;

  moreElementsVectorSrc(MI, MoreTy, 1);
  moreElementsVectorSrc(MI, MoreTy, 2);

  // Adjust mask based on new input vector length.
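  // Added illustrative example (not from the original source): widening a
  // <2 x s32> shuffle to <4 x s32> with mask [1, 2] keeps first-source index
  // 1 as-is, remaps second-source index 2 to 2 - NumElts + WidenNumElts
  // = 2 - 2 + 4 = 4 (element 0 of the widened second source), and pads with
  // undef lanes: [1, 4, -1, -1].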
  SmallVector<int, 16> NewMask;
  for (unsigned I = 0; I != NumElts; ++I) {
    int Idx = Mask[I];
    if (Idx < static_cast<int>(NumElts))
      NewMask.push_back(Idx);
    else
      NewMask.push_back(Idx - NumElts + WidenNumElts);
  }
  for (unsigned I = NumElts; I != WidenNumElts; ++I)
    NewMask.push_back(-1);
  moreElementsVectorDst(MI, MoreTy, 0);
  MIRBuilder.setInstrAndDebugLoc(MI);
  MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
                                MI.getOperand(1).getReg(),
                                MI.getOperand(2).getReg(), NewMask);
  MI.eraseFromParent();
  return Legalized;
}

void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
                                        ArrayRef<Register> Src1Regs,
                                        ArrayRef<Register> Src2Regs,
                                        LLT NarrowTy) {
  MachineIRBuilder &B = MIRBuilder;
  unsigned SrcParts = Src1Regs.size();
  unsigned DstParts = DstRegs.size();

  unsigned DstIdx = 0; // Low bits of the result.
  Register FactorSum =
      B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
  DstRegs[DstIdx] = FactorSum;

  unsigned CarrySumPrevDstIdx;
  SmallVector<Register, 4> Factors;

  for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
    // Collect low parts of muls for DstIdx.
    for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
         i <= std::min(DstIdx, SrcParts - 1); ++i) {
      MachineInstrBuilder Mul =
          B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
      Factors.push_back(Mul.getReg(0));
    }
    // Collect high parts of muls from previous DstIdx.
    for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
         i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
      MachineInstrBuilder Umulh =
          B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
      Factors.push_back(Umulh.getReg(0));
    }
    // Add CarrySum from additions calculated for previous DstIdx.
    if (DstIdx != 1) {
      Factors.push_back(CarrySumPrevDstIdx);
    }

    Register CarrySum;
    // Add all factors and accumulate all carries into CarrySum.
    if (DstIdx != DstParts - 1) {
      MachineInstrBuilder Uaddo =
          B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
      FactorSum = Uaddo.getReg(0);
      CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i) {
        MachineInstrBuilder Uaddo =
            B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
        FactorSum = Uaddo.getReg(0);
        MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
        CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
      }
    } else {
      // Since value for the next index is not calculated, neither is CarrySum.
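      // Added note (not from the original source): plain G_ADDs suffice for
      // this most significant part; any carry out of it would land beyond the
      // result width and is discarded anyway.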
      FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i)
        FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
    }

    CarrySumPrevDstIdx = CarrySum;
    DstRegs[DstIdx] = FactorSum;
    Factors.clear();
  }
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstType = MRI.getType(DstReg);
  // FIXME: add support for vector types
  if (DstType.isVector())
    return UnableToLegalize;

  unsigned Opcode = MI.getOpcode();
  unsigned OpO, OpE, OpF;
  switch (Opcode) {
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_ADD:
    OpO = TargetOpcode::G_UADDO;
    OpE = TargetOpcode::G_UADDE;
    OpF = TargetOpcode::G_UADDE;
    if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
      OpF = TargetOpcode::G_SADDE;
    break;
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SUB:
    OpO = TargetOpcode::G_USUBO;
    OpE = TargetOpcode::G_USUBE;
    OpF = TargetOpcode::G_USUBE;
    if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
      OpF = TargetOpcode::G_SSUBE;
    break;
  default:
    llvm_unreachable("Unexpected add/sub opcode!");
  }

  // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
  unsigned NumDefs = MI.getNumExplicitDefs();
  Register Src1 = MI.getOperand(NumDefs).getReg();
  Register Src2 = MI.getOperand(NumDefs + 1).getReg();
  Register CarryDst, CarryIn;
  if (NumDefs == 2)
    CarryDst = MI.getOperand(1).getReg();
  if (MI.getNumOperands() == NumDefs + 3)
    CarryIn = MI.getOperand(NumDefs + 2).getReg();

  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy, DummyTy;
  SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
  extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
  extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);

  int NarrowParts = Src1Regs.size();
  for (int I = 0, E = Src1Left.size(); I != E; ++I) {
    Src1Regs.push_back(Src1Left[I]);
    Src2Regs.push_back(Src2Left[I]);
  }
  DstRegs.reserve(Src1Regs.size());

  for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
    Register DstReg =
        MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
    Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
    // Forward the final carry-out to the destination register.
    if (i == e - 1 && CarryDst)
      CarryOut = CarryDst;

    if (!CarryIn) {
      MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i]});
    } else if (i == e - 1) {
      MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    } else {
      MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    }

    DstRegs.push_back(DstReg);
    CarryIn = CarryOut;
  }
  insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
              makeArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
              makeArrayRef(DstRegs).drop_front(NarrowParts));

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src1 = MI.getOperand(1).getReg();
  Register Src2 = MI.getOperand(2).getReg();

  LLT Ty = MRI.getType(DstReg);
  if (Ty.isVector())
    return UnableToLegalize;

  unsigned Size = Ty.getSizeInBits();
  unsigned NarrowSize = NarrowTy.getSizeInBits();
  if (Size % NarrowSize != 0)
    return UnableToLegalize;

  unsigned NumParts = Size / NarrowSize;
  bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
  unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);

  SmallVector<Register, 2> Src1Parts, Src2Parts;
  SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
  extractParts(Src1, NarrowTy, NumParts, Src1Parts);
  extractParts(Src2, NarrowTy, NumParts, Src2Parts);
  multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);

  // Take only the high half of the registers if this is a high multiply.
  ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
  MIRBuilder.buildMerge(DstReg, DstRegs);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
                                   LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;

  Register Src = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(Src);

  // If all finite floats fit into the narrowed integer type, we can just swap
  // out the result type. This is practically only useful for conversions from
  // half to at least 16 bits, so just handle the one case.
  if (SrcTy.getScalarType() != LLT::scalar(16) ||
      NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
    return UnableToLegalize;

  Observer.changingInstr(MI);
  narrowScalarDst(MI, NarrowTy, 0,
                  IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
  Observer.changedInstr(MI);
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
  // FIXME: add support for when SizeOp1 isn't an exact multiple of
  // NarrowSize.
  if (SizeOp1 % NarrowSize != 0)
    return UnableToLegalize;
  int NumParts = SizeOp1 / NarrowSize;

  SmallVector<Register, 2> SrcRegs, DstRegs;
  SmallVector<uint64_t, 2> Indexes;
  extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);

  Register OpReg = MI.getOperand(0).getReg();
  uint64_t OpStart = MI.getOperand(2).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int i = 0; i < NumParts; ++i) {
    unsigned SrcStart = i * NarrowSize;

    if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
      // No part of the extract uses this subregister, ignore it.
      continue;
    } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is extracted, forward the value.
      DstRegs.push_back(SrcRegs[i]);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset;
    uint64_t SegSize;
    if (OpStart < SrcStart) {
      ExtractOffset = 0;
      SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
    } else {
      ExtractOffset = OpStart - SrcStart;
      SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
    }

    Register SegReg = SrcRegs[i];
    if (ExtractOffset != 0 || SegSize != NarrowSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
    }

    DstRegs.push_back(SegReg);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (MRI.getType(DstReg).isVector())
    MIRBuilder.buildBuildVector(DstReg, DstRegs);
  else if (DstRegs.size() > 1)
    MIRBuilder.buildMerge(DstReg, DstRegs);
  else
    MIRBuilder.buildCopy(DstReg, DstRegs[0]);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
  SmallVector<uint64_t, 2> Indexes;
  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy;
  extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
               LeftoverRegs);

  for (Register Reg : LeftoverRegs)
    SrcRegs.push_back(Reg);

  uint64_t NarrowSize = NarrowTy.getSizeInBits();
  Register OpReg = MI.getOperand(2).getReg();
  uint64_t OpStart = MI.getOperand(3).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
    unsigned DstStart = I * NarrowSize;

    if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is defined by this insert, forward the new
      // value.
      DstRegs.push_back(OpReg);
      continue;
    }

    Register SrcReg = SrcRegs[I];
    if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
      // The leftover reg is smaller than NarrowTy, so we need to extend it.
      SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
    }

    if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
      // No part of the insert affects this subregister, forward the original.
      DstRegs.push_back(SrcReg);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset, InsertOffset;
    uint64_t SegSize;
    if (OpStart < DstStart) {
      InsertOffset = 0;
      ExtractOffset = DstStart - OpStart;
      SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
    } else {
      InsertOffset = OpStart - DstStart;
      ExtractOffset = 0;
      // Only the bits of OpReg landing in this part are taken, so clamp to
      // both the remaining room in the part and the size of OpReg itself.
      SegSize = std::min(NarrowSize - InsertOffset, OpSize);
    }

    Register SegReg = OpReg;
    if (ExtractOffset != 0 || SegSize != OpSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
    }

    Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
    MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
    DstRegs.push_back(DstReg);
  }

  uint64_t WideSize = DstRegs.size() * NarrowSize;
  Register DstReg = MI.getOperand(0).getReg();
  if (WideSize > RegTy.getSizeInBits()) {
    Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
    MIRBuilder.buildMerge(MergeReg, DstRegs);
    MIRBuilder.buildTrunc(DstReg, MergeReg);
  } else
    MIRBuilder.buildMerge(DstReg, DstRegs);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
                                   LLT NarrowTy) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);

  assert(MI.getNumOperands() == 3 && TypeIdx == 0);

  SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
  SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
  SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
  LLT LeftoverTy;
  if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
                    Src0Regs, Src0LeftoverRegs))
    return UnableToLegalize;

  LLT Unused;
  if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
                    Src1Regs, Src1LeftoverRegs))
    llvm_unreachable("inconsistent extractParts result");

  for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
    auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
                                      {Src0Regs[I], Src1Regs[I]});
    DstRegs.push_back(Inst.getReg(0));
  }

  for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
    auto Inst = MIRBuilder.buildInstr(
        MI.getOpcode(),
        {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
    DstLeftoverRegs.push_back(Inst.getReg(0));
  }

  insertParts(DstReg, DstTy, NarrowTy, DstRegs,
              LeftoverTy, DstLeftoverRegs);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
                                 LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  SmallVector<Register, 8> Parts;
  LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
  LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts,
                                  MI.getOpcode());
  buildWidenedRemergeToDst(DstReg, LCMTy, Parts);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register CondReg = MI.getOperand(1).getReg();
  LLT CondTy = MRI.getType(CondReg);
  if (CondTy.isVector()) // TODO: Handle vselect
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);

  SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
  SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
  SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
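  // Added illustrative note (not from the original source): for a s96 select
  // narrowed with NarrowTy = s32 there are three s32 pieces and no leftover;
  // for s80 there are two s32 pieces plus one s16 leftover piece, and
  // LeftoverTy below comes back as s16.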
  LLT LeftoverTy;
  if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
                    Src1Regs, Src1LeftoverRegs))
    return UnableToLegalize;

  LLT Unused;
  if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
                    Src2Regs, Src2LeftoverRegs))
    llvm_unreachable("inconsistent extractParts result");

  for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(NarrowTy,
                                         CondReg, Src1Regs[I], Src2Regs[I]);
    DstRegs.push_back(Select.getReg(0));
  }

  for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(
        LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
    DstLeftoverRegs.push_back(Select.getReg(0));
  }

  insertParts(DstReg, DstTy, NarrowTy, DstRegs,
              LeftoverTy, DstLeftoverRegs);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
    // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
    auto C_0 = B.buildConstant(NarrowTy, 0);
    auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
                                UnmergeSrc.getReg(1), C_0);
    auto LoCTLZ = IsUndef ?
        B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
        B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
    auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
    auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
    auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
    B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
    // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
    auto C_0 = B.buildConstant(NarrowTy, 0);
    auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
                                UnmergeSrc.getReg(0), C_0);
    auto HiCTTZ = IsUndef ?
        B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
        B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
    auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
    auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
    auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
    B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
                                   LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));

    auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
    auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
    MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitCount(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const auto &TII = MIRBuilder.getTII();
  auto isSupported = [this](const LegalityQuery &Q) {
    auto QAction = LI.getAction(Q).Action;
    return QAction == Legal || QAction == Libcall || QAction == Custom;
  };
  switch (Opc) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    // This trivially expands to CTLZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTLZ: {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);
    unsigned Len = SrcTy.getSizeInBits();

    if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
      auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
      auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we do this:
    // NewLen = NextPowerOf2(Len);
    // x = x | (x >> 1);
    // x = x | (x >> 2);
    // ...
    // x = x | (x >> 16);
    // x = x | (x >> 32); // for 64-bit input
    // up to NewLen/2
    // return Len - popcount(x);
    //
    // Ref: "Hacker's Delight" by Henry Warren
    Register Op = SrcReg;
    unsigned NewLen = PowerOf2Ceil(Len);
    for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
      auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
      auto MIBOp = MIRBuilder.buildOr(
          SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
      Op = MIBOp.getReg(0);
    }
    auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
    MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
                        MIBPop);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
    // This trivially expands to CTTZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTTZ: {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);

    unsigned Len = SrcTy.getSizeInBits();
    if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select for the
      // zero input case.
      auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
      auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we use: { return popcount(~x & (x - 1)); }
    // unless the target has ctlz but not ctpop, in which case we use:
    // { return Len - nlz(~x & (x - 1)); }
    // Ref: "Hacker's Delight" by Henry Warren
    auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
    auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
    auto MIBTmp = MIRBuilder.buildAnd(
        SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
    if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
        isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
      auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
      MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
                          MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
      MI.eraseFromParent();
      return Legalized;
    }
    MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
    MI.getOperand(1).setReg(MIBTmp.getReg(0));
    return Legalized;
  }
  case TargetOpcode::G_CTPOP: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(SrcReg);
    unsigned Size = Ty.getSizeInBits();
    MachineIRBuilder &B = MIRBuilder;

    // Count set bits in blocks of 2 bits. The default approach would be
    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
    // but we use the following formula instead:
    // B2Count = val - { (val >> 1) & 0x55555555 }
    // since it gives the same result in blocks of 2 with one instruction
    // fewer.
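    // Added worked example (not from the original source): for val = 0b1110
    // the 2-bit blocks are 11 and 10. (val >> 1) & 0b0101 = 0b0101, and
    // val - 0b0101 = 0b1001, i.e. the per-block counts 2 (0b10) and 1 (0b01).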
    auto C_1 = B.buildConstant(Ty, 1);
    auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
    APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
    auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
    auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
    auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);

    // To get the count in blocks of 4, add the values from adjacent blocks
    // of 2.
    // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
    auto C_2 = B.buildConstant(Ty, 2);
    auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
    APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
    auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
    auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
    auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
    auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);

    // For the count in blocks of 8 bits we don't have to mask the high 4 bits
    // before the addition, since each count sits in the range {0,...,8} and
    // 4 bits are enough to hold such values. After the addition the high
    // 4 bits still hold the count of set bits in the high 4-bit block; set
    // them to zero to get the 8-bit result.
    // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
    auto C_4 = B.buildConstant(Ty, 4);
    auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
    auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
    APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
    auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
    auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);

    assert(Size <= 128 &&
           "Scalar size is too large for CTPOP lower algorithm");
    // 8 bits can hold the CTPOP result of a 128-bit int or smaller.
    // Multiplying by this bitmask sets the 8 MSBs of ResTmp to the sum of all
    // of the B8Counts in the 8-bit blocks.
    auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
    auto ResTmp = B.buildMul(Ty, B8Count, MulMask);

    // Shift the count result from the 8 high bits down to the low bits.
    auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
    B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);

    MI.eraseFromParent();
    return Legalized;
  }
  }
}

// Check that (every element of) Reg is undef or not an exact multiple of BW.
static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
                                        Register Reg, unsigned BW) {
  return matchUnaryPredicate(
      MRI, Reg,
      [=](const Constant *C) {
        // Null constant here means an undef.
        const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
        return !CI || CI->getValue().urem(BW) != 0;
      },
      /*AllowUndefs*/ true);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  Register Z = MI.getOperand(3).getReg();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(Z);

  unsigned BW = Ty.getScalarSizeInBits();

  if (!isPowerOf2_32(BW))
    return UnableToLegalize;

  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
  unsigned RevOpcode =
      IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;

  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
    // fshl X, Y, Z -> fshr X, Y, -Z
    // fshr X, Y, Z -> fshl X, Y, -Z
    auto Zero = MIRBuilder.buildConstant(ShTy, 0);
    Z = MIRBuilder.buildSub(ShTy, Zero, Z).getReg(0);
  } else {
    // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
    // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
    auto One = MIRBuilder.buildConstant(ShTy, 1);
    if (IsFSHL) {
      Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
      X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
    } else {
      X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
      Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
    }

    Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
  }

  MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  Register Z = MI.getOperand(3).getReg();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(Z);

  const unsigned BW = Ty.getScalarSizeInBits();
  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;

  Register ShX, ShY;
  Register ShAmt, InvShAmt;

  // FIXME: Emit optimized urem by constant instead of letting it expand later.
  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
    // fshl: X << C | Y >> (BW - C)
    // fshr: X << (BW - C) | Y >> C
    // where C = Z % BW is not zero
    auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
    ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
    InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
    ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
    ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
  } else {
    // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
    // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
    auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
    if (isPowerOf2_32(BW)) {
      // Z % BW -> Z & (BW - 1)
      ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
      // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
      auto NotZ = MIRBuilder.buildNot(ShTy, Z);
      InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
    } else {
      auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
      ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
      InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
    }

    auto One = MIRBuilder.buildConstant(ShTy, 1);
    if (IsFSHL) {
      ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
      auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
      ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
    } else {
      auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
      ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
      ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
    }
  }

  MIRBuilder.buildOr(Dst, ShX, ShY);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
  // These operations approximately do the following (while avoiding undefined
  // shifts by BW):
  // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
  // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(MI.getOperand(3).getReg());

  bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
  unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;

  // TODO: Use smarter heuristic that accounts for vector legalization.
  if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
    return lowerFunnelShiftAsShifts(MI);

  // This only works for powers of 2; fall back to shifts if it fails.
  LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
  if (Result == UnableToLegalize)
    return lowerFunnelShiftAsShifts(MI);
  return Result;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Amt = MI.getOperand(2).getReg();
  LLT AmtTy = MRI.getType(Amt);
  auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
  bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
  unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
  auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
  MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Amt = MI.getOperand(2).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  LLT AmtTy = MRI.getType(Amt);

  unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
  bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;

  MIRBuilder.setInstrAndDebugLoc(MI);

  // If a rotate in the other direction is supported, use it.
  unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
  if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
      isPowerOf2_32(EltSizeInBits))
    return lowerRotateWithReverseRotate(MI);

  // If a funnel shift is supported, use it.
  unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  bool IsFShLegal = false;
  if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
      LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
    auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
                                Register R3) {
      MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
      MI.eraseFromParent();
      return Legalized;
    };
    // Prefer the same-direction funnel shift; otherwise negate the amount and
    // use the reverse-direction one.
    if (IsFShLegal) {
      return buildFunnelShift(FShOpc, Dst, Src, Amt);
    } else if (isPowerOf2_32(EltSizeInBits)) {
      Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
      return buildFunnelShift(RevFsh, Dst, Src, Amt);
    }
  }

  auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
  unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
  unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
  auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
  Register ShVal;
  Register RevShiftVal;
  if (isPowerOf2_32(EltSizeInBits)) {
    // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
    // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
    auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
    auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
    ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
    auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
    RevShiftVal =
        MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
  } else {
    // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
    // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
    auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
    auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
    ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
    auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
    auto One = MIRBuilder.buildConstant(AmtTy, 1);
    auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
    RevShiftVal =
        MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
  }
  MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
  MI.eraseFromParent();
  return Legalized;
}

// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
// representation.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);

  // unsigned cul2f(ulong u) {
  //   uint lz = clz(u);
  //   uint e = (u != 0) ? 127U + 63U - lz : 0;
  //   u = (u << lz) & 0x7fffffffffffffffUL;
  //   ulong t = u & 0xffffffffffUL;
  //   uint v = (e << 23) | (uint)(u >> 40);
  //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
  //   return as_float(v + r);
  // }

  auto Zero32 = MIRBuilder.buildConstant(S32, 0);
  auto Zero64 = MIRBuilder.buildConstant(S64, 0);

  auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);

  auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
  auto Sub = MIRBuilder.buildSub(S32, K, LZ);

  auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
  auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);

  auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
  auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);

  auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);

  auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
  auto T = MIRBuilder.buildAnd(S64, U, Mask1);

  auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
  auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
  auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));

  auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
  auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
  auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
  auto One = MIRBuilder.buildConstant(S32, 1);

  auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
  auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
  auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
  MIRBuilder.buildAdd(Dst, V, R);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (SrcTy == LLT::scalar(1)) {
    auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
    auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
    MIRBuilder.buildSelect(Dst, Src, True, False);
    MI.eraseFromParent();
    return Legalized;
  }

  if (SrcTy != LLT::scalar(64))
    return UnableToLegalize;

  if (DstTy == LLT::scalar(32)) {
    // TODO: SelectionDAG has several alternative expansions to port which may
    // be more reasonable depending on the available instructions. If a target
    // has sitofp, does not have CTLZ, or can efficiently use f64 as an
    // intermediate type, this is probably worse.
    return lowerU64ToF32BitOps(MI);
  }

  return UnableToLegalize;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  if (SrcTy == S1) {
    auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
    auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
    MIRBuilder.buildSelect(Dst, Src, True, False);
    MI.eraseFromParent();
    return Legalized;
  }

  if (SrcTy != S64)
    return UnableToLegalize;

  if (DstTy == S32) {
    // signed cl2f(long l) {
    //   long s = l >> 63;
    //   float r = cul2f((l + s) ^ s);
    //   return s ? -r : r;
    // }
    Register L = Src;
    auto SignBit = MIRBuilder.buildConstant(S64, 63);
    auto S = MIRBuilder.buildAShr(S64, L, SignBit);

    auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
    auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
    auto R = MIRBuilder.buildUITOFP(S32, Xor);

    auto RNeg = MIRBuilder.buildFNeg(S32, R);
    auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
                                            MIRBuilder.buildConstant(S64, 0));
    MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  if (SrcTy != S64 && SrcTy != S32)
    return UnableToLegalize;
  if (DstTy != S32 && DstTy != S64)
    return UnableToLegalize;

  // FPTOSI gives the same result as FPTOUI for positive signed integers.
  // FPTOUI needs to deal with fp values that convert to unsigned integers
  // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.

  APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
  APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
                                                : APFloat::IEEEdouble(),
                    APInt::getZero(SrcTy.getSizeInBits()));
  TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);

  MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);

  MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
  // For an fp value greater or equal to Threshold (2^Exp), we use FPTOSI on
  // (Value - 2^Exp) and add 2^Exp by setting the highest bit in the result
  // to 1.
  MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
  MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
  MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
  MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);

  const LLT S1 = LLT::scalar(1);

  MachineInstrBuilder FCMP =
      MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
  MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  // FIXME: Only f32 to i64 conversions are supported.
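  // Added illustrative walk-through (not from the original source) of the
  // expansion below for Src = 1.0f: the exponent field is 127, so
  // Exponent = 127 - 127 = 0 and R = 0x800000 (mantissa plus the implicit
  // leading one). Since Exponent is not greater than ExponentLoBit (23), the
  // select picks Srl = R >> (23 - 0) = 1, the sign is zero, and the final
  // select returns 1.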
  if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
    return UnableToLegalize;

  // Expand f32 -> i64 conversion.
  // This algorithm comes from compiler-rt's implementation of fixsfdi:
  // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c

  unsigned SrcEltBits = SrcTy.getScalarSizeInBits();

  auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
  auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);

  auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
  auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);

  auto SignMask = MIRBuilder.buildConstant(SrcTy,
                                           APInt::getSignMask(SrcEltBits));
  auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
  auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
  auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
  Sign = MIRBuilder.buildSExt(DstTy, Sign);

  auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
  auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
  auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);

  auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
  R = MIRBuilder.buildZExt(DstTy, R);

  auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
  auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
  auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
  auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);

  auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
  auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);

  const LLT S1 = LLT::scalar(1);
  auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
                                    S1, Exponent, ExponentLoBit);

  R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);

  auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
  auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);

  auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);

  auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
                                          S1, Exponent, ZeroSrcTy);

  auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
  MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);

  MI.eraseFromParent();
  return Legalized;
}

// f64 -> f16 conversion using round-to-nearest-even rounding mode.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
    return UnableToLegalize;

  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
  Register U = Unmerge.getReg(0);
  Register UH = Unmerge.getReg(1);

  auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
  E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));

  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
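  // Added illustrative note (not from the original source): the constant
  // added below is -1023 + 15 = -1008. For 1.0 the fp64 exponent field is
  // 1023, so E becomes 1023 - 1008 = 15, which is exactly the biased f16
  // exponent of 1.0.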
  E = MIRBuilder.buildAdd(
      S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));

  auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
  M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));

  auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
                                       MIRBuilder.buildConstant(S32, 0x1ff));
  MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);

  auto Zero = MIRBuilder.buildConstant(S32, 0);
  auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
  auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
  M = MIRBuilder.buildOr(S32, M, Lo40Set);

  // (M != 0 ? 0x0200 : 0) | 0x7c00;
  auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
  auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
  auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);

  auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
  auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);

  // N = M | (E << 12);
  auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
  auto N = MIRBuilder.buildOr(S32, M, EShl12);

  // B = clamp(1 - E, 0, 13);
  auto One = MIRBuilder.buildConstant(S32, 1);
  auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
  auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
  B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));

  auto SigSetHigh = MIRBuilder.buildOr(S32, M,
                                       MIRBuilder.buildConstant(S32, 0x1000));

  auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
  auto D0 = MIRBuilder.buildShl(S32, D, B);

  auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
                                               D0, SigSetHigh);
  auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
  D = MIRBuilder.buildOr(S32, D, D1);

  auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
  auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);

  auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
  V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));

  auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 3));
  auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);

  auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 5));
  auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);

  V1 = MIRBuilder.buildOr(S32, V0, V1);
  V = MIRBuilder.buildAdd(S32, V, V1);

  auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1,
                                       E, MIRBuilder.buildConstant(S32, 30));
  V = MIRBuilder.buildSelect(S32, CmpEGt30,
                             MIRBuilder.buildConstant(S32, 0x7c00), V);

  auto CmpEGt1039 = MIRBuilder.buildICmp(
      CmpInst::ICMP_EQ, S1, E, MIRBuilder.buildConstant(S32, 1039));
  V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);

  // Extract the sign bit.
  auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
  Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));

  // Insert the sign bit.
  V = MIRBuilder.buildOr(S32, Sign, V);

  MIRBuilder.buildTrunc(Dst, V);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  const LLT S64 = LLT::scalar(64);
  const LLT S16 = LLT::scalar(16);

  if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
    return lowerFPTRUNC_F64_TO_F16(MI);

  return UnableToLegalize;
}

// TODO: If RHS is a constant, SelectionDAGBuilder expands this into a
// multiplication tree.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(Dst);

  auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
  MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
  MI.eraseFromParent();
  return Legalized;
}

static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::G_SMIN:
    return CmpInst::ICMP_SLT;
  case TargetOpcode::G_SMAX:
    return CmpInst::ICMP_SGT;
  case TargetOpcode::G_UMIN:
    return CmpInst::ICMP_ULT;
  case TargetOpcode::G_UMAX:
    return CmpInst::ICMP_UGT;
  default:
    llvm_unreachable("not in integer min/max");
  }
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();

  const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
  LLT CmpType = MRI.getType(Dst).changeElementSize(1);

  auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
  MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();

  const LLT Src0Ty = MRI.getType(Src0);
  const LLT Src1Ty = MRI.getType(Src1);

  const int Src0Size = Src0Ty.getScalarSizeInBits();
  const int Src1Size = Src1Ty.getScalarSizeInBits();

  auto SignBitMask = MIRBuilder.buildConstant(
      Src0Ty, APInt::getSignMask(Src0Size));

  auto NotSignBitMask = MIRBuilder.buildConstant(
      Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));

  Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
  Register And1;
  if (Src0Ty == Src1Ty) {
    And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
  } else if (Src0Size > Src1Size) {
    auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
    auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
    auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
    And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
  } else {
    auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
    auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
    auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
    And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
  }

  // Be careful about setting nsz/nnan/ninf on every instruction, since the
  // constants are a nan and -0.0, but the final result should preserve
  // everything.
  unsigned Flags = MI.getFlags();
  MIRBuilder.buildOr(Dst, And0, And1, Flags);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
  unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
    TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(Dst);

  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    // Insert canonicalizes if it's possible we need to quiet to get correct
    // sNaN behavior.

    // Note this must be done here, and not as an optimization combine in the
    // absence of a dedicated quiet-snan instruction as we're using an
    // omni-purpose G_FCANONICALIZE.
    if (!isKnownNeverSNaN(Src0, MRI))
      Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);

    if (!isKnownNeverSNaN(Src1, MRI))
      Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
  }

  // If there are no nans, it's safe to simply replace this with the non-IEEE
  // version.
  MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
  // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
                                  Flags);
  MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  const unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(DstReg);
  const LLT CondTy = Ty.changeElementSize(1);

  // round(x) =>
  //  t = trunc(x);
  //  d = fabs(x - t);
  //  o = copysign(1.0f, x);
  //  return t + (d >= 0.5 ? o : 0.0);

  auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);

  auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
  auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
  auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
  auto One = MIRBuilder.buildFConstant(Ty, 1.0);
  auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
  auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);

  auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
                                  Flags);
  auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);

  MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFFloor(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);
  const LLT CondTy = Ty.changeElementSize(1);

  // result = trunc(src);
  // if (src < 0.0 && src != result)
  //   result += -1.0.

  auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
  auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);

  auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
                                  SrcReg, Zero, Flags);
  auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
                                      SrcReg, Trunc, Flags);
  auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
  auto AddVal = MIRBuilder.buildSITOFP(Ty, And);

  MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
  const unsigned NumOps = MI.getNumOperands();
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(Src0Reg);
  unsigned PartSize = SrcTy.getSizeInBits();

  LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
  Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);

  for (unsigned I = 2; I != NumOps; ++I) {
    const unsigned Offset = (I - 1) * PartSize;

    Register SrcReg = MI.getOperand(I).getReg();
    auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);

    Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
      MRI.createGenericVirtualRegister(WideTy);

    auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
    auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
    MIRBuilder.buildOr(NextResult, ResultReg, Shl);
    ResultReg = NextResult;
  }

  if (DstTy.isPointer()) {
    if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
          DstTy.getAddressSpace())) {
      LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
      return UnableToLegalize;
    }

    MIRBuilder.buildIntToPtr(DstReg, ResultReg);
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
  const unsigned NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (DstTy.isPointer())
    return UnableToLegalize; // TODO

  SrcReg = coerceToScalar(SrcReg);
  if (!SrcReg)
    return UnableToLegalize;

  // Expand scalarizing unmerge as bitcast to integer and shift.
  LLT IntTy = MRI.getType(SrcReg);

  MIRBuilder.buildTrunc(Dst0Reg, SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  unsigned Offset = DstSize;
  for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
    auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
    MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
  }

  MI.eraseFromParent();
  return Legalized;
}

/// Lower a vector extract or insert by writing the vector to a stack temporary
/// and reloading the element or vector.
///
/// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
///  =>
///  %stack_temp = G_FRAME_INDEX
///  G_STORE %vec, %stack_temp
///  %idx = clamp(%idx, %vec.getNumElements())
///  %element_ptr = G_PTR_ADD %stack_temp, %idx
///  %dst = G_LOAD %element_ptr
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register InsertVal;
  if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
    InsertVal = MI.getOperand(2).getReg();

  Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();

  LLT VecTy = MRI.getType(SrcVec);
  LLT EltTy = VecTy.getElementType();
  unsigned NumElts = VecTy.getNumElements();

  // Only handle in-bounds constant indices here; any other index falls
  // through to the stack-based lowering below, which clamps the index.
  int64_t IdxVal;
  if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal < NumElts) {
    SmallVector<Register, 8> SrcRegs;
    extractParts(SrcVec, EltTy, NumElts, SrcRegs);

    if (InsertVal) {
      SrcRegs[IdxVal] = MI.getOperand(2).getReg();
      MIRBuilder.buildMerge(DstReg, SrcRegs);
    } else {
      MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  if (!EltTy.isByteSized()) { // Not implemented.
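    // (Illustrative note, not part of the original source: this path is
    // reached for e.g. <4 x s1> predicate vectors, whose 1-bit elements
    // cannot be given byte-addressed stack slots.)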
    LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
    return UnableToLegalize;
  }

  unsigned EltBytes = EltTy.getSizeInBytes();
  Align VecAlign = getStackTemporaryAlignment(VecTy);
  Align EltAlign;

  MachinePointerInfo PtrInfo;
  auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()),
                                        VecAlign, PtrInfo);
  MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);

  // Get the pointer to the element, and be sure not to hit undefined behavior
  // if the index is out of bounds.
  Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);

  if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
    int64_t Offset = IdxVal * EltBytes;
    PtrInfo = PtrInfo.getWithOffset(Offset);
    EltAlign = commonAlignment(VecAlign, Offset);
  } else {
    // We lose information with a variable offset.
    EltAlign = getStackTemporaryAlignment(EltTy);
    PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
  }

  if (InsertVal) {
    // Write the inserted element.
    MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);

    // Reload the whole vector.
    MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
  } else {
    MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  LLT Src0Ty = MRI.getType(Src0Reg);
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(32);

  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();

  if (DstTy.isScalar()) {
    if (Src0Ty.isVector())
      return UnableToLegalize;

    // This is just a SELECT.
    assert(Mask.size() == 1 && "Expected a single mask element");
    Register Val;
    if (Mask[0] < 0 || Mask[0] > 1)
      Val = MIRBuilder.buildUndef(DstTy).getReg(0);
    else
      Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
    MIRBuilder.buildCopy(DstReg, Val);
    MI.eraseFromParent();
    return Legalized;
  }

  Register Undef;
  SmallVector<Register, 32> BuildVec;
  LLT EltTy = DstTy.getElementType();

  for (int Idx : Mask) {
    if (Idx < 0) {
      if (!Undef.isValid())
        Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
      BuildVec.push_back(Undef);
      continue;
    }

    if (Src0Ty.isScalar()) {
      BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
    } else {
      int NumElts = Src0Ty.getNumElements();
      Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
      int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
      auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
      auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
      BuildVec.push_back(Extract.getReg(0));
    }
  }

  MIRBuilder.buildBuildVector(DstReg, BuildVec);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
  const auto &MF = *MI.getMF();
  const auto &TFI = *MF.getSubtarget().getFrameLowering();
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
    return UnableToLegalize;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
  auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
  SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);

  // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
  // have to generate an extra instruction to negate the alloc and then use
  // G_PTR_ADD to add the negative offset.
  auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
  if (Alignment > Align(1)) {
    APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
    AlignMask.negate();
    auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
    Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
  }

  SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
  MIRBuilder.buildCopy(SPReg, SPTmp);
  MIRBuilder.buildCopy(Dst, SPTmp);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtract(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  unsigned Offset = MI.getOperand(2).getImm();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  // Extract a sub-vector or one element.
  if (SrcTy.isVector()) {
    unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
    unsigned DstSize = DstTy.getSizeInBits();

    if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
        (Offset + DstSize <= SrcTy.getSizeInBits())) {
      // Unmerge and allow access to each Src element for the artifact combiner.
      auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), Src);

      // Take the element(s) we need and merge them into the result.
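      // (Illustrative example, not from the original source: extracting an
      // s64 at bit offset 32 from <4 x s32> collects unmerged elements 1 and
      // 2, since Idx runs from 32/32 = 1 up to (32+64)/32 = 3.)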
      SmallVector<Register, 8> SubVectorElts;
      for (unsigned Idx = Offset / SrcEltSize;
           Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
        SubVectorElts.push_back(Unmerge.getReg(Idx));
      }
      if (SubVectorElts.size() == 1)
        MIRBuilder.buildCopy(Dst, SubVectorElts[0]);
      else
        MIRBuilder.buildMerge(Dst, SubVectorElts);

      MI.eraseFromParent();
      return Legalized;
    }
  }

  if (DstTy.isScalar() &&
      (SrcTy.isScalar() ||
       (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
    LLT SrcIntTy = SrcTy;
    if (!SrcTy.isScalar()) {
      SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
      Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0);
    }

    if (Offset == 0)
      MIRBuilder.buildTrunc(Dst, Src);
    else {
      auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
      auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt);
      MIRBuilder.buildTrunc(Dst, Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register InsertSrc = MI.getOperand(2).getReg();
  uint64_t Offset = MI.getOperand(3).getImm();

  LLT DstTy = MRI.getType(Src);
  LLT InsertTy = MRI.getType(InsertSrc);

  // Insert a sub-vector or one element.
  if (DstTy.isVector() && !InsertTy.isPointer()) {
    LLT EltTy = DstTy.getElementType();
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned InsertSize = InsertTy.getSizeInBits();

    if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
        (Offset + InsertSize <= DstTy.getSizeInBits())) {
      auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src);
      SmallVector<Register, 8> DstElts;
      unsigned Idx = 0;
      // Elements from Src before the insertion point at Offset.
      for (; Idx < Offset / EltSize; ++Idx) {
        DstElts.push_back(UnmergeSrc.getReg(Idx));
      }

      // Replace elements in Src with elements from InsertSrc.
      if (InsertTy.getSizeInBits() > EltSize) {
        auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc);
        for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
             ++Idx, ++i) {
          DstElts.push_back(UnmergeInsertSrc.getReg(i));
        }
      } else {
        DstElts.push_back(InsertSrc);
        ++Idx;
      }

      // Remaining elements from Src after the insert.
      for (; Idx < DstTy.getNumElements(); ++Idx) {
        DstElts.push_back(UnmergeSrc.getReg(Idx));
      }

      MIRBuilder.buildMerge(Dst, DstElts);
      MI.eraseFromParent();
      return Legalized;
    }
  }

  if (InsertTy.isVector() ||
      (DstTy.isVector() && DstTy.getElementType() != InsertTy))
    return UnableToLegalize;

  const DataLayout &DL = MIRBuilder.getDataLayout();
  if ((DstTy.isPointer() &&
       DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
      (InsertTy.isPointer() &&
       DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
    LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
    return UnableToLegalize;
  }

  LLT IntDstTy = DstTy;

  if (!DstTy.isScalar()) {
    IntDstTy = LLT::scalar(DstTy.getSizeInBits());
    Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
  }

  if (!InsertTy.isScalar()) {
    const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
    InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
  }

  Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
  if (Offset != 0) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
    ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
  }

  APInt MaskVal = APInt::getBitsSetWithWrap(
      DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);

  auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
  auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
  auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);

  MIRBuilder.buildCast(Dst, Or);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;

  LLT Ty = MRI.getType(Dst0);
  LLT BoolTy = MRI.getType(Dst1);

  if (IsAdd)
    MIRBuilder.buildAdd(Dst0, LHS, RHS);
  else
    MIRBuilder.buildSub(Dst0, LHS, RHS);

  // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.

  auto Zero = MIRBuilder.buildConstant(Ty, 0);

  // For an addition, the result should be less than one of the operands (LHS)
  // if and only if the other operand (RHS) is negative, otherwise there will
  // be overflow.
  // For a subtraction, the result should be less than one of the operands
  // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
  // otherwise there will be overflow.
  auto ResultLowerThanLHS =
      MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
  auto ConditionRHS = MIRBuilder.buildICmp(
      IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);

  MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(Res);
  bool IsSigned;
  bool IsAdd;
  unsigned BaseOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  }

  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   hi = 0x7fffffff - smax(a, 0)
    //   lo = 0x80000000 - smin(a, 0)
    //   a + smin(smax(lo, b), hi)
    // ssub.sat(a, b) ->
    //   lo = smax(a, -1) - 0x7fffffff
    //   hi = smin(a, -1) - 0x80000000
    //   a - smin(smax(lo, b), hi)
    // TODO: AMDGPU can use a "median of 3" instruction here:
    //   a +/- med3(lo, b, hi)
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto MaxVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    MachineInstrBuilder Hi, Lo;
    if (IsAdd) {
      auto Zero = MIRBuilder.buildConstant(Ty, 0);
      Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
      Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
    } else {
      auto NegOne = MIRBuilder.buildConstant(Ty, -1);
      Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
                               MaxVal);
      Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
                               MinVal);
    }
    auto RHSClamped =
        MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
  } else {
    // uadd.sat(a, b) -> a + umin(~a, b)
    // usub.sat(a, b) -> a - umin(a, b)
    Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
    auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(Res);
  LLT BoolTy = Ty.changeElementSize(1);
  bool IsSigned;
  bool IsAdd;
  unsigned OverflowOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_UADDO;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_SADDO;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_USUBO;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_SSUBO;
    break;
  }

  auto OverflowRes =
      MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
  Register Tmp = OverflowRes.getReg(0);
  Register Ov = OverflowRes.getReg(1);
  MachineInstrBuilder Clamp;
  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   {tmp, ov} = saddo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : tmp
    // ssub.sat(a, b) ->
    //   {tmp, ov} = ssubo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : tmp
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
    auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
  } else {
    // uadd.sat(a, b) ->
    //   {tmp, ov} = uaddo(a, b)
    //   ov ? 0xffffffff : tmp
    // usub.sat(a, b) ->
    //   {tmp, ov} = usubo(a, b)
    //   ov ? 0 : tmp
    Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
  }
  MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShlSat(MachineInstr &MI) {
  assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
          MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
         "Expected shlsat opcode!");
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(Res);
  LLT BoolTy = Ty.changeElementSize(1);

  unsigned BW = Ty.getScalarSizeInBits();
  auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
  auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
                       : MIRBuilder.buildLShr(Ty, Result, RHS);

  MachineInstrBuilder SatVal;
  if (IsSigned) {
    auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
    auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
    auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
                                    MIRBuilder.buildConstant(Ty, 0));
    SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
  } else {
    SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
  }
  auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
  MIRBuilder.buildSelect(Res, Ov, SatVal, Result);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBswap(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  const LLT Ty = MRI.getType(Src);
  unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
  unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;

  // Swap most and least significant byte, set remaining bytes in Res to zero.
  auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
  auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
  auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
  auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);

  // Set i-th high/low byte in Res to i-th low/high byte from Src.
  for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
    // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
    APInt APMask(SizeInBytes * 8, 0xFFULL << (i * 8));
    auto Mask = MIRBuilder.buildConstant(Ty, APMask);
    auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
    // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
    auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
    auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
    Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
    // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
    auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
    auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
    Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
  }
  Res.getInstr()->getOperand(0).setReg(Dst);

  MI.eraseFromParent();
  return Legalized;
}

//{ (Src & Mask) >> N } | { (Src << N) & Mask }
static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
                                 MachineInstrBuilder Src, APInt Mask) {
  const LLT Ty = Dst.getLLTTy(*B.getMRI());
  MachineInstrBuilder C_N = B.buildConstant(Ty, N);
  MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
  auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
  auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
  return B.buildOr(Dst, LHS, RHS);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  const LLT Ty = MRI.getType(Src);
  unsigned Size = Ty.getSizeInBits();

  MachineInstrBuilder BSWAP =
      MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});

  // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
  //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
  // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
  MachineInstrBuilder Swap4 =
      SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));

  // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
  //    [(val & 0xCCCCCCCC) >> 2] | [(val & 0x33333333) << 2]
  // -> [(val & 0xCCCCCCCC) >> 2] | [(val << 2) & 0xCCCCCCCC]
  MachineInstrBuilder Swap2 =
      SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));

  // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
  //    [(val & 0xAAAAAAAA) >> 1] | [(val & 0x55555555) << 1]
  // -> [(val & 0xAAAAAAAA) >> 1] | [(val << 1) & 0xAAAAAAAA]
  SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
  MachineFunction &MF = MIRBuilder.getMF();

  bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
  int NameOpIdx = IsRead ? 1 : 0;
  int ValRegIndex = IsRead ? 0 : 1;

  Register ValReg = MI.getOperand(ValRegIndex).getReg();
  const LLT Ty = MRI.getType(ValReg);
  const MDString *RegStr = cast<MDString>(
      cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));

  Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
  if (!PhysReg.isValid())
    return UnableToLegalize;

  if (IsRead)
    MIRBuilder.buildCopy(ValReg, PhysReg);
  else
    MIRBuilder.buildCopy(PhysReg, ValReg);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
  unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  Register Result = MI.getOperand(0).getReg();
  LLT OrigTy = MRI.getType(Result);
  auto SizeInBits = OrigTy.getScalarSizeInBits();
  LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);

  auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
  auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
  auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
  unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;

  auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
  auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
  MIRBuilder.buildTrunc(Result, Shifted);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
  // Implement vector G_SELECT in terms of XOR, AND, OR.
  Register DstReg = MI.getOperand(0).getReg();
  Register MaskReg = MI.getOperand(1).getReg();
  Register Op1Reg = MI.getOperand(2).getReg();
  Register Op2Reg = MI.getOperand(3).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT MaskTy = MRI.getType(MaskReg);
  LLT Op1Ty = MRI.getType(Op1Reg);
  if (!DstTy.isVector())
    return UnableToLegalize;

  // Vector selects can have a scalar predicate. If so, splat it into a vector
  // and return, so that later legalization attempts can try again.
  if (MaskTy.isScalar()) {
    Register MaskElt = MaskReg;
    if (MaskTy.getSizeInBits() < DstTy.getScalarSizeInBits())
      MaskElt = MIRBuilder.buildSExt(DstTy.getElementType(), MaskElt).getReg(0);
    // Generate a vector splat idiom to be pattern matched later.
    auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(ShufSplat.getReg(0));
    Observer.changedInstr(MI);
    return Legalized;
  }

  if (MaskTy.getSizeInBits() != Op1Ty.getSizeInBits()) {
    return UnableToLegalize;
  }

  auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
  auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
  auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
  MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
  // Split DIVREM into individual instructions.
  unsigned Opcode = MI.getOpcode();

  MIRBuilder.buildInstr(
      Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
                                        : TargetOpcode::G_UDIV,
      {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
  MIRBuilder.buildInstr(
      Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
                                        : TargetOpcode::G_UREM,
      {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
  // Expand %res = G_ABS %a into:
  // %v1 = G_ASHR %a, scalar_size-1
  // %v2 = G_ADD %a, %v1
  // %res = G_XOR %v2, %v1
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  Register OpReg = MI.getOperand(1).getReg();
  auto ShiftAmt =
      MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
  auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
  auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
  MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
  // Expand %res = G_ABS %a into:
  // %v1 = G_CONSTANT 0
  // %v2 = G_SUB %v1, %a
  // %res = G_SMAX %a, %v2
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(SrcReg);
  auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
  auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
  MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
  Register SrcReg = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

  // The source could be a scalar if the IR type was <1 x sN>.
  if (SrcTy.isScalar()) {
    if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
      return UnableToLegalize; // FIXME: handle extension.
    // This can be just a plain copy.
    Observer.changingInstr(MI);
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
    Observer.changedInstr(MI);
    return Legalized;
  }
  return UnableToLegalize;
}

static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
  // On Darwin, -Os means optimize for size without hurting performance, so
  // only really optimize for size when -Oz (MinSize) is used.
  if (MF.getTarget().getTargetTriple().isOSDarwin())
    return MF.getFunction().hasMinSize();
  return MF.getFunction().hasOptSize();
}

// Returns a list of types to use for memory op lowering in MemOps. A partial
// port of findOptimalMemOpLowering in TargetLowering.
static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
                                          unsigned Limit, const MemOp &Op,
                                          unsigned DstAS, unsigned SrcAS,
                                          const AttributeList &FuncAttributes,
                                          const TargetLowering &TLI) {
  if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
    return false;

  LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);

  if (Ty == LLT()) {
    // Use the largest scalar type whose alignment constraints are satisfied.
    // We only need to check DstAlign here as SrcAlign is always greater or
    // equal to DstAlign (or zero).
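    // (Illustrative, not from the original source: with a fixed destination
    // alignment of 4 on a target that rejects misaligned s64 accesses, the
    // s64 candidate (8 bytes) fails the check below and a smaller scalar is
    // used instead.)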
    Ty = LLT::scalar(64);
    if (Op.isFixedDstAlign())
      while (Op.getDstAlign() < Ty.getSizeInBytes() &&
             !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
        Ty = LLT::scalar(Ty.getSizeInBytes());
    assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
    // FIXME: check for the largest legal type we can load/store to.
  }

  unsigned NumMemOps = 0;
  uint64_t Size = Op.size();
  while (Size) {
    unsigned TySize = Ty.getSizeInBytes();
    while (TySize > Size) {
      // For now, only use non-vector loads/stores for the left-over pieces.
      LLT NewTy = Ty;
      // FIXME: check for mem op safety and legality of the types. Not all of
      // SDAGisms map cleanly to GISel concepts.
      if (NewTy.isVector())
        NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
      NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
      unsigned NewTySize = NewTy.getSizeInBytes();
      assert(NewTySize > 0 && "Could not find appropriate type");

      // If the new LLT cannot cover all of the remaining bits, then consider
      // issuing a (or a pair of) unaligned and overlapping load / store.
      bool Fast;
      // Need to get a VT equivalent for allowsMisalignedMemoryAccesses().
      MVT VT = getMVTForLLT(Ty);
      if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
          TLI.allowsMisalignedMemoryAccesses(
              VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
              MachineMemOperand::MONone, &Fast) &&
          Fast)
        TySize = Size;
      else {
        Ty = NewTy;
        TySize = NewTySize;
      }
    }

    if (++NumMemOps > Limit)
      return false;

    MemOps.push_back(Ty);
    Size -= TySize;
  }

  return true;
}

static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
  if (Ty.isVector())
    return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
                                Ty.getNumElements());
  return IntegerType::get(C, Ty.getSizeInBits());
}

// Get a vectorized representation of the memset value operand, GISel edition.
static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  unsigned NumBits = Ty.getScalarSizeInBits();
  auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
  if (!Ty.isVector() && ValVRegAndVal) {
    APInt Scalar = ValVRegAndVal->Value.truncOrSelf(8);
    APInt SplatVal = APInt::getSplat(NumBits, Scalar);
    return MIB.buildConstant(Ty, SplatVal).getReg(0);
  }

  // Extend the byte value to the larger type, and then multiply by a magic
  // value 0x010101... in order to replicate it across every byte.
  // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
  if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
    return MIB.buildConstant(Ty, 0).getReg(0);
  }

  LLT ExtType = Ty.getScalarType();
  auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
  if (NumBits > 8) {
    APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
    auto MagicMI = MIB.buildConstant(ExtType, Magic);
    Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
  }

  // For vector types create a G_BUILD_VECTOR.
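  // (Illustrative, not from the original source: replicating the byte value
  // 0xAB into an s32 element via the multiply above gives
  // 0xAB * 0x01010101 = 0xABABABAB; for a vector destination that splat
  // value then becomes the operand of the build-vector below.)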
  if (Ty.isVector())
    Val = MIB.buildSplatVector(Ty, Val).getReg(0);

  return Val;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
                             uint64_t KnownLen, Align Alignment,
                             bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memset!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();

  auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
  bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;

  if (!findGISelOptimalMemOpLowering(MemOps, Limit,
                                     MemOp::Set(KnownLen, DstAlignCanChange,
                                                Alignment,
                                                /*IsZeroMemset=*/IsZeroVal,
                                                /*IsVolatile=*/IsVolatile),
                                     DstPtrInfo.getAddrSpace(), ~0u,
                                     MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);
    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  MachineIRBuilder MIB(MI);
  // Find the largest store and generate the bit pattern for it.
  LLT LargestTy = MemOps[0];
  for (unsigned i = 1; i < MemOps.size(); i++)
    if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
      LargestTy = MemOps[i];

  // The memset stored value is always defined as an s8, so in order to make it
  // work with larger store types we need to repeat the bit pattern across the
  // wider type.
  Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);

  if (!MemSetValue)
    return UnableToLegalize;

  // Generate the stores. For each store type in the list, we generate the
  // matching store of that type to the destination address.
  LLT PtrTy = MRI.getType(Dst);
  unsigned DstOff = 0;
  unsigned Size = KnownLen;
  for (unsigned I = 0; I < MemOps.size(); I++) {
    LLT Ty = MemOps[I];
    unsigned TySize = Ty.getSizeInBytes();
    if (TySize > Size) {
      // Issuing an unaligned load / store pair that overlaps with the previous
      // pair. Adjust the offset accordingly.
      assert(I == MemOps.size() - 1 && I != 0);
      DstOff -= TySize - Size;
    }

    // If this store is smaller than the largest store, see whether we can get
    // the smaller value for free with a truncate.
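    // (Illustrative, not from the original source: if the largest type is an
    // s64 holding 0xABABABABABABABAB, a trailing s16 store can reuse it via a
    // free truncate to 0xABAB instead of rematerializing the pattern.)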
    Register Value = MemSetValue;
    if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
      MVT VT = getMVTForLLT(Ty);
      MVT LargestVT = getMVTForLLT(LargestTy);
      if (!LargestTy.isVector() && !Ty.isVector() &&
          TLI.isTruncateFree(LargestVT, VT))
        Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
      else
        Value = getMemsetValue(Val, Ty, MIB);
      if (!Value)
        return UnableToLegalize;
    }

    auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);

    Register Ptr = Dst;
    if (DstOff != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
      Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
    }

    MIB.buildStore(Value, Ptr, *StoreMMO);
    DstOff += Ty.getSizeInBytes();
    Size -= TySize;
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Len = MI.getOperand(2).getReg();

  const auto *MMOIt = MI.memoperands_begin();
  const MachineMemOperand *MemOp = *MMOIt;
  bool IsVolatile = MemOp->isVolatile();

  // See if this is a constant length copy.
  auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
  // FIXME: support dynamically sized G_MEMCPY_INLINE
  assert(LenVRegAndVal.hasValue() &&
         "inline memcpy with dynamic size is not yet supported");
  uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
  if (KnownLen == 0) {
    MI.eraseFromParent();
    return Legalized;
  }

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  Align DstAlign = DstMMO.getBaseAlign();
  Align SrcAlign = SrcMMO.getBaseAlign();

  return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                           IsVolatile);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
                                   uint64_t KnownLen, Align DstAlign,
                                   Align SrcAlign, bool IsVolatile) {
  assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
  return lowerMemcpy(MI, Dst, Src, KnownLen,
                     std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
                     IsVolatile);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
                             uint64_t KnownLen, uint64_t Limit, Align DstAlign,
                             Align SrcAlign, bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memcpy!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  Align Alignment = commonAlignment(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  // FIXME: infer better src pointer alignment like SelectionDAG does here.
  // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
  // if the memcpy is in a tail call position.
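  // (Illustrative, not from the original source: on a target whose hooks
  // settle on s64 and disallow overlapping accesses, a 13-byte memcpy
  // typically yields MemOps = {s64, s32, s8}, covering 8 + 4 + 1 = 13 bytes.)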
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
                      IsVolatile),
          DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
          MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
        NewAlign = NewAlign / 2;

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Now we need to emit a pair of load and store for each of the types we've
  // collected. I.e. for each type, generate a load from the source pointer of
  // that type width, and then generate a corresponding store of the loaded
  // value to the dest buffer. This can result in a sequence of loads and
  // stores of mixed types, depending on what the target specifies as good
  // types to use.
  unsigned CurrOffset = 0;
  unsigned Size = KnownLen;
  for (auto CopyTy : MemOps) {
    // Issuing an unaligned load / store pair that overlaps with the previous
    // pair. Adjust the offset accordingly.
    if (CopyTy.getSizeInBytes() > Size)
      CurrOffset -= CopyTy.getSizeInBytes() - Size;

    // Construct MMOs for the accesses.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    Register Offset;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Src);
      Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
                   .getReg(0);
      LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
    }
    auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);

    // Create the store.
    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Dst);
      StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LdVal, StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
    Size -= CopyTy.getSizeInBytes();
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
                              uint64_t KnownLen, Align DstAlign, Align SrcAlign,
                              bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memmove!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);
  Align Alignment = commonAlignment(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in its findOptimalMemOpLowering implementation. For now do the
  // same thing here.
  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
                      /*IsVolatile*/ true),
          DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
          MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
        NewAlign = NewAlign / 2;

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Memmove requires that we perform the loads first before issuing the stores.
  // Apart from that, this loop is pretty much doing the same thing as the
  // memcpy codegen function.
  unsigned CurrOffset = 0;
  SmallVector<Register, 16> LoadVals;
  for (auto CopyTy : MemOps) {
    // Construct MMO for the load.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Src);
      auto Offset =
          MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
      LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
    }
    LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
    CurrOffset += CopyTy.getSizeInBytes();
  }

  CurrOffset = 0;
  for (unsigned I = 0; I < MemOps.size(); ++I) {
    LLT CopyTy = MemOps[I];
    // Now store the values loaded.
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Dst);
      auto Offset =
          MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
      StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
  }
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
  const unsigned Opc = MI.getOpcode();
  // This combine is fairly complex so it's not written with a separate
  // matcher function.
  assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
          Opc == TargetOpcode::G_MEMSET) &&
         "Expected memcpy-like instruction");

  auto MMOIt = MI.memoperands_begin();
  const MachineMemOperand *MemOp = *MMOIt;

  Align DstAlign = MemOp->getBaseAlign();
  Align SrcAlign;
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Len = MI.getOperand(2).getReg();

  if (Opc != TargetOpcode::G_MEMSET) {
    assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
    MemOp = *(++MMOIt);
    SrcAlign = MemOp->getBaseAlign();
  }

  // See if this is a constant length copy.
  auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
  if (!LenVRegAndVal)
    return UnableToLegalize;
  uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();

  if (KnownLen == 0) {
    MI.eraseFromParent();
    return Legalized;
  }

  bool IsVolatile = MemOp->isVolatile();
  if (Opc == TargetOpcode::G_MEMCPY_INLINE)
    return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                             IsVolatile);

  // Don't try to optimize volatile.
  if (IsVolatile)
    return UnableToLegalize;

  if (MaxLen && KnownLen > MaxLen)
    return UnableToLegalize;

  if (Opc == TargetOpcode::G_MEMCPY) {
    auto &MF = *MI.getParent()->getParent();
    const auto &TLI = *MF.getSubtarget().getTargetLowering();
    bool OptSize = shouldLowerMemFuncForSize(MF);
    uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
    return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
                       IsVolatile);
  }
  if (Opc == TargetOpcode::G_MEMMOVE)
    return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
  if (Opc == TargetOpcode::G_MEMSET)
    return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
  return UnableToLegalize;
}