//===-- lib/CodeGen/GlobalISel/CombinerHelper.cpp -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "gi-combiner"

using namespace llvm;

CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
                               MachineIRBuilder &B, GISelKnownBits *KB)
    : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer),
      KB(KB) {
  (void)this->KB;
}

void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
                                    Register ToReg) const {
  Observer.changingAllUsesOfReg(MRI, FromReg);

  if (MRI.constrainRegAttrs(ToReg, FromReg))
    MRI.replaceRegWith(FromReg, ToReg);
  else
    Builder.buildCopy(ToReg, FromReg);

  Observer.finishedChangingAllUsesOfReg();
}

void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI,
                                      MachineOperand &FromRegOp,
                                      Register ToReg) const {
  assert(FromRegOp.getParent() && "Expected an operand in an MI");
  Observer.changingInstr(*FromRegOp.getParent());

  FromRegOp.setReg(ToReg);

  Observer.changedInstr(*FromRegOp.getParent());
}

bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
  if (matchCombineCopy(MI)) {
    applyCombineCopy(MI);
    return true;
  }
  return false;
}
bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
  if (MI.getOpcode() != TargetOpcode::COPY)
    return false;
  unsigned DstReg = MI.getOperand(0).getReg();
  unsigned SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  // Simple Copy Propagation.
  // a(sx) = COPY b(sx) -> Replace all uses of a with b.
  if (DstTy.isValid() && SrcTy.isValid() && DstTy == SrcTy)
    return true;
  return false;
}
void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
  unsigned DstReg = MI.getOperand(0).getReg();
  unsigned SrcReg = MI.getOperand(1).getReg();
  MI.eraseFromParent();
  replaceRegWith(MRI, DstReg, SrcReg);
}

namespace {

/// Select a preference between two uses. CurrentUse is the current preference
/// while *ForCandidate are the attributes of the candidate under
/// consideration.
PreferredTuple ChoosePreferredUse(PreferredTuple &CurrentUse,
                                  const LLT &TyForCandidate,
                                  unsigned OpcodeForCandidate,
                                  MachineInstr *MIForCandidate) {
  if (!CurrentUse.Ty.isValid()) {
    if (CurrentUse.ExtendOpcode == OpcodeForCandidate ||
        CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT)
      return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
    return CurrentUse;
  }

  // We permit the extend to hoist through basic blocks but this is only
  // sensible if the target has extending loads. If you end up lowering back
  // into a load and extend during the legalizer then the end result is
  // hoisting the extend up to the load.

  // Prefer defined extensions to undefined extensions as these are more
  // likely to reduce the number of instructions.
  if (OpcodeForCandidate == TargetOpcode::G_ANYEXT &&
      CurrentUse.ExtendOpcode != TargetOpcode::G_ANYEXT)
    return CurrentUse;
  else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT &&
           OpcodeForCandidate != TargetOpcode::G_ANYEXT)
    return {TyForCandidate, OpcodeForCandidate, MIForCandidate};

  // Prefer sign extensions to zero extensions as sign-extensions tend to be
  // more expensive, so folding the more expensive extend into the load is the
  // bigger win.
  if (CurrentUse.Ty == TyForCandidate) {
    if (CurrentUse.ExtendOpcode == TargetOpcode::G_SEXT &&
        OpcodeForCandidate == TargetOpcode::G_ZEXT)
      return CurrentUse;
    else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ZEXT &&
             OpcodeForCandidate == TargetOpcode::G_SEXT)
      return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
  }

  // This is potentially target specific. We've chosen the largest type
  // because G_TRUNC is usually free. One potential catch with this is that
  // some targets have a reduced number of larger registers than smaller
  // registers and this choice potentially increases the live-range for the
  // larger value.
  if (TyForCandidate.getSizeInBits() > CurrentUse.Ty.getSizeInBits()) {
    return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
  }
  return CurrentUse;
}

/// Find a suitable place to insert some instructions and insert them. This
/// function accounts for special cases like inserting before a PHI node.
/// The current strategy for inserting before PHIs is to duplicate the
/// instructions for each predecessor. However, while that's ok for G_TRUNC
/// on most targets since it generally requires no code, other targets/cases
/// may want to try harder to find a dominating block.
static void InsertInsnsWithoutSideEffectsBeforeUse(
    MachineIRBuilder &Builder, MachineInstr &DefMI, MachineOperand &UseMO,
    std::function<void(MachineBasicBlock *, MachineBasicBlock::iterator,
                       MachineOperand &UseMO)>
        Inserter) {
  MachineInstr &UseMI = *UseMO.getParent();

  MachineBasicBlock *InsertBB = UseMI.getParent();

  // If the use is a PHI then we want the predecessor block instead.
  if (UseMI.isPHI()) {
    MachineOperand *PredBB = std::next(&UseMO);
    InsertBB = PredBB->getMBB();
  }

  // If the block is the same block as the def then we want to insert just
  // after the def instead of at the start of the block.
  if (InsertBB == DefMI.getParent()) {
    MachineBasicBlock::iterator InsertPt = &DefMI;
    Inserter(InsertBB, std::next(InsertPt), UseMO);
    return;
  }

  // Otherwise we want the start of the BB.
  Inserter(InsertBB, InsertBB->getFirstNonPHI(), UseMO);
}
} // end anonymous namespace

bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
  PreferredTuple Preferred;
  if (matchCombineExtendingLoads(MI, Preferred)) {
    applyCombineExtendingLoads(MI, Preferred);
    return true;
  }
  return false;
}

bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI,
                                                PreferredTuple &Preferred) {
  // We match the loads and follow the uses to the extend instead of matching
  // the extends and following the def to the load. This is because the load
  // must remain in the same position for correctness (unless we also add code
  // to find a safe place to sink it) whereas the extend is freely movable.
  // It also prevents us from duplicating the load for the volatile case or
  // just for performance.

  if (MI.getOpcode() != TargetOpcode::G_LOAD &&
      MI.getOpcode() != TargetOpcode::G_SEXTLOAD &&
      MI.getOpcode() != TargetOpcode::G_ZEXTLOAD)
    return false;

  auto &LoadValue = MI.getOperand(0);
  assert(LoadValue.isReg() && "Result wasn't a register?");

  LLT LoadValueTy = MRI.getType(LoadValue.getReg());
  if (!LoadValueTy.isScalar())
    return false;

  // Most architectures are going to legalize <s8 loads into at least a 1 byte
  // load, and the MMOs can only describe memory accesses in multiples of
  // bytes. If we try to perform extload combining on those, we can end up
  // with
  //   %a(s8) = extload %ptr (load 1 byte from %ptr)
  // ... which is an illegal extload instruction.
  if (LoadValueTy.getSizeInBits() < 8)
    return false;

  // Non-power-of-2 types will very likely be legalized into multiple loads.
  // Don't bother trying to match them into extending loads.
  if (!isPowerOf2_32(LoadValueTy.getSizeInBits()))
    return false;

  // Find the preferred type aside from the any-extends (unless it's the only
  // one) and non-extending ops. We'll emit an extending load to that type and
  // emit a variant of (extend (trunc X)) for the others according to the
  // relative type sizes. At the same time, pick an extend to use based on the
  // extend involved in the chosen type.
  unsigned PreferredOpcode = MI.getOpcode() == TargetOpcode::G_LOAD
                                 ? TargetOpcode::G_ANYEXT
                                 : MI.getOpcode() == TargetOpcode::G_SEXTLOAD
                                       ? TargetOpcode::G_SEXT
                                       : TargetOpcode::G_ZEXT;
  Preferred = {LLT(), PreferredOpcode, nullptr};
  for (auto &UseMI : MRI.use_instructions(LoadValue.getReg())) {
    if (UseMI.getOpcode() == TargetOpcode::G_SEXT ||
        UseMI.getOpcode() == TargetOpcode::G_ZEXT ||
        UseMI.getOpcode() == TargetOpcode::G_ANYEXT) {
      Preferred = ChoosePreferredUse(Preferred,
                                     MRI.getType(UseMI.getOperand(0).getReg()),
                                     UseMI.getOpcode(), &UseMI);
    }
  }

  // There were no extends.
  if (!Preferred.MI)
    return false;
  // It should be impossible to choose an extend without selecting a different
  // type since by definition the result of an extend is larger.
  assert(Preferred.Ty != LoadValueTy && "Extending to same type?");

  LLVM_DEBUG(dbgs() << "Preferred use is: " << *Preferred.MI);
  return true;
}

void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
                                                PreferredTuple &Preferred) {
  // Rewrite the load to the chosen extending load.
  Register ChosenDstReg = Preferred.MI->getOperand(0).getReg();

  // Inserter to insert a truncate back to the original type at a given point
  // with some basic CSE to limit truncate duplication to one per BB.
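  // For example, if two non-extend users of the load live in the same block,
  // only one G_TRUNC back to the original load type is emitted there and both
  // users are rewritten to read the truncated value.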
  DenseMap<MachineBasicBlock *, MachineInstr *> EmittedInsns;
  auto InsertTruncAt = [&](MachineBasicBlock *InsertIntoBB,
                           MachineBasicBlock::iterator InsertBefore,
                           MachineOperand &UseMO) {
    MachineInstr *PreviouslyEmitted = EmittedInsns.lookup(InsertIntoBB);
    if (PreviouslyEmitted) {
      Observer.changingInstr(*UseMO.getParent());
      UseMO.setReg(PreviouslyEmitted->getOperand(0).getReg());
      Observer.changedInstr(*UseMO.getParent());
      return;
    }

    Builder.setInsertPt(*InsertIntoBB, InsertBefore);
    Register NewDstReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
    MachineInstr *NewMI = Builder.buildTrunc(NewDstReg, ChosenDstReg);
    EmittedInsns[InsertIntoBB] = NewMI;
    replaceRegOpWith(MRI, UseMO, NewDstReg);
  };

  Observer.changingInstr(MI);
  MI.setDesc(
      Builder.getTII().get(Preferred.ExtendOpcode == TargetOpcode::G_SEXT
                               ? TargetOpcode::G_SEXTLOAD
                               : Preferred.ExtendOpcode == TargetOpcode::G_ZEXT
                                     ? TargetOpcode::G_ZEXTLOAD
                                     : TargetOpcode::G_LOAD));

  // Rewrite all the uses to fix up the types.
  auto &LoadValue = MI.getOperand(0);
  SmallVector<MachineOperand *, 4> Uses;
  for (auto &UseMO : MRI.use_operands(LoadValue.getReg()))
    Uses.push_back(&UseMO);

  for (auto *UseMO : Uses) {
    MachineInstr *UseMI = UseMO->getParent();

    // If the extend is compatible with the preferred extend then we should fix
    // up the type and extend so that it uses the preferred use.
    if (UseMI->getOpcode() == Preferred.ExtendOpcode ||
        UseMI->getOpcode() == TargetOpcode::G_ANYEXT) {
      unsigned UseDstReg = UseMI->getOperand(0).getReg();
      MachineOperand &UseSrcMO = UseMI->getOperand(1);
      const LLT &UseDstTy = MRI.getType(UseDstReg);
      if (UseDstReg != ChosenDstReg) {
        if (Preferred.Ty == UseDstTy) {
          // If the use has the same type as the preferred use, then merge
          // the vregs and erase the extend. For example:
          //   %1:_(s8) = G_LOAD ...
          //   %2:_(s32) = G_SEXT %1(s8)
          //   %3:_(s32) = G_ANYEXT %1(s8)
          //   ... = ... %3(s32)
          // rewrites to:
          //   %2:_(s32) = G_SEXTLOAD ...
          //   ... = ... %2(s32)
          replaceRegWith(MRI, UseDstReg, ChosenDstReg);
          Observer.erasingInstr(*UseMO->getParent());
          UseMO->getParent()->eraseFromParent();
        } else if (Preferred.Ty.getSizeInBits() < UseDstTy.getSizeInBits()) {
          // If the preferred size is smaller, then keep the extend but extend
          // from the result of the extending load. For example:
          //   %1:_(s8) = G_LOAD ...
          //   %2:_(s32) = G_SEXT %1(s8)
          //   %3:_(s64) = G_ANYEXT %1(s8)
          //   ... = ... %3(s64)
          // rewrites to:
          //   %2:_(s32) = G_SEXTLOAD ...
          //   %3:_(s64) = G_ANYEXT %2:_(s32)
          //   ... = ... %3(s64)
          replaceRegOpWith(MRI, UseSrcMO, ChosenDstReg);
        } else {
          // If the preferred size is larger, then insert a truncate. For
          // example:
          //   %1:_(s8) = G_LOAD ...
          //   %2:_(s64) = G_SEXT %1(s8)
          //   %3:_(s32) = G_ZEXT %1(s8)
          //   ... = ... %3(s32)
          // rewrites to:
          //   %2:_(s64) = G_SEXTLOAD ...
          //   %4:_(s8) = G_TRUNC %2:_(s64)
          //   %3:_(s32) = G_ZEXT %4:_(s8)
          //   ... = ... %3(s32)
          InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO,
                                                 InsertTruncAt);
        }
        continue;
      }
      // The use is the extend we chose as the preferred use earlier. The load
      // is about to be rewritten to def this value directly, so just erase the
      // old extend.
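      // In the examples above this is the G_SEXT that defines %2: once the
      // load is rewritten to a G_SEXTLOAD defining %2 directly, that extend
      // has nothing left to do.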
      Observer.erasingInstr(*UseMO->getParent());
      UseMO->getParent()->eraseFromParent();
      continue;
    }

    // The use isn't an extend. Truncate back to the type we originally loaded.
    // This is free on many targets.
    InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO, InsertTruncAt);
  }

  MI.getOperand(0).setReg(ChosenDstReg);
  Observer.changedInstr(MI);
}

bool CombinerHelper::matchCombineBr(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_BR && "Expected a G_BR");
  // Try to match the following:
  // bb1:
  //   %c(s32) = G_ICMP pred, %a, %b
  //   %c1(s1) = G_TRUNC %c(s32)
  //   G_BRCOND %c1, %bb2
  //   G_BR %bb3
  // bb2:
  //   ...
  // bb3:

  // The above pattern does not have a fall-through to the successor bb2,
  // always resulting in a branch no matter which path is taken. Here we try
  // to find and replace that pattern with a conditional branch to bb3,
  // falling through to bb2 otherwise.

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator BrIt(MI);
  if (BrIt == MBB->begin())
    return false;
  assert(std::next(BrIt) == MBB->end() && "expected G_BR to be a terminator");

  MachineInstr *BrCond = &*std::prev(BrIt);
  if (BrCond->getOpcode() != TargetOpcode::G_BRCOND)
    return false;

  // Check that the next block is the conditional branch target.
  if (!MBB->isLayoutSuccessor(BrCond->getOperand(1).getMBB()))
    return false;

  MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
  if (!CmpMI || CmpMI->getOpcode() != TargetOpcode::G_ICMP ||
      !MRI.hasOneUse(CmpMI->getOperand(0).getReg()))
    return false;
  return true;
}

bool CombinerHelper::tryCombineBr(MachineInstr &MI) {
  if (!matchCombineBr(MI))
    return false;
  MachineBasicBlock *BrTarget = MI.getOperand(0).getMBB();
  MachineBasicBlock::iterator BrIt(MI);
  MachineInstr *BrCond = &*std::prev(BrIt);
  MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());

  CmpInst::Predicate InversePred = CmpInst::getInversePredicate(
      (CmpInst::Predicate)CmpMI->getOperand(1).getPredicate());

  // Invert the G_ICMP condition.
  Observer.changingInstr(*CmpMI);
  CmpMI->getOperand(1).setPredicate(InversePred);
  Observer.changedInstr(*CmpMI);

  // Change the conditional branch target.
  Observer.changingInstr(*BrCond);
  BrCond->getOperand(1).setMBB(BrTarget);
  Observer.changedInstr(*BrCond);
  MI.eraseFromParent();
  return true;
}

static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
  // On Darwin, -Os means optimize for size without hurting performance, so
  // only really optimize for size when -Oz (MinSize) is used.
  if (MF.getTarget().getTargetTriple().isOSDarwin())
    return MF.getFunction().hasMinSize();
  return MF.getFunction().hasOptSize();
}

// Get a rough equivalent of an MVT for a given LLT.
static MVT getMVTForLLT(LLT Ty) {
  if (!Ty.isVector())
    return MVT::getIntegerVT(Ty.getSizeInBits());

  return MVT::getVectorVT(
      MVT::getIntegerVT(Ty.getElementType().getSizeInBits()),
      Ty.getNumElements());
}

// Returns a list of types to use for memory op lowering in MemOps. A partial
// port of findOptimalMemOpLowering in TargetLowering.
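// For example, an 11-byte copy whose widest usable type is s64 might produce
// MemOps = {s64, s16, s8}, or {s64, s32} with the last access overlapping the
// first by one byte when the target reports fast misaligned accesses.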
static bool findGISelOptimalMemOpLowering(
    std::vector<LLT> &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign,
    unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
    bool AllowOverlap, unsigned DstAS, unsigned SrcAS,
    const AttributeList &FuncAttributes, const TargetLowering &TLI) {
  // If 'SrcAlign' is zero, that means the memory operation does not need to
  // load the value, i.e. memset or memcpy from constant string. Otherwise,
  // it's the inferred alignment of the source. 'DstAlign', on the other hand,
  // is the specified alignment of the memory operation. If it is zero, that
  // means it's possible to change the alignment of the destination.
  // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
  // not need to be loaded.
  if (SrcAlign != 0 && SrcAlign < DstAlign)
    return false;

  LLT Ty = TLI.getOptimalMemOpLLT(Size, DstAlign, SrcAlign, IsMemset,
                                  ZeroMemset, MemcpyStrSrc, FuncAttributes);

  if (Ty == LLT()) {
    // Use the largest scalar type whose alignment constraints are satisfied.
    // We only need to check DstAlign here as SrcAlign is always greater or
    // equal to DstAlign (or zero).
    Ty = LLT::scalar(64);
    while (DstAlign && DstAlign < Ty.getSizeInBytes() &&
           !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, DstAlign))
      Ty = LLT::scalar(Ty.getSizeInBytes());
    assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
    // FIXME: check for the largest legal type we can load/store to.
  }

  unsigned NumMemOps = 0;
  while (Size != 0) {
    unsigned TySize = Ty.getSizeInBytes();
    while (TySize > Size) {
      // For now, only use non-vector loads/stores for the left-over pieces.
      LLT NewTy = Ty;
      // FIXME: check for mem op safety and legality of the types. Not all of
      // SDAGisms map cleanly to GISel concepts.
      if (NewTy.isVector())
        NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
      unsigned NewTySize = NewTy.getSizeInBytes();

      NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
      NewTySize = NewTy.getSizeInBytes();
      assert(NewTySize > 0 && "Could not find appropriate type");

      // If the new LLT cannot cover all of the remaining bits, then consider
      // issuing a (or a pair of) unaligned and overlapping load / store.
      bool Fast;
      // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
      MVT VT = getMVTForLLT(Ty);
      if (NumMemOps && AllowOverlap && NewTySize < Size &&
          TLI.allowsMisalignedMemoryAccesses(
              VT, DstAS, DstAlign, MachineMemOperand::MONone, &Fast) &&
          Fast)
        TySize = Size;
      else {
        Ty = NewTy;
        TySize = NewTySize;
      }
    }

    if (++NumMemOps > Limit)
      return false;

    MemOps.push_back(Ty);
    Size -= TySize;
  }

  return true;
}

static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
  if (Ty.isVector())
    return VectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
                           Ty.getNumElements());
  return IntegerType::get(C, Ty.getSizeInBits());
}

// Get a vectorized representation of the memset value operand, GISel edition.
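// For a compile-time-known byte value this directly builds the splatted
// constant, e.g. 0xAB splatted to s32 is 0xABABABAB. For an unknown value the
// byte is zero-extended and multiplied by the 0x0101... magic constant to get
// the same replication at runtime (0xAB * 0x01010101 == 0xABABABAB).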
static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  unsigned NumBits = Ty.getScalarSizeInBits();
  auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
  if (!Ty.isVector() && ValVRegAndVal) {
    unsigned KnownVal = ValVRegAndVal->Value;
    APInt Scalar = APInt(8, KnownVal);
    APInt SplatVal = APInt::getSplat(NumBits, Scalar);
    return MIB.buildConstant(Ty, SplatVal).getReg(0);
  }
  // FIXME: for vector types create a G_BUILD_VECTOR.
  if (Ty.isVector())
    return Register();

  // Extend the byte value to the larger type, and then multiply by a magic
  // value 0x010101... in order to replicate it across every byte.
  LLT ExtType = Ty.getScalarType();
  auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
  if (NumBits > 8) {
    APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
    auto MagicMI = MIB.buildConstant(ExtType, Magic);
    Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
  }

  assert(ExtType == Ty && "Vector memset value type not supported yet");
  return Val;
}

bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst,
                                    Register Val, unsigned KnownLen,
                                    unsigned Align, bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memset length!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();

  auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
  bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;

  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), 0,
          /*IsMemset=*/true,
          /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false,
          /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(), ~0u,
          MF.getFunction().getAttributes(), TLI))
    return false;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
    if (NewAlign > Align) {
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlignment(FI) < NewAlign)
        MFI.setObjectAlignment(FI, NewAlign);
      Align = NewAlign;
    }
  }

  MachineIRBuilder MIB(MI);
  // Find the largest store and generate the bit pattern for it.
  LLT LargestTy = MemOps[0];
  for (unsigned i = 1; i < MemOps.size(); i++)
    if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
      LargestTy = MemOps[i];

  // The memset stored value is always defined as an s8, so in order to make
  // it work with larger store types we need to repeat the bit pattern across
  // the wider type.
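  // For instance, a 16-byte memset of the byte 0xAB lowered as two s64 stores
  // would store the s64 constant 0xABABABABABABABAB twice.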
  Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);

  if (!MemSetValue)
    return false;

  // Generate the stores. For each store type in the list, we generate the
  // matching store of that type to the destination address.
  LLT PtrTy = MRI.getType(Dst);
  unsigned DstOff = 0;
  unsigned Size = KnownLen;
  for (unsigned I = 0; I < MemOps.size(); I++) {
    LLT Ty = MemOps[I];
    unsigned TySize = Ty.getSizeInBytes();
    if (TySize > Size) {
      // Issuing an unaligned store that overlaps with the previous store.
      // Adjust the offset accordingly.
      assert(I == MemOps.size() - 1 && I != 0);
      DstOff -= TySize - Size;
    }

    // If this store is smaller than the largest store, see whether we can get
    // the smaller value for free with a truncate.
    Register Value = MemSetValue;
    if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
      MVT VT = getMVTForLLT(Ty);
      MVT LargestVT = getMVTForLLT(LargestTy);
      if (!LargestTy.isVector() && !Ty.isVector() &&
          TLI.isTruncateFree(LargestVT, VT))
        Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
      else
        Value = getMemsetValue(Val, Ty, MIB);
      if (!Value)
        return false;
    }

    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, DstOff, Ty.getSizeInBytes());

    Register Ptr = Dst;
    if (DstOff != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
      Ptr = MIB.buildGEP(PtrTy, Dst, Offset).getReg(0);
    }

    MIB.buildStore(Value, Ptr, *StoreMMO);
    DstOff += Ty.getSizeInBytes();
    Size -= TySize;
  }

  MI.eraseFromParent();
  return true;
}

bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
                                    Register Src, unsigned KnownLen,
                                    unsigned DstAlign, unsigned SrcAlign,
                                    bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memcpy length!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);
  unsigned Align = MinAlign(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  // FIXME: infer better src pointer alignment like SelectionDAG does here.
  // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
  // if the memcpy is in a tail call position.

  unsigned Limit = TLI.getMaxStoresPerMemcpy(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), SrcAlign,
          /*IsMemset=*/false,
          /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
          /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(),
          SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
    return false;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->needsStackRealignment(MF))
      while (NewAlign > Align &&
             DL.exceedsNaturalStackAlignment(llvm::Align(NewAlign)))
        NewAlign /= 2;

    if (NewAlign > Align) {
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlignment(FI) < NewAlign)
        MFI.setObjectAlignment(FI, NewAlign);
      Align = NewAlign;
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Now we need to emit a load and store pair for each of the types we've
  // collected. I.e. for each type, generate a load from the source pointer of
  // that type width, and then generate a corresponding store to the dest
  // buffer of that value loaded. This can result in a sequence of loads and
  // stores of mixed types, depending on what the target specifies as good
  // types to use.
  unsigned CurrOffset = 0;
  LLT PtrTy = MRI.getType(Src);
  unsigned Size = KnownLen;
  for (auto CopyTy : MemOps) {
    // Issuing an unaligned load / store pair that overlaps with the previous
    // pair. Adjust the offset accordingly.
    if (CopyTy.getSizeInBytes() > Size)
      CurrOffset -= CopyTy.getSizeInBytes() - Size;

    // Construct MMOs for the accesses.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    Register Offset;
    if (CurrOffset != 0) {
      Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
                   .getReg(0);
      LoadPtr = MIB.buildGEP(PtrTy, Src, Offset).getReg(0);
    }
    auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);

    // Create the store.
    Register StorePtr =
        CurrOffset == 0 ? Dst : MIB.buildGEP(PtrTy, Dst, Offset).getReg(0);
    MIB.buildStore(LdVal, StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
    Size -= CopyTy.getSizeInBytes();
  }

  MI.eraseFromParent();
  return true;
}

bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
                                     Register Src, unsigned KnownLen,
                                     unsigned DstAlign, unsigned SrcAlign,
                                     bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memmove length!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);
  unsigned Align = MinAlign(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently
  // due to a bug in its findOptimalMemOpLowering implementation. For now do
  // the same thing here.
  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), SrcAlign,
          /*IsMemset=*/false,
          /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
          /*AllowOverlap=*/false, DstPtrInfo.getAddrSpace(),
          SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
    return false;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->needsStackRealignment(MF))
      while (NewAlign > Align &&
             DL.exceedsNaturalStackAlignment(llvm::Align(NewAlign)))
        NewAlign /= 2;

    if (NewAlign > Align) {
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlignment(FI) < NewAlign)
        MFI.setObjectAlignment(FI, NewAlign);
      Align = NewAlign;
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Memmove requires that we perform the loads first before issuing the
  // stores. Apart from that, this loop is pretty much doing the same thing as
  // the memcpy codegen function.
  unsigned CurrOffset = 0;
  LLT PtrTy = MRI.getType(Src);
  SmallVector<Register, 16> LoadVals;
  for (auto CopyTy : MemOps) {
    // Construct MMO for the load.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    if (CurrOffset != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
      LoadPtr = MIB.buildGEP(PtrTy, Src, Offset).getReg(0);
    }
    LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
    CurrOffset += CopyTy.getSizeInBytes();
  }

  CurrOffset = 0;
  for (unsigned I = 0; I < MemOps.size(); ++I) {
    LLT CopyTy = MemOps[I];
    // Now store the values loaded.
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
      StorePtr = MIB.buildGEP(PtrTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
  }
  MI.eraseFromParent();
  return true;
}

bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
  // This combine is fairly complex, so it's not written with a separate
  // matcher function.
  assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
  Intrinsic::ID ID = (Intrinsic::ID)MI.getIntrinsicID();
  assert((ID == Intrinsic::memcpy || ID == Intrinsic::memmove ||
          ID == Intrinsic::memset) &&
         "Expected a memcpy like intrinsic");

  auto MMOIt = MI.memoperands_begin();
  const MachineMemOperand *MemOp = *MMOIt;
  bool IsVolatile = MemOp->isVolatile();
  // Don't try to optimize volatile.
  if (IsVolatile)
    return false;

  unsigned DstAlign = MemOp->getBaseAlignment();
  unsigned SrcAlign = 0;
  unsigned Dst = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();
  Register Len = MI.getOperand(3).getReg();

  if (ID != Intrinsic::memset) {
    assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
    MemOp = *(++MMOIt);
    SrcAlign = MemOp->getBaseAlignment();
  }

  // See if this is a constant length copy.
  auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
  if (!LenVRegAndVal)
    return false; // Leave it to the legalizer to lower it to a libcall.
  unsigned KnownLen = LenVRegAndVal->Value;

  if (KnownLen == 0) {
    MI.eraseFromParent();
    return true;
  }

  if (MaxLen && KnownLen > MaxLen)
    return false;

  if (ID == Intrinsic::memcpy)
    return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                          IsVolatile);
  if (ID == Intrinsic::memmove)
    return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                           IsVolatile);
  if (ID == Intrinsic::memset)
    return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
  return false;
}

bool CombinerHelper::tryCombine(MachineInstr &MI) {
  if (tryCombineCopy(MI))
    return true;
  return tryCombineExtendingLoads(MI);
}