//===-- lib/CodeGen/GlobalISel/CombinerHelper.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "gi-combiner"

using namespace llvm;

CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
                               MachineIRBuilder &B)
    : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer) {}

void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
                                    Register ToReg) const {
  Observer.changingAllUsesOfReg(MRI, FromReg);

  if (MRI.constrainRegAttrs(ToReg, FromReg))
    MRI.replaceRegWith(FromReg, ToReg);
  else
    Builder.buildCopy(ToReg, FromReg);

  Observer.finishedChangingAllUsesOfReg();
}

void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI,
                                      MachineOperand &FromRegOp,
                                      Register ToReg) const {
  assert(FromRegOp.getParent() && "Expected an operand in an MI");
  Observer.changingInstr(*FromRegOp.getParent());

  FromRegOp.setReg(ToReg);

  Observer.changedInstr(*FromRegOp.getParent());
}

bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
  if (matchCombineCopy(MI)) {
    applyCombineCopy(MI);
    return true;
  }
  return false;
}
bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
  if (MI.getOpcode() != TargetOpcode::COPY)
    return false;
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  // Simple Copy Propagation.
  // a(sx) = COPY b(sx) -> Replace all uses of a with b.
  if (DstTy.isValid() && SrcTy.isValid() && DstTy == SrcTy)
    return true;
  return false;
}
void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  MI.eraseFromParent();
  replaceRegWith(MRI, DstReg, SrcReg);
}

namespace {

/// Select a preference between two uses. CurrentUse is the current preference
/// while *ForCandidate are the attributes of the candidate under consideration.
PreferredTuple ChoosePreferredUse(PreferredTuple &CurrentUse,
                                  const LLT &TyForCandidate,
                                  unsigned OpcodeForCandidate,
                                  MachineInstr *MIForCandidate) {
  if (!CurrentUse.Ty.isValid()) {
    if (CurrentUse.ExtendOpcode == OpcodeForCandidate ||
        CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT)
      return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
    return CurrentUse;
  }

  // We permit the extend to hoist through basic blocks but this is only
  // sensible if the target has extending loads. If you end up lowering back
  // into a load and extend during the legalizer then the end result is
  // hoisting the extend up to the load.
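  //
  // For example (illustrative only; register names and types are made up):
  //   bb0:
  //     %v:_(s8) = G_LOAD %ptr
  //   bb1:
  //     %e:_(s32) = G_SEXT %v(s8)
  // is effectively rewritten so that the extension happens as part of the
  // load in bb0 (a G_SEXTLOAD), with the uses in bb1 fixed up afterwards.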

  // Prefer defined extensions to undefined extensions as these are more
  // likely to reduce the number of instructions.
  if (OpcodeForCandidate == TargetOpcode::G_ANYEXT &&
      CurrentUse.ExtendOpcode != TargetOpcode::G_ANYEXT)
    return CurrentUse;
  else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT &&
           OpcodeForCandidate != TargetOpcode::G_ANYEXT)
    return {TyForCandidate, OpcodeForCandidate, MIForCandidate};

  // Prefer sign extensions to zero extensions as sign-extensions tend to be
  // more expensive, so there is more to be gained by folding the sign
  // extension into the load.
  if (CurrentUse.Ty == TyForCandidate) {
    if (CurrentUse.ExtendOpcode == TargetOpcode::G_SEXT &&
        OpcodeForCandidate == TargetOpcode::G_ZEXT)
      return CurrentUse;
    else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ZEXT &&
             OpcodeForCandidate == TargetOpcode::G_SEXT)
      return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
  }

  // This is potentially target specific. We've chosen the largest type
  // because G_TRUNC is usually free. One potential catch with this is that
  // some targets have a reduced number of larger registers than smaller
  // registers and this choice potentially increases the live-range for the
  // larger value.
  if (TyForCandidate.getSizeInBits() > CurrentUse.Ty.getSizeInBits()) {
    return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
  }
  return CurrentUse;
}

/// Find a suitable place to insert some instructions and insert them. This
/// function accounts for special cases like inserting before a PHI node.
/// The current strategy for inserting before PHIs is to duplicate the
/// instructions for each predecessor. However, while that's ok for G_TRUNC
/// on most targets since it generally requires no code, other targets/cases
/// may want to try harder to find a dominating block.
static void InsertInsnsWithoutSideEffectsBeforeUse(
    MachineIRBuilder &Builder, MachineInstr &DefMI, MachineOperand &UseMO,
    std::function<void(MachineBasicBlock *, MachineBasicBlock::iterator,
                       MachineOperand &UseMO)>
        Inserter) {
  MachineInstr &UseMI = *UseMO.getParent();

  MachineBasicBlock *InsertBB = UseMI.getParent();

  // If the use is a PHI then we want the predecessor block instead.
  if (UseMI.isPHI()) {
    MachineOperand *PredBB = std::next(&UseMO);
    InsertBB = PredBB->getMBB();
  }

  // If the block is the same block as the def then we want to insert just
  // after the def instead of at the start of the block.
  if (InsertBB == DefMI.getParent()) {
    MachineBasicBlock::iterator InsertPt = &DefMI;
    Inserter(InsertBB, std::next(InsertPt), UseMO);
    return;
  }

  // Otherwise we want the start of the BB.
  Inserter(InsertBB, InsertBB->getFirstNonPHI(), UseMO);
}
} // end anonymous namespace

bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
  PreferredTuple Preferred;
  if (matchCombineExtendingLoads(MI, Preferred)) {
    applyCombineExtendingLoads(MI, Preferred);
    return true;
  }
  return false;
}

bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI,
                                                PreferredTuple &Preferred) {
  // We match the loads and follow the uses to the extend instead of matching
  // the extends and following the def to the load. This is because the load
  // must remain in the same position for correctness (unless we also add code
  // to find a safe place to sink it) whereas the extend is freely movable.
  // It also prevents us from duplicating the load for the volatile case or
  // just for performance.

  if (MI.getOpcode() != TargetOpcode::G_LOAD &&
      MI.getOpcode() != TargetOpcode::G_SEXTLOAD &&
      MI.getOpcode() != TargetOpcode::G_ZEXTLOAD)
    return false;

  auto &LoadValue = MI.getOperand(0);
  assert(LoadValue.isReg() && "Result wasn't a register?");

  LLT LoadValueTy = MRI.getType(LoadValue.getReg());
  if (!LoadValueTy.isScalar())
    return false;

  // Most architectures are going to legalize <s8 loads into at least a 1 byte
  // load, and the MMOs can only describe memory accesses in multiples of
  // bytes. If we try to perform extload combining on those, we can end up with
  //   %a(s8) = extload %ptr (load 1 byte from %ptr)
  // ... which is an illegal extload instruction.
  if (LoadValueTy.getSizeInBits() < 8)
    return false;

  // Non power-of-2 types will very likely be legalized into multiple loads.
  // Don't bother trying to match them into extending loads.
  if (!isPowerOf2_32(LoadValueTy.getSizeInBits()))
    return false;

  // Find the preferred type aside from the any-extends (unless it's the only
  // one) and non-extending ops. We'll emit an extending load to that type and
  // emit a variant of (extend (trunc X)) for the others according to the
  // relative type sizes. At the same time, pick an extend to use based on the
  // extend involved in the chosen type.
  unsigned PreferredOpcode = MI.getOpcode() == TargetOpcode::G_LOAD
                                 ? TargetOpcode::G_ANYEXT
                                 : MI.getOpcode() == TargetOpcode::G_SEXTLOAD
                                       ? TargetOpcode::G_SEXT
                                       : TargetOpcode::G_ZEXT;
  Preferred = {LLT(), PreferredOpcode, nullptr};
  for (auto &UseMI : MRI.use_instructions(LoadValue.getReg())) {
    if (UseMI.getOpcode() == TargetOpcode::G_SEXT ||
        UseMI.getOpcode() == TargetOpcode::G_ZEXT ||
        UseMI.getOpcode() == TargetOpcode::G_ANYEXT) {
      Preferred = ChoosePreferredUse(Preferred,
                                     MRI.getType(UseMI.getOperand(0).getReg()),
                                     UseMI.getOpcode(), &UseMI);
    }
  }

  // There were no extends.
  if (!Preferred.MI)
    return false;
  // It should be impossible to choose an extend without selecting a different
  // type since by definition the result of an extend is larger.
  assert(Preferred.Ty != LoadValueTy && "Extending to same type?");

  LLVM_DEBUG(dbgs() << "Preferred use is: " << *Preferred.MI);
  return true;
}

void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
                                                PreferredTuple &Preferred) {
  // Rewrite the load to the chosen extending load.
  Register ChosenDstReg = Preferred.MI->getOperand(0).getReg();

  // Inserter to insert a truncate back to the original type at a given point
  // with some basic CSE to limit truncate duplication to one per BB.
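  // For example (illustrative; register names are made up), if two
  // non-extend uses of the loaded value live in the same block, the first
  // one gets a new
  //   %t:_(s8) = G_TRUNC %chosen
  // and the second one is simply rewired to reuse %t instead of emitting a
  // second truncate.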
  DenseMap<MachineBasicBlock *, MachineInstr *> EmittedInsns;
  auto InsertTruncAt = [&](MachineBasicBlock *InsertIntoBB,
                           MachineBasicBlock::iterator InsertBefore,
                           MachineOperand &UseMO) {
    MachineInstr *PreviouslyEmitted = EmittedInsns.lookup(InsertIntoBB);
    if (PreviouslyEmitted) {
      Observer.changingInstr(*UseMO.getParent());
      UseMO.setReg(PreviouslyEmitted->getOperand(0).getReg());
      Observer.changedInstr(*UseMO.getParent());
      return;
    }

    Builder.setInsertPt(*InsertIntoBB, InsertBefore);
    Register NewDstReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
    MachineInstr *NewMI = Builder.buildTrunc(NewDstReg, ChosenDstReg);
    EmittedInsns[InsertIntoBB] = NewMI;
    replaceRegOpWith(MRI, UseMO, NewDstReg);
  };

  Observer.changingInstr(MI);
  MI.setDesc(
      Builder.getTII().get(Preferred.ExtendOpcode == TargetOpcode::G_SEXT
                               ? TargetOpcode::G_SEXTLOAD
                               : Preferred.ExtendOpcode == TargetOpcode::G_ZEXT
                                     ? TargetOpcode::G_ZEXTLOAD
                                     : TargetOpcode::G_LOAD));

  // Rewrite all the uses to fix up the types.
  auto &LoadValue = MI.getOperand(0);
  SmallVector<MachineOperand *, 4> Uses;
  for (auto &UseMO : MRI.use_operands(LoadValue.getReg()))
    Uses.push_back(&UseMO);

  for (auto *UseMO : Uses) {
    MachineInstr *UseMI = UseMO->getParent();

    // If the extend is compatible with the preferred extend then we should fix
    // up the type and extend so that it uses the preferred use.
    if (UseMI->getOpcode() == Preferred.ExtendOpcode ||
        UseMI->getOpcode() == TargetOpcode::G_ANYEXT) {
      Register UseDstReg = UseMI->getOperand(0).getReg();
      MachineOperand &UseSrcMO = UseMI->getOperand(1);
      const LLT &UseDstTy = MRI.getType(UseDstReg);
      if (UseDstReg != ChosenDstReg) {
        if (Preferred.Ty == UseDstTy) {
          // If the use has the same type as the preferred use, then merge
          // the vregs and erase the extend. For example:
          //   %1:_(s8) = G_LOAD ...
          //   %2:_(s32) = G_SEXT %1(s8)
          //   %3:_(s32) = G_ANYEXT %1(s8)
          //   ... = ... %3(s32)
          // rewrites to:
          //   %2:_(s32) = G_SEXTLOAD ...
          //   ... = ... %2(s32)
          replaceRegWith(MRI, UseDstReg, ChosenDstReg);
          Observer.erasingInstr(*UseMO->getParent());
          UseMO->getParent()->eraseFromParent();
        } else if (Preferred.Ty.getSizeInBits() < UseDstTy.getSizeInBits()) {
          // If the preferred size is smaller, then keep the extend but extend
          // from the result of the extending load. For example:
          //   %1:_(s8) = G_LOAD ...
          //   %2:_(s32) = G_SEXT %1(s8)
          //   %3:_(s64) = G_ANYEXT %1(s8)
          //   ... = ... %3(s64)
          // rewrites to:
          //   %2:_(s32) = G_SEXTLOAD ...
          //   %3:_(s64) = G_ANYEXT %2:_(s32)
          //   ... = ... %3(s64)
          replaceRegOpWith(MRI, UseSrcMO, ChosenDstReg);
        } else {
          // If the preferred size is larger, then insert a truncate. For
          // example:
          //   %1:_(s8) = G_LOAD ...
          //   %2:_(s64) = G_SEXT %1(s8)
          //   %3:_(s32) = G_ZEXT %1(s8)
          //   ... = ... %3(s32)
          // rewrites to:
          //   %2:_(s64) = G_SEXTLOAD ...
          //   %4:_(s8) = G_TRUNC %2:_(s64)
          //   %3:_(s32) = G_ZEXT %4:_(s8)
          //   ... = ... %3(s32)
          InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO,
                                                 InsertTruncAt);
        }
        continue;
      }
      // This use is the preferred extend we chose earlier. We're going to
      // update the load to define this value directly, so just erase the old
      // extend.
      Observer.erasingInstr(*UseMO->getParent());
      UseMO->getParent()->eraseFromParent();
      continue;
    }

    // The use isn't an extend. Truncate back to the type we originally loaded.
    // This is free on many targets.
    InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO, InsertTruncAt);
  }

  MI.getOperand(0).setReg(ChosenDstReg);
  Observer.changedInstr(MI);
}

bool CombinerHelper::matchCombineBr(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_BR && "Expected a G_BR");
  // Try to match the following:
  // bb1:
  //   %c(s32) = G_ICMP pred, %a, %b
  //   %c1(s1) = G_TRUNC %c(s32)
  //   G_BRCOND %c1, %bb2
  //   G_BR %bb3
  // bb2:
  //   ...
  // bb3:

  // The above pattern does not use the fallthrough to the layout successor
  // bb2; a branch is taken no matter which way the condition goes. Here we
  // try to find and replace that pattern with a conditional branch to bb3 and
  // a fallthrough to bb2 otherwise.

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator BrIt(MI);
  if (BrIt == MBB->begin())
    return false;
  assert(std::next(BrIt) == MBB->end() && "expected G_BR to be a terminator");

  MachineInstr *BrCond = &*std::prev(BrIt);
  if (BrCond->getOpcode() != TargetOpcode::G_BRCOND)
    return false;

  // Check that the next block is the conditional branch target.
  if (!MBB->isLayoutSuccessor(BrCond->getOperand(1).getMBB()))
    return false;

  MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
  if (!CmpMI || CmpMI->getOpcode() != TargetOpcode::G_ICMP ||
      !MRI.hasOneUse(CmpMI->getOperand(0).getReg()))
    return false;
  return true;
}

bool CombinerHelper::tryCombineBr(MachineInstr &MI) {
  if (!matchCombineBr(MI))
    return false;
  MachineBasicBlock *BrTarget = MI.getOperand(0).getMBB();
  MachineBasicBlock::iterator BrIt(MI);
  MachineInstr *BrCond = &*std::prev(BrIt);
  MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());

  CmpInst::Predicate InversePred = CmpInst::getInversePredicate(
      (CmpInst::Predicate)CmpMI->getOperand(1).getPredicate());

  // Invert the G_ICMP condition.
  Observer.changingInstr(*CmpMI);
  CmpMI->getOperand(1).setPredicate(InversePred);
  Observer.changedInstr(*CmpMI);

  // Change the conditional branch to target the G_BR's destination; the old
  // conditional target is now reached by falling through.
  Observer.changingInstr(*BrCond);
  BrCond->getOperand(1).setMBB(BrTarget);
  Observer.changedInstr(*BrCond);
  MI.eraseFromParent();
  return true;
}

static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
  // On Darwin, -Os means optimize for size without hurting performance, so
  // only really optimize for size when -Oz (MinSize) is used.
  if (MF.getTarget().getTargetTriple().isOSDarwin())
    return MF.getFunction().hasMinSize();
  return MF.getFunction().hasOptSize();
}

// Get a rough equivalent of an MVT for a given LLT.
static MVT getMVTForLLT(LLT Ty) {
  if (!Ty.isVector())
    return MVT::getIntegerVT(Ty.getSizeInBits());

  return MVT::getVectorVT(
      MVT::getIntegerVT(Ty.getElementType().getSizeInBits()),
      Ty.getNumElements());
}

// Returns a list of types to use for memory op lowering in MemOps. A partial
// port of findOptimalMemOpLowering in TargetLowering.
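// For example (illustrative only; the real choice is entirely target and
// alignment dependent), a 13-byte copy on a 64-bit target might come back as
// MemOps = {s64, s32, s8}, i.e. one 8-byte, one 4-byte and one 1-byte access,
// possibly overlapping if AllowOverlap is set.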
static bool findGISelOptimalMemOpLowering(
    std::vector<LLT> &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign,
    unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
    bool AllowOverlap, unsigned DstAS, unsigned SrcAS,
    const AttributeList &FuncAttributes, const TargetLowering &TLI) {
  // If 'SrcAlign' is zero, that means the memory operation does not need to
  // load the value, i.e. memset or memcpy from constant string. Otherwise,
  // it's the inferred alignment of the source. 'DstAlign', on the other hand,
  // is the specified alignment of the memory operation. If it is zero, that
  // means it's possible to change the alignment of the destination.
  // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
  // not need to be loaded.
  if (SrcAlign != 0 && SrcAlign < DstAlign)
    return false;

  LLT Ty = TLI.getOptimalMemOpLLT(Size, DstAlign, SrcAlign, IsMemset,
                                  ZeroMemset, MemcpyStrSrc, FuncAttributes);

  if (Ty == LLT()) {
    // Use the largest scalar type whose alignment constraints are satisfied.
    // We only need to check DstAlign here as SrcAlign is always greater or
    // equal to DstAlign (or zero).
    Ty = LLT::scalar(64);
    while (DstAlign && DstAlign < Ty.getSizeInBytes() &&
           !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, DstAlign))
      Ty = LLT::scalar(Ty.getSizeInBytes());
    assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
    // FIXME: check for the largest legal type we can load/store to.
  }

  unsigned NumMemOps = 0;
  while (Size != 0) {
    unsigned TySize = Ty.getSizeInBytes();
    while (TySize > Size) {
      // For now, only use non-vector load / stores for the left-over pieces.
      LLT NewTy = Ty;
      // FIXME: check for mem op safety and legality of the types. Not all of
      // SDAGisms map cleanly to GISel concepts.
      if (NewTy.isVector())
        NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
      unsigned NewTySize = NewTy.getSizeInBytes();

      // Step down to the largest power-of-2 scalar that is strictly smaller
      // than the current type.
      NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
      NewTySize = NewTy.getSizeInBytes();
      assert(NewTySize > 0 && "Could not find appropriate type");

      // If the new LLT cannot cover all of the remaining bits, then consider
      // issuing a (or a pair of) unaligned and overlapping load / store.
      bool Fast;
      // Need to get a VT equivalent for allowsMisalignedMemoryAccesses().
      MVT VT = getMVTForLLT(Ty);
      if (NumMemOps && AllowOverlap && NewTySize < Size &&
          TLI.allowsMisalignedMemoryAccesses(
              VT, DstAS, DstAlign, MachineMemOperand::MONone, &Fast) &&
          Fast)
        TySize = Size;
      else {
        Ty = NewTy;
        TySize = NewTySize;
      }
    }

    if (++NumMemOps > Limit)
      return false;

    MemOps.push_back(Ty);
    Size -= TySize;
  }

  return true;
}

static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
  if (Ty.isVector())
    return VectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
                           Ty.getNumElements());
  return IntegerType::get(C, Ty.getSizeInBits());
}

// Get a vectorized representation of the memset value operand, GISel edition.
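// For example (illustrative), a constant byte value 0xAB widened to an s32
// store type becomes the constant 0xABABABAB; a non-constant byte value is
// instead zero-extended and multiplied by the 0x01010101... magic constant.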
static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  unsigned NumBits = Ty.getScalarSizeInBits();
  auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
  if (!Ty.isVector() && ValVRegAndVal) {
    unsigned KnownVal = ValVRegAndVal->Value;
    APInt Scalar = APInt(8, KnownVal);
    APInt SplatVal = APInt::getSplat(NumBits, Scalar);
    return MIB.buildConstant(Ty, SplatVal).getReg(0);
  }
  // FIXME: for vector types create a G_BUILD_VECTOR.
  if (Ty.isVector())
    return Register();

  // Extend the byte value to the larger type, and then multiply by a magic
  // value 0x010101... in order to replicate it across every byte.
  LLT ExtType = Ty.getScalarType();
  auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
  if (NumBits > 8) {
    APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
    auto MagicMI = MIB.buildConstant(ExtType, Magic);
    Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
  }

  assert(ExtType == Ty && "Vector memset value type not supported yet");
  return Val;
}

bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst,
                                    Register Val, unsigned KnownLen,
                                    unsigned Align, bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memset!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();

  auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
  bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;

  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), 0,
          /*IsMemset=*/true,
          /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false,
          /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(), ~0u,
          MF.getFunction().getAttributes(), TLI))
    return false;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
    if (NewAlign > Align) {
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlignment(FI) < NewAlign)
        MFI.setObjectAlignment(FI, NewAlign);
      Align = NewAlign;
    }
  }

  MachineIRBuilder MIB(MI);
  // Find the largest store and generate the bit pattern for it.
  LLT LargestTy = MemOps[0];
  for (unsigned i = 1; i < MemOps.size(); i++)
    if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
      LargestTy = MemOps[i];

  // The memset stored value is always defined as an s8, so in order to make
  // it work with larger store types we need to repeat the bit pattern across
  // the wider type.
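  // For example (illustrative), with MemOps == {s64, s64} and a byte value of
  // 0x7F, the widened pattern is the s64 constant 0x7F7F7F7F7F7F7F7F, stored
  // twice at offsets 0 and 8.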
  Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);

  if (!MemSetValue)
    return false;

  // Generate the stores. For each store type in the list, we generate the
  // matching store of that type to the destination address.
  LLT PtrTy = MRI.getType(Dst);
  unsigned DstOff = 0;
  unsigned Size = KnownLen;
  for (unsigned I = 0; I < MemOps.size(); I++) {
    LLT Ty = MemOps[I];
    unsigned TySize = Ty.getSizeInBytes();
    if (TySize > Size) {
      // Issuing an unaligned store that overlaps with the previous store.
      // Adjust the offset accordingly.
      assert(I == MemOps.size() - 1 && I != 0);
      DstOff -= TySize - Size;
    }

    // If this store is smaller than the largest store, see whether we can get
    // the smaller value for free with a truncate.
    Register Value = MemSetValue;
    if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
      MVT VT = getMVTForLLT(Ty);
      MVT LargestVT = getMVTForLLT(LargestTy);
      if (!LargestTy.isVector() && !Ty.isVector() &&
          TLI.isTruncateFree(LargestVT, VT))
        Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
      else
        Value = getMemsetValue(Val, Ty, MIB);
      if (!Value)
        return false;
    }

    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, DstOff, Ty.getSizeInBytes());

    Register Ptr = Dst;
    if (DstOff != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
      Ptr = MIB.buildGEP(PtrTy, Dst, Offset).getReg(0);
    }

    MIB.buildStore(Value, Ptr, *StoreMMO);
    DstOff += Ty.getSizeInBytes();
    Size -= TySize;
  }

  MI.eraseFromParent();
  return true;
}

bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
                                    Register Src, unsigned KnownLen,
                                    unsigned DstAlign, unsigned SrcAlign,
                                    bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memcpy!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);
  unsigned Align = MinAlign(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  // FIXME: infer better src pointer alignment like SelectionDAG does here.
  // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
  // if the memcpy is in a tail call position.

  unsigned Limit = TLI.getMaxStoresPerMemcpy(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), SrcAlign,
          /*IsMemset=*/false,
          /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
          /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(),
          SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
    return false;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
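    // If the destination is a local stack object, we may be able to raise its
    // alignment to that type's ABI alignment (below), so that the wider
    // accesses chosen above stay naturally aligned.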
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->needsStackRealignment(MF))
      while (NewAlign > Align &&
             DL.exceedsNaturalStackAlignment(NewAlign))
        NewAlign /= 2;

    if (NewAlign > Align) {
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlignment(FI) < NewAlign)
        MFI.setObjectAlignment(FI, NewAlign);
      Align = NewAlign;
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Now we need to emit a load/store pair for each of the types we've
  // collected. I.e. for each type, generate a load from the source pointer of
  // that type width, and then generate a corresponding store to the dest
  // buffer of the value loaded. This can result in a sequence of loads and
  // stores of mixed types, depending on what the target specifies as good
  // types to use.
  unsigned CurrOffset = 0;
  LLT PtrTy = MRI.getType(Src);
  unsigned Size = KnownLen;
  for (auto CopyTy : MemOps) {
    // Issuing an unaligned load / store pair that overlaps with the previous
    // pair. Adjust the offset accordingly.
    if (CopyTy.getSizeInBytes() > Size)
      CurrOffset -= CopyTy.getSizeInBytes() - Size;

    // Construct MMOs for the accesses.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    Register Offset;
    if (CurrOffset != 0) {
      Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
                   .getReg(0);
      LoadPtr = MIB.buildGEP(PtrTy, Src, Offset).getReg(0);
    }
    auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);

    // Create the store.
    Register StorePtr =
        CurrOffset == 0 ? Dst : MIB.buildGEP(PtrTy, Dst, Offset).getReg(0);
    MIB.buildStore(LdVal, StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
    Size -= CopyTy.getSizeInBytes();
  }

  MI.eraseFromParent();
  return true;
}

bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
                                     Register Src, unsigned KnownLen,
                                     unsigned DstAlign, unsigned SrcAlign,
                                     bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memmove!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);
  unsigned Align = MinAlign(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in its findOptimalMemOpLowering implementation. For now do the
  // same thing here.
  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), SrcAlign,
          /*IsMemset=*/false,
          /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
          /*AllowOverlap=*/false, DstPtrInfo.getAddrSpace(),
          SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
    return false;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->needsStackRealignment(MF))
      while (NewAlign > Align &&
             DL.exceedsNaturalStackAlignment(NewAlign))
        NewAlign /= 2;

    if (NewAlign > Align) {
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlignment(FI) < NewAlign)
        MFI.setObjectAlignment(FI, NewAlign);
      Align = NewAlign;
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Memmove requires that we perform the loads first before issuing the
  // stores. Apart from that, this loop is pretty much doing the same thing as
  // the memcpy codegen function.
  unsigned CurrOffset = 0;
  LLT PtrTy = MRI.getType(Src);
  SmallVector<Register, 16> LoadVals;
  for (auto CopyTy : MemOps) {
    // Construct MMO for the load.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    if (CurrOffset != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
      LoadPtr = MIB.buildGEP(PtrTy, Src, Offset).getReg(0);
    }
    LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
    CurrOffset += CopyTy.getSizeInBytes();
  }

  CurrOffset = 0;
  for (unsigned I = 0; I < MemOps.size(); ++I) {
    LLT CopyTy = MemOps[I];
    // Now store the values loaded.
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
      StorePtr = MIB.buildGEP(PtrTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
  }
  MI.eraseFromParent();
  return true;
}

bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI) {
  // This combine is fairly complex so it's not written with a separate
  // matcher function.
  assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
  Intrinsic::ID ID = (Intrinsic::ID)MI.getIntrinsicID();
  assert((ID == Intrinsic::memcpy || ID == Intrinsic::memmove ||
          ID == Intrinsic::memset) &&
         "Expected a memcpy like intrinsic");

  auto MMOIt = MI.memoperands_begin();
  const MachineMemOperand *MemOp = *MMOIt;
  bool IsVolatile = MemOp->isVolatile();
  // Don't try to optimize volatile.
  if (IsVolatile)
    return false;

  unsigned DstAlign = MemOp->getBaseAlignment();
  unsigned SrcAlign = 0;
  Register Dst = MI.getOperand(1).getReg();
  Register Src = MI.getOperand(2).getReg();
  Register Len = MI.getOperand(3).getReg();

  if (ID != Intrinsic::memset) {
    assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
    MemOp = *(++MMOIt);
    SrcAlign = MemOp->getBaseAlignment();
  }

  // See if this is a constant length copy.
  auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
  if (!LenVRegAndVal)
    return false; // Leave it to the legalizer to lower it to a libcall.
  unsigned KnownLen = LenVRegAndVal->Value;

  if (KnownLen == 0) {
    MI.eraseFromParent();
    return true;
  }

  if (ID == Intrinsic::memcpy)
    return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                          IsVolatile);
  if (ID == Intrinsic::memmove)
    return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                           IsVolatile);
  if (ID == Intrinsic::memset)
    return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
  return false;
}

bool CombinerHelper::tryCombine(MachineInstr &MI) {
  if (tryCombineCopy(MI))
    return true;
  return tryCombineExtendingLoads(MI);
}