1 //===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file defines a pass that optimizes call sequences on x86. 11 // Currently, it converts movs of function parameters onto the stack into 12 // pushes. This is beneficial for two main reasons: 13 // 1) The push instruction encoding is much smaller than an esp-relative mov 14 // 2) It is possible to push memory arguments directly. So, if the 15 // the transformation is preformed pre-reg-alloc, it can help relieve 16 // register pressure. 17 // 18 //===----------------------------------------------------------------------===// 19 20 #include <algorithm> 21 22 #include "X86.h" 23 #include "X86InstrInfo.h" 24 #include "X86Subtarget.h" 25 #include "X86MachineFunctionInfo.h" 26 #include "llvm/ADT/Statistic.h" 27 #include "llvm/CodeGen/MachineFunctionPass.h" 28 #include "llvm/CodeGen/MachineInstrBuilder.h" 29 #include "llvm/CodeGen/MachineRegisterInfo.h" 30 #include "llvm/CodeGen/Passes.h" 31 #include "llvm/IR/Function.h" 32 #include "llvm/Support/Debug.h" 33 #include "llvm/Support/raw_ostream.h" 34 #include "llvm/Target/TargetInstrInfo.h" 35 36 using namespace llvm; 37 38 #define DEBUG_TYPE "x86-cf-opt" 39 40 cl::opt<bool> NoX86CFOpt("no-x86-call-frame-opt", 41 cl::desc("Avoid optimizing x86 call frames for size"), 42 cl::init(false), cl::Hidden); 43 44 namespace { 45 class X86CallFrameOptimization : public MachineFunctionPass { 46 public: 47 X86CallFrameOptimization() : MachineFunctionPass(ID) {} 48 49 bool runOnMachineFunction(MachineFunction &MF) override; 50 51 private: 52 bool shouldPerformTransformation(MachineFunction &MF); 53 54 bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB, 55 MachineBasicBlock::iterator I); 56 57 MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup, 58 unsigned Reg); 59 60 const char *getPassName() const override { 61 return "X86 Optimize Call Frame"; 62 } 63 64 const TargetInstrInfo *TII; 65 const TargetFrameLowering *TFL; 66 const MachineRegisterInfo *MRI; 67 static char ID; 68 }; 69 70 char X86CallFrameOptimization::ID = 0; 71 } 72 73 FunctionPass *llvm::createX86CallFrameOptimization() { 74 return new X86CallFrameOptimization(); 75 } 76 77 // This checks whether the transformation is legal and profitable 78 bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) { 79 if (NoX86CFOpt.getValue()) 80 return false; 81 82 // We currently only support call sequences where *all* parameters. 83 // are passed on the stack. 84 // No point in running this in 64-bit mode, since some arguments are 85 // passed in-register in all common calling conventions, so the pattern 86 // we're looking for will never match. 87 const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); 88 if (STI.is64Bit()) 89 return false; 90 91 // You would expect straight-line code between call-frame setup and 92 // call-frame destroy. You would be wrong. There are circumstances (e.g. 93 // CMOV_GR8 expansion of a select that feeds a function call!) where we can 94 // end up with the setup and the destroy in different basic blocks. 95 // This is bad, and breaks SP adjustment. 96 // So, check that all of the frames in the function are closed inside 97 // the same block, and, for good measure, that there are no nested frames. 98 int FrameSetupOpcode = TII->getCallFrameSetupOpcode(); 99 int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); 100 for (MachineBasicBlock &BB : MF) { 101 bool InsideFrameSequence = false; 102 for (MachineInstr &MI : BB) { 103 if (MI.getOpcode() == FrameSetupOpcode) { 104 if (InsideFrameSequence) 105 return false; 106 InsideFrameSequence = true; 107 } 108 else if (MI.getOpcode() == FrameDestroyOpcode) { 109 if (!InsideFrameSequence) 110 return false; 111 InsideFrameSequence = false; 112 } 113 } 114 115 if (InsideFrameSequence) 116 return false; 117 } 118 119 // Now that we know the transformation is legal, check if it is 120 // profitable. 121 // TODO: Add a heuristic that actually looks at the function, 122 // and enable this for more cases. 123 124 // This transformation is always a win when we expected to have 125 // a reserved call frame. Under other circumstances, it may be either 126 // a win or a loss, and requires a heuristic. 127 // For now, enable it only for the relatively clear win cases. 128 bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects(); 129 if (CannotReserveFrame) 130 return true; 131 132 // For now, don't even try to evaluate the profitability when 133 // not optimizing for size. 134 AttributeSet FnAttrs = MF.getFunction()->getAttributes(); 135 bool OptForSize = 136 FnAttrs.hasAttribute(AttributeSet::FunctionIndex, 137 Attribute::OptimizeForSize) || 138 FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); 139 140 if (!OptForSize) 141 return false; 142 143 // Stack re-alignment can make this unprofitable even in terms of size. 144 // As mentioned above, a better heuristic is needed. For now, don't do this 145 // when the required alignment is above 8. (4 would be the safe choice, but 146 // some experimentation showed 8 is generally good). 147 if (TFL->getStackAlignment() > 8) 148 return false; 149 150 return true; 151 } 152 153 bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { 154 TII = MF.getSubtarget().getInstrInfo(); 155 TFL = MF.getSubtarget().getFrameLowering(); 156 MRI = &MF.getRegInfo(); 157 158 if (!shouldPerformTransformation(MF)) 159 return false; 160 161 int FrameSetupOpcode = TII->getCallFrameSetupOpcode(); 162 163 bool Changed = false; 164 165 for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) 166 for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) 167 if (I->getOpcode() == FrameSetupOpcode) 168 Changed |= adjustCallSequence(MF, *BB, I); 169 170 return Changed; 171 } 172 173 bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, 174 MachineBasicBlock &MBB, 175 MachineBasicBlock::iterator I) { 176 177 // Check that this particular call sequence is amenable to the 178 // transformation. 179 const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( 180 MF.getSubtarget().getRegisterInfo()); 181 unsigned StackPtr = RegInfo.getStackRegister(); 182 int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); 183 184 // We expect to enter this at the beginning of a call sequence 185 assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); 186 MachineBasicBlock::iterator FrameSetup = I++; 187 188 189 // For globals in PIC mode, we can have some LEAs here. 190 // Ignore them, they don't bother us. 191 // TODO: Extend this to something that covers more cases. 192 while (I->getOpcode() == X86::LEA32r) 193 ++I; 194 195 // We expect a copy instruction here. 196 // TODO: The copy instruction is a lowering artifact. 197 // We should also support a copy-less version, where the stack 198 // pointer is used directly. 199 if (!I->isCopy() || !I->getOperand(0).isReg()) 200 return false; 201 MachineBasicBlock::iterator SPCopy = I++; 202 StackPtr = SPCopy->getOperand(0).getReg(); 203 204 // Scan the call setup sequence for the pattern we're looking for. 205 // We only handle a simple case - a sequence of MOV32mi or MOV32mr 206 // instructions, that push a sequence of 32-bit values onto the stack, with 207 // no gaps between them. 208 SmallVector<MachineInstr*, 4> MovVector(4, nullptr); 209 unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4; 210 if (MaxAdjust > 4) 211 MovVector.resize(MaxAdjust, nullptr); 212 213 do { 214 int Opcode = I->getOpcode(); 215 if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr) 216 break; 217 218 // We only want movs of the form: 219 // movl imm/r32, k(%esp) 220 // If we run into something else, bail. 221 // Note that AddrBaseReg may, counter to its name, not be a register, 222 // but rather a frame index. 223 // TODO: Support the fi case. This should probably work now that we 224 // have the infrastructure to track the stack pointer within a call 225 // sequence. 226 if (!I->getOperand(X86::AddrBaseReg).isReg() || 227 (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) || 228 !I->getOperand(X86::AddrScaleAmt).isImm() || 229 (I->getOperand(X86::AddrScaleAmt).getImm() != 1) || 230 (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) || 231 (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) || 232 !I->getOperand(X86::AddrDisp).isImm()) 233 return false; 234 235 int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm(); 236 assert(StackDisp >= 0 && "Negative stack displacement when passing parameters"); 237 238 // We really don't want to consider the unaligned case. 239 if (StackDisp % 4) 240 return false; 241 StackDisp /= 4; 242 243 assert((size_t)StackDisp < MovVector.size() && 244 "Function call has more parameters than the stack is adjusted for."); 245 246 // If the same stack slot is being filled twice, something's fishy. 247 if (MovVector[StackDisp] != nullptr) 248 return false; 249 MovVector[StackDisp] = I; 250 251 ++I; 252 } while (I != MBB.end()); 253 254 // We now expect the end of the sequence - a call and a stack adjust. 255 if (I == MBB.end()) 256 return false; 257 258 // For PCrel calls, we expect an additional COPY of the basereg. 259 // If we find one, skip it. 260 if (I->isCopy()) { 261 if (I->getOperand(1).getReg() == 262 MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg()) 263 ++I; 264 else 265 return false; 266 } 267 268 if (!I->isCall()) 269 return false; 270 MachineBasicBlock::iterator Call = I; 271 if ((++I)->getOpcode() != FrameDestroyOpcode) 272 return false; 273 274 // Now, go through the vector, and see that we don't have any gaps, 275 // but only a series of 32-bit MOVs. 276 277 int64_t ExpectedDist = 0; 278 auto MMI = MovVector.begin(), MME = MovVector.end(); 279 for (; MMI != MME; ++MMI, ExpectedDist += 4) 280 if (*MMI == nullptr) 281 break; 282 283 // If the call had no parameters, do nothing 284 if (!ExpectedDist) 285 return false; 286 287 // We are either at the last parameter, or a gap. 288 // Make sure it's not a gap 289 for (; MMI != MME; ++MMI) 290 if (*MMI != nullptr) 291 return false; 292 293 // Ok, we can in fact do the transformation for this call. 294 // Do not remove the FrameSetup instruction, but adjust the parameters. 295 // PEI will end up finalizing the handling of this. 296 FrameSetup->getOperand(1).setImm(ExpectedDist); 297 298 DebugLoc DL = I->getDebugLoc(); 299 // Now, iterate through the vector in reverse order, and replace the movs 300 // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to 301 // replace uses. 302 for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) { 303 MachineBasicBlock::iterator MOV = *MovVector[Idx]; 304 MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); 305 if (MOV->getOpcode() == X86::MOV32mi) { 306 unsigned PushOpcode = X86::PUSHi32; 307 // If the operand is a small (8-bit) immediate, we can use a 308 // PUSH instruction with a shorter encoding. 309 // Note that isImm() may fail even though this is a MOVmi, because 310 // the operand can also be a symbol. 311 if (PushOp.isImm()) { 312 int64_t Val = PushOp.getImm(); 313 if (isInt<8>(Val)) 314 PushOpcode = X86::PUSH32i8; 315 } 316 BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp); 317 } else { 318 unsigned int Reg = PushOp.getReg(); 319 320 // If PUSHrmm is not slow on this target, try to fold the source of the 321 // push into the instruction. 322 const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>(); 323 bool SlowPUSHrmm = ST.isAtom() || ST.isSLM(); 324 325 // Check that this is legal to fold. Right now, we're extremely 326 // conservative about that. 327 MachineInstr *DefMov = nullptr; 328 if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) { 329 MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm)); 330 331 unsigned NumOps = DefMov->getDesc().getNumOperands(); 332 for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) 333 Push->addOperand(DefMov->getOperand(i)); 334 335 DefMov->eraseFromParent(); 336 } else { 337 BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr(); 338 } 339 } 340 341 MBB.erase(MOV); 342 } 343 344 // The stack-pointer copy is no longer used in the call sequences. 345 // There should not be any other users, but we can't commit to that, so: 346 if (MRI->use_empty(SPCopy->getOperand(0).getReg())) 347 SPCopy->eraseFromParent(); 348 349 // Once we've done this, we need to make sure PEI doesn't assume a reserved 350 // frame. 351 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 352 FuncInfo->setHasPushSequences(true); 353 354 return true; 355 } 356 357 MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( 358 MachineBasicBlock::iterator FrameSetup, unsigned Reg) { 359 // Do an extremely restricted form of load folding. 360 // ISel will often create patterns like: 361 // movl 4(%edi), %eax 362 // movl 8(%edi), %ecx 363 // movl 12(%edi), %edx 364 // movl %edx, 8(%esp) 365 // movl %ecx, 4(%esp) 366 // movl %eax, (%esp) 367 // call 368 // Get rid of those with prejudice. 369 if (!TargetRegisterInfo::isVirtualRegister(Reg)) 370 return nullptr; 371 372 // Make sure this is the only use of Reg. 373 if (!MRI->hasOneNonDBGUse(Reg)) 374 return nullptr; 375 376 MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg); 377 378 // Make sure the def is a MOV from memory. 379 // If the def is an another block, give up. 380 if (DefMI->getOpcode() != X86::MOV32rm || 381 DefMI->getParent() != FrameSetup->getParent()) 382 return nullptr; 383 384 // Be careful with movs that load from a stack slot, since it may get 385 // resolved incorrectly. 386 // TODO: Again, we already have the infrastructure, so this should work. 387 if (!DefMI->getOperand(1).isReg()) 388 return nullptr; 389 390 // Now, make sure everything else up until the ADJCALLSTACK is a sequence 391 // of MOVs. To be less conservative would require duplicating a lot of the 392 // logic from PeepholeOptimizer. 393 // FIXME: A possibly better approach would be to teach the PeepholeOptimizer 394 // to be smarter about folding into pushes. 395 for (auto I = DefMI; I != FrameSetup; ++I) 396 if (I->getOpcode() != X86::MOV32rm) 397 return nullptr; 398 399 return DefMI; 400 } 401