//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines a pass that optimizes call sequences on x86.
// Currently, it converts movs of function parameters onto the stack into
// pushes. This is beneficial for two main reasons:
// 1) The push instruction encoding is much smaller than an esp-relative mov
// 2) It is possible to push memory arguments directly. So, if the
//    transformation is performed pre-reg-alloc, it can help relieve
//    register pressure.
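//
// For example, a call sequence like this (an illustrative sketch; the callee
// name and values are made up):
//      movl $42, (%esp)
//      movl %eax, 4(%esp)
//      calll _foo
// becomes:
//      pushl %eax
//      pushl $42
//      calll _foo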
//
//===----------------------------------------------------------------------===//

#include <algorithm>

#include "X86.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"

using namespace llvm;

#define DEBUG_TYPE "x86-cf-opt"

static cl::opt<bool>
NoX86CFOpt("no-x86-call-frame-opt",
           cl::desc("Avoid optimizing x86 call frames for size"),
           cl::init(false), cl::Hidden);

namespace {
class X86CallFrameOptimization : public MachineFunctionPass {
public:
  X86CallFrameOptimization() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

private:
  bool shouldPerformTransformation(MachineFunction &MF);

  bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator I);

  MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
                                   unsigned Reg);

  const char *getPassName() const override {
    return "X86 Optimize Call Frame";
  }

  const TargetInstrInfo *TII;
  const TargetFrameLowering *TFL;
  const MachineRegisterInfo *MRI;
  static char ID;
};

char X86CallFrameOptimization::ID = 0;
} // end anonymous namespace

FunctionPass *llvm::createX86CallFrameOptimization() {
  return new X86CallFrameOptimization();
}

// This checks whether the transformation is legal and profitable.
bool X86CallFrameOptimization::shouldPerformTransformation(
    MachineFunction &MF) {
  if (NoX86CFOpt.getValue())
    return false;

  // We currently only support call sequences where *all* parameters
  // are passed on the stack.
  // No point in running this in 64-bit mode, since some arguments are
  // passed in-register in all common calling conventions, so the pattern
  // we're looking for will never match.
  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
  if (STI.is64Bit())
    return false;

  // You would expect straight-line code between call-frame setup and
  // call-frame destroy. You would be wrong. There are circumstances (e.g.
  // CMOV_GR8 expansion of a select that feeds a function call!) where we can
  // end up with the setup and the destroy in different basic blocks.
  // This is bad, and breaks SP adjustment.
  // So, check that all of the frames in the function are closed inside
  // the same block, and, for good measure, that there are no nested frames.
  int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
  for (MachineBasicBlock &BB : MF) {
    bool InsideFrameSequence = false;
    for (MachineInstr &MI : BB) {
      if (MI.getOpcode() == FrameSetupOpcode) {
        if (InsideFrameSequence)
          return false;
        InsideFrameSequence = true;
      } else if (MI.getOpcode() == FrameDestroyOpcode) {
        if (!InsideFrameSequence)
          return false;
        InsideFrameSequence = false;
      }
    }

    if (InsideFrameSequence)
      return false;
  }

  // Now that we know the transformation is legal, check if it is
  // profitable.
  // TODO: Add a heuristic that actually looks at the function,
  //       and enable this for more cases.

  // This transformation is always a win when we expected to have
  // a reserved call frame. Under other circumstances, it may be either
  // a win or a loss, and requires a heuristic.
  // For now, enable it only for the relatively clear win cases.
  bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
  if (CannotReserveFrame)
    return true;

  // For now, don't even try to evaluate the profitability when
  // not optimizing for size.
  AttributeSet FnAttrs = MF.getFunction()->getAttributes();
  bool OptForSize =
      FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
                           Attribute::OptimizeForSize) ||
      FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);

  if (!OptForSize)
    return false;

  // Stack re-alignment can make this unprofitable even in terms of size.
  // As mentioned above, a better heuristic is needed. For now, don't do this
  // when the required alignment is above 8. (4 would be the safe choice, but
  // some experimentation showed 8 is generally good).
  if (TFL->getStackAlignment() > 8)
    return false;

  return true;
}

bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
  TII = MF.getSubtarget().getInstrInfo();
  TFL = MF.getSubtarget().getFrameLowering();
  MRI = &MF.getRegInfo();

  if (!shouldPerformTransformation(MF))
    return false;

  int FrameSetupOpcode = TII->getCallFrameSetupOpcode();

  bool Changed = false;

  for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
      if (I->getOpcode() == FrameSetupOpcode)
        Changed |= adjustCallSequence(MF, *BB, I);

  return Changed;
}

bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
                                                  MachineBasicBlock &MBB,
                                                  MachineBasicBlock::iterator I) {

  // Check that this particular call sequence is amenable to the
  // transformation.
  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  unsigned StackPtr = RegInfo.getStackRegister();
  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();

  // We expect to enter this at the beginning of a call sequence.
  assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
  MachineBasicBlock::iterator FrameSetup = I++;
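  // At this point a transformable sequence typically looks roughly like the
  // following (a sketch only; virtual register numbers, operand printing and
  // the callee are illustrative):
  //   ADJCALLSTACKDOWN32 8, 0                        <-- FrameSetup
  //   %vreg1<def> = COPY %ESP
  //   MOV32mi %vreg1, 1, %noreg, 4, %noreg, 42       ; movl $42, 4(%esp)
  //   MOV32mr %vreg1, 1, %noreg, 0, %noreg, %vreg2   ; movl %vreg2, (%esp)
  //   CALLpcrel32 <ga:@foo>
  //   ADJCALLSTACKUP32 8, 0                          <-- FrameDestroy
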
  // For globals in PIC mode, we can have some LEAs here.
  // Ignore them, they don't bother us.
  // TODO: Extend this to something that covers more cases.
  while (I->getOpcode() == X86::LEA32r)
    ++I;

  // We expect a copy instruction here.
  // TODO: The copy instruction is a lowering artifact.
  //       We should also support a copy-less version, where the stack
  //       pointer is used directly.
  if (!I->isCopy() || !I->getOperand(0).isReg())
    return false;
  MachineBasicBlock::iterator SPCopy = I++;
  StackPtr = SPCopy->getOperand(0).getReg();

  // Scan the call setup sequence for the pattern we're looking for.
  // We only handle a simple case - a sequence of MOV32mi or MOV32mr
  // instructions that push a sequence of 32-bit values onto the stack, with
  // no gaps between them.
  SmallVector<MachineInstr *, 4> MovVector(4, nullptr);
  unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
  if (MaxAdjust > 4)
    MovVector.resize(MaxAdjust, nullptr);

  do {
    int Opcode = I->getOpcode();
    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
      break;

    // We only want movs of the form:
    //   movl imm/r32, k(%esp)
    // If we run into something else, bail.
    // Note that AddrBaseReg may, counter to its name, not be a register,
    // but rather a frame index.
    // TODO: Support the fi case. This should probably work now that we
    //       have the infrastructure to track the stack pointer within a call
    //       sequence.
    if (!I->getOperand(X86::AddrBaseReg).isReg() ||
        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
        !I->getOperand(X86::AddrScaleAmt).isImm() ||
        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
        !I->getOperand(X86::AddrDisp).isImm())
      return false;

    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
    assert(StackDisp >= 0 &&
           "Negative stack displacement when passing parameters");

    // We really don't want to consider the unaligned case.
    if (StackDisp % 4)
      return false;
    StackDisp /= 4;

    assert((size_t)StackDisp < MovVector.size() &&
           "Function call has more parameters than the stack is adjusted for.");

    // If the same stack slot is being filled twice, something's fishy.
    if (MovVector[StackDisp] != nullptr)
      return false;
    MovVector[StackDisp] = I;

    ++I;
  } while (I != MBB.end());

  // We now expect the end of the sequence - a call and a stack adjust.
  if (I == MBB.end())
    return false;

  // For PCrel calls, we expect an additional COPY of the basereg.
  // If we find one, skip it.
  if (I->isCopy()) {
    if (I->getOperand(1).getReg() ==
        MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg())
      ++I;
    else
      return false;
  }

  if (!I->isCall())
    return false;
  MachineBasicBlock::iterator Call = I;
  if ((++I)->getOpcode() != FrameDestroyOpcode)
    return false;

  // Now, go through the vector, and see that we don't have any gaps,
  // but only a series of 32-bit MOVs.
  int64_t ExpectedDist = 0;
  auto MMI = MovVector.begin(), MME = MovVector.end();
  for (; MMI != MME; ++MMI, ExpectedDist += 4)
    if (*MMI == nullptr)
      break;

  // If the call had no parameters, do nothing.
  if (!ExpectedDist)
    return false;

  // We are either at the last parameter, or a gap.
  // Make sure it's not a gap.
  for (; MMI != MME; ++MMI)
    if (*MMI != nullptr)
      return false;

  // Ok, we can in fact do the transformation for this call.
  // Do not remove the FrameSetup instruction, but adjust the parameters.
  // PEI will end up finalizing the handling of this.
  FrameSetup->getOperand(1).setImm(ExpectedDist);

  DebugLoc DL = I->getDebugLoc();
  // Now, iterate through the vector in reverse order, and replace the movs
  // with pushes. MOVmi/MOVmr don't have any defs, so no need to
  // replace uses.
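  // The mov filling the highest stack slot must become the first push: each
  // push stores its value 4 bytes below the previous one, so walking
  // MovVector from the top down recreates the layout the movs produced.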
  for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
    MachineBasicBlock::iterator MOV = *MovVector[Idx];
    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
    if (MOV->getOpcode() == X86::MOV32mi) {
      unsigned PushOpcode = X86::PUSHi32;
      // If the operand is a small (8-bit) immediate, we can use a
      // PUSH instruction with a shorter encoding.
      // Note that isImm() may fail even though this is a MOVmi, because
      // the operand can also be a symbol.
      if (PushOp.isImm()) {
        int64_t Val = PushOp.getImm();
        if (isInt<8>(Val))
          PushOpcode = X86::PUSH32i8;
      }
      BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp);
    } else {
      unsigned int Reg = PushOp.getReg();

      // If PUSHrmm is not slow on this target, try to fold the source of the
      // push into the instruction.
      const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
      bool SlowPUSHrmm = ST.isAtom() || ST.isSLM();

      // Check that this is legal to fold. Right now, we're extremely
      // conservative about that.
      MachineInstr *DefMov = nullptr;
      if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
        MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm));

        unsigned NumOps = DefMov->getDesc().getNumOperands();
        for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
          Push->addOperand(DefMov->getOperand(i));

        DefMov->eraseFromParent();
      } else {
        BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg);
      }
    }

    MBB.erase(MOV);
  }

  // The stack-pointer copy is no longer used in the call sequence.
  // There should not be any other users, but we can't commit to that, so:
  if (MRI->use_empty(SPCopy->getOperand(0).getReg()))
    SPCopy->eraseFromParent();

  // Once we've done this, we need to make sure PEI doesn't assume a reserved
  // frame.
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  FuncInfo->setHasPushSequences(true);

  return true;
}

MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
    MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
  // Do an extremely restricted form of load folding.
  // ISel will often create patterns like:
  //   movl 4(%edi), %eax
  //   movl 8(%edi), %ecx
  //   movl 12(%edi), %edx
  //   movl %edx, 8(%esp)
  //   movl %ecx, 4(%esp)
  //   movl %eax, (%esp)
  //   call
  // Get rid of those with prejudice.
  if (!TargetRegisterInfo::isVirtualRegister(Reg))
    return nullptr;

  // Make sure this is the only use of Reg.
  if (!MRI->hasOneNonDBGUse(Reg))
    return nullptr;

  MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg);

  // Make sure the def is a MOV from memory.
  // If the def is in another block, give up.
  if (DefMI->getOpcode() != X86::MOV32rm ||
      DefMI->getParent() != FrameSetup->getParent())
    return nullptr;

  // Be careful with movs that load from a stack slot, since they may get
  // resolved incorrectly.
  // TODO: Again, we already have the infrastructure, so this should work.
  if (!DefMI->getOperand(1).isReg())
    return nullptr;

  // Now, make sure everything else up until the ADJCALLSTACK is a sequence
  // of MOVs. To be less conservative would require duplicating a lot of the
  // logic from PeepholeOptimizer.
  // FIXME: A possibly better approach would be to teach the PeepholeOptimizer
  //        to be smarter about folding into pushes.
  for (auto I = DefMI; I != FrameSetup; ++I)
    if (I->getOpcode() != X86::MOV32rm)
      return nullptr;

  return DefMI;
}