1 //===-- Thumb2SizeReduction.cpp - Thumb2 code size reduction pass -*- C++ -*-=// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "ARM.h" 11 #include "ARMBaseInstrInfo.h" 12 #include "ARMSubtarget.h" 13 #include "MCTargetDesc/ARMAddressingModes.h" 14 #include "Thumb2InstrInfo.h" 15 #include "llvm/ADT/DenseMap.h" 16 #include "llvm/ADT/PostOrderIterator.h" 17 #include "llvm/ADT/Statistic.h" 18 #include "llvm/CodeGen/MachineFunctionPass.h" 19 #include "llvm/CodeGen/MachineInstr.h" 20 #include "llvm/CodeGen/MachineInstrBuilder.h" 21 #include "llvm/IR/Function.h" // To access Function attributes 22 #include "llvm/Support/CommandLine.h" 23 #include "llvm/Support/Debug.h" 24 #include "llvm/Support/raw_ostream.h" 25 #include "llvm/Target/TargetMachine.h" 26 using namespace llvm; 27 28 #define DEBUG_TYPE "t2-reduce-size" 29 30 STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones"); 31 STATISTIC(Num2Addrs, "Number of 32-bit instrs reduced to 2addr 16-bit ones"); 32 STATISTIC(NumLdSts, "Number of 32-bit load / store reduced to 16-bit ones"); 33 34 static cl::opt<int> ReduceLimit("t2-reduce-limit", 35 cl::init(-1), cl::Hidden); 36 static cl::opt<int> ReduceLimit2Addr("t2-reduce-limit2", 37 cl::init(-1), cl::Hidden); 38 static cl::opt<int> ReduceLimitLdSt("t2-reduce-limit3", 39 cl::init(-1), cl::Hidden); 40 41 namespace { 42 /// ReduceTable - A static table with information on mapping from wide 43 /// opcodes to narrow 44 struct ReduceEntry { 45 uint16_t WideOpc; // Wide opcode 46 uint16_t NarrowOpc1; // Narrow opcode to transform to 47 uint16_t NarrowOpc2; // Narrow opcode when it's two-address 48 uint8_t Imm1Limit; // Limit of immediate field (bits) 49 uint8_t Imm2Limit; // Limit of immediate field when it's two-address 50 unsigned LowRegs1 : 1; // Only possible if low-registers are used 51 unsigned LowRegs2 : 1; // Only possible if low-registers are used (2addr) 52 unsigned PredCC1 : 2; // 0 - If predicated, cc is on and vice versa. 53 // 1 - No cc field. 54 // 2 - Always set CPSR. 55 unsigned PredCC2 : 2; 56 unsigned PartFlag : 1; // 16-bit instruction does partial flag update 57 unsigned Special : 1; // Needs to be dealt with specially 58 unsigned AvoidMovs: 1; // Avoid movs with shifter operand (for Swift) 59 }; 60 61 static const ReduceEntry ReduceTable[] = { 62 // Wide, Narrow1, Narrow2, imm1,imm2, lo1, lo2, P/C,PF,S,AM 63 { ARM::t2ADCrr, 0, ARM::tADC, 0, 0, 0, 1, 0,0, 0,0,0 }, 64 { ARM::t2ADDri, ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 0,0, 0,1,0 }, 65 { ARM::t2ADDrr, ARM::tADDrr, ARM::tADDhirr, 0, 0, 1, 0, 0,1, 0,0,0 }, 66 { ARM::t2ADDSri,ARM::tADDi3, ARM::tADDi8, 3, 8, 1, 1, 2,2, 0,1,0 }, 67 { ARM::t2ADDSrr,ARM::tADDrr, 0, 0, 0, 1, 0, 2,0, 0,1,0 }, 68 { ARM::t2ANDrr, 0, ARM::tAND, 0, 0, 0, 1, 0,0, 1,0,0 }, 69 { ARM::t2ASRri, ARM::tASRri, 0, 5, 0, 1, 0, 0,0, 1,0,1 }, 70 { ARM::t2ASRrr, 0, ARM::tASRrr, 0, 0, 0, 1, 0,0, 1,0,1 }, 71 { ARM::t2BICrr, 0, ARM::tBIC, 0, 0, 0, 1, 0,0, 1,0,0 }, 72 //FIXME: Disable CMN, as CCodes are backwards from compare expectations 73 //{ ARM::t2CMNrr, ARM::tCMN, 0, 0, 0, 1, 0, 2,0, 0,0,0 }, 74 { ARM::t2CMNzrr, ARM::tCMNz, 0, 0, 0, 1, 0, 2,0, 0,0,0 }, 75 { ARM::t2CMPri, ARM::tCMPi8, 0, 8, 0, 1, 0, 2,0, 0,0,0 }, 76 { ARM::t2CMPrr, ARM::tCMPhir, 0, 0, 0, 0, 0, 2,0, 0,1,0 }, 77 { ARM::t2EORrr, 0, ARM::tEOR, 0, 0, 0, 1, 0,0, 1,0,0 }, 78 // FIXME: adr.n immediate offset must be multiple of 4. 79 //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0, 0, 0, 1, 0, 1,0, 0,0,0 }, 80 { ARM::t2LSLri, ARM::tLSLri, 0, 5, 0, 1, 0, 0,0, 1,0,1 }, 81 { ARM::t2LSLrr, 0, ARM::tLSLrr, 0, 0, 0, 1, 0,0, 1,0,1 }, 82 { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 1,0,1 }, 83 { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 1,0,1 }, 84 { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1,0,0 }, 85 { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1,1,0 }, 86 // FIXME: Do we need the 16-bit 'S' variant? 87 { ARM::t2MOVr,ARM::tMOVr, 0, 0, 0, 0, 0, 1,0, 0,0,0 }, 88 { ARM::t2MUL, 0, ARM::tMUL, 0, 0, 0, 1, 0,0, 1,0,0 }, 89 { ARM::t2MVNr, ARM::tMVN, 0, 0, 0, 1, 0, 0,0, 0,0,0 }, 90 { ARM::t2ORRrr, 0, ARM::tORR, 0, 0, 0, 1, 0,0, 1,0,0 }, 91 { ARM::t2REV, ARM::tREV, 0, 0, 0, 1, 0, 1,0, 0,0,0 }, 92 { ARM::t2REV16, ARM::tREV16, 0, 0, 0, 1, 0, 1,0, 0,0,0 }, 93 { ARM::t2REVSH, ARM::tREVSH, 0, 0, 0, 1, 0, 1,0, 0,0,0 }, 94 { ARM::t2RORrr, 0, ARM::tROR, 0, 0, 0, 1, 0,0, 1,0,0 }, 95 { ARM::t2RSBri, ARM::tRSB, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, 96 { ARM::t2RSBSri,ARM::tRSB, 0, 0, 0, 1, 0, 2,0, 0,1,0 }, 97 { ARM::t2SBCrr, 0, ARM::tSBC, 0, 0, 0, 1, 0,0, 0,0,0 }, 98 { ARM::t2SUBri, ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 0,0, 0,0,0 }, 99 { ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0,0,0 }, 100 { ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0,0,0 }, 101 { ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0,0,0 }, 102 { ARM::t2SXTB, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0,1,0 }, 103 { ARM::t2SXTH, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0,1,0 }, 104 { ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0,0,0 }, 105 { ARM::t2UXTB, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0,1,0 }, 106 { ARM::t2UXTH, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0,1,0 }, 107 108 // FIXME: Clean this up after splitting each Thumb load / store opcode 109 // into multiple ones. 110 { ARM::t2LDRi12,ARM::tLDRi, ARM::tLDRspi, 5, 8, 1, 0, 0,0, 0,1,0 }, 111 { ARM::t2LDRs, ARM::tLDRr, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, 112 { ARM::t2LDRBi12,ARM::tLDRBi, 0, 5, 0, 1, 0, 0,0, 0,1,0 }, 113 { ARM::t2LDRBs, ARM::tLDRBr, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, 114 { ARM::t2LDRHi12,ARM::tLDRHi, 0, 5, 0, 1, 0, 0,0, 0,1,0 }, 115 { ARM::t2LDRHs, ARM::tLDRHr, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, 116 { ARM::t2LDRSBs,ARM::tLDRSB, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, 117 { ARM::t2LDRSHs,ARM::tLDRSH, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, 118 { ARM::t2STRi12,ARM::tSTRi, ARM::tSTRspi, 5, 8, 1, 0, 0,0, 0,1,0 }, 119 { ARM::t2STRs, ARM::tSTRr, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, 120 { ARM::t2STRBi12,ARM::tSTRBi, 0, 5, 0, 1, 0, 0,0, 0,1,0 }, 121 { ARM::t2STRBs, ARM::tSTRBr, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, 122 { ARM::t2STRHi12,ARM::tSTRHi, 0, 5, 0, 1, 0, 0,0, 0,1,0 }, 123 { ARM::t2STRHs, ARM::tSTRHr, 0, 0, 0, 1, 0, 0,0, 0,1,0 }, 124 125 { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, 126 { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 }, 127 { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1,0 }, 128 // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent 129 { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, 130 { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1,0 } 131 }; 132 133 class Thumb2SizeReduce : public MachineFunctionPass { 134 public: 135 static char ID; 136 Thumb2SizeReduce(); 137 138 const Thumb2InstrInfo *TII; 139 const ARMSubtarget *STI; 140 141 bool runOnMachineFunction(MachineFunction &MF) override; 142 143 const char *getPassName() const override { 144 return "Thumb2 instruction size reduction pass"; 145 } 146 147 private: 148 /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable. 149 DenseMap<unsigned, unsigned> ReduceOpcodeMap; 150 151 bool canAddPseudoFlagDep(MachineInstr *Use, bool IsSelfLoop); 152 153 bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, 154 bool is2Addr, ARMCC::CondCodes Pred, 155 bool LiveCPSR, bool &HasCC, bool &CCDead); 156 157 bool ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, 158 const ReduceEntry &Entry); 159 160 bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, 161 const ReduceEntry &Entry, bool LiveCPSR, bool IsSelfLoop); 162 163 /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address 164 /// instruction. 165 bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, 166 const ReduceEntry &Entry, bool LiveCPSR, 167 bool IsSelfLoop); 168 169 /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit 170 /// non-two-address instruction. 171 bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, 172 const ReduceEntry &Entry, bool LiveCPSR, 173 bool IsSelfLoop); 174 175 /// ReduceMI - Attempt to reduce MI, return true on success. 176 bool ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI, 177 bool LiveCPSR, bool IsSelfLoop); 178 179 /// ReduceMBB - Reduce width of instructions in the specified basic block. 180 bool ReduceMBB(MachineBasicBlock &MBB); 181 182 bool OptimizeSize; 183 bool MinimizeSize; 184 185 // Last instruction to define CPSR in the current block. 186 MachineInstr *CPSRDef; 187 // Was CPSR last defined by a high latency instruction? 188 // When CPSRDef is null, this refers to CPSR defs in predecessors. 189 bool HighLatencyCPSR; 190 191 struct MBBInfo { 192 // The flags leaving this block have high latency. 193 bool HighLatencyCPSR; 194 // Has this block been visited yet? 195 bool Visited; 196 197 MBBInfo() : HighLatencyCPSR(false), Visited(false) {} 198 }; 199 200 SmallVector<MBBInfo, 8> BlockInfo; 201 }; 202 char Thumb2SizeReduce::ID = 0; 203 } 204 205 Thumb2SizeReduce::Thumb2SizeReduce() : MachineFunctionPass(ID) { 206 OptimizeSize = MinimizeSize = false; 207 for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) { 208 unsigned FromOpc = ReduceTable[i].WideOpc; 209 if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second) 210 assert(false && "Duplicated entries?"); 211 } 212 } 213 214 static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) { 215 for (const uint16_t *Regs = MCID.getImplicitDefs(); *Regs; ++Regs) 216 if (*Regs == ARM::CPSR) 217 return true; 218 return false; 219 } 220 221 // Check for a likely high-latency flag def. 222 static bool isHighLatencyCPSR(MachineInstr *Def) { 223 switch(Def->getOpcode()) { 224 case ARM::FMSTAT: 225 case ARM::tMUL: 226 return true; 227 } 228 return false; 229 } 230 231 /// canAddPseudoFlagDep - For A9 (and other out-of-order) implementations, 232 /// the 's' 16-bit instruction partially update CPSR. Abort the 233 /// transformation to avoid adding false dependency on last CPSR setting 234 /// instruction which hurts the ability for out-of-order execution engine 235 /// to do register renaming magic. 236 /// This function checks if there is a read-of-write dependency between the 237 /// last instruction that defines the CPSR and the current instruction. If there 238 /// is, then there is no harm done since the instruction cannot be retired 239 /// before the CPSR setting instruction anyway. 240 /// Note, we are not doing full dependency analysis here for the sake of compile 241 /// time. We're not looking for cases like: 242 /// r0 = muls ... 243 /// r1 = add.w r0, ... 244 /// ... 245 /// = mul.w r1 246 /// In this case it would have been ok to narrow the mul.w to muls since there 247 /// are indirect RAW dependency between the muls and the mul.w 248 bool 249 Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) { 250 // Disable the check for -Oz (aka OptimizeForSizeHarder). 251 if (MinimizeSize || !STI->avoidCPSRPartialUpdate()) 252 return false; 253 254 if (!CPSRDef) 255 // If this BB loops back to itself, conservatively avoid narrowing the 256 // first instruction that does partial flag update. 257 return HighLatencyCPSR || FirstInSelfLoop; 258 259 SmallSet<unsigned, 2> Defs; 260 for (const MachineOperand &MO : CPSRDef->operands()) { 261 if (!MO.isReg() || MO.isUndef() || MO.isUse()) 262 continue; 263 unsigned Reg = MO.getReg(); 264 if (Reg == 0 || Reg == ARM::CPSR) 265 continue; 266 Defs.insert(Reg); 267 } 268 269 for (const MachineOperand &MO : Use->operands()) { 270 if (!MO.isReg() || MO.isUndef() || MO.isDef()) 271 continue; 272 unsigned Reg = MO.getReg(); 273 if (Defs.count(Reg)) 274 return false; 275 } 276 277 // If the current CPSR has high latency, try to avoid the false dependency. 278 if (HighLatencyCPSR) 279 return true; 280 281 // tMOVi8 usually doesn't start long dependency chains, and there are a lot 282 // of them, so always shrink them when CPSR doesn't have high latency. 283 if (Use->getOpcode() == ARM::t2MOVi || 284 Use->getOpcode() == ARM::t2MOVi16) 285 return false; 286 287 // No read-after-write dependency. The narrowing will add false dependency. 288 return true; 289 } 290 291 bool 292 Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry, 293 bool is2Addr, ARMCC::CondCodes Pred, 294 bool LiveCPSR, bool &HasCC, bool &CCDead) { 295 if ((is2Addr && Entry.PredCC2 == 0) || 296 (!is2Addr && Entry.PredCC1 == 0)) { 297 if (Pred == ARMCC::AL) { 298 // Not predicated, must set CPSR. 299 if (!HasCC) { 300 // Original instruction was not setting CPSR, but CPSR is not 301 // currently live anyway. It's ok to set it. The CPSR def is 302 // dead though. 303 if (!LiveCPSR) { 304 HasCC = true; 305 CCDead = true; 306 return true; 307 } 308 return false; 309 } 310 } else { 311 // Predicated, must not set CPSR. 312 if (HasCC) 313 return false; 314 } 315 } else if ((is2Addr && Entry.PredCC2 == 2) || 316 (!is2Addr && Entry.PredCC1 == 2)) { 317 /// Old opcode has an optional def of CPSR. 318 if (HasCC) 319 return true; 320 // If old opcode does not implicitly define CPSR, then it's not ok since 321 // these new opcodes' CPSR def is not meant to be thrown away. e.g. CMP. 322 if (!HasImplicitCPSRDef(MI->getDesc())) 323 return false; 324 HasCC = true; 325 } else { 326 // 16-bit instruction does not set CPSR. 327 if (HasCC) 328 return false; 329 } 330 331 return true; 332 } 333 334 static bool VerifyLowRegs(MachineInstr *MI) { 335 unsigned Opc = MI->getOpcode(); 336 bool isPCOk = (Opc == ARM::t2LDMIA_RET || Opc == ARM::t2LDMIA || 337 Opc == ARM::t2LDMDB || Opc == ARM::t2LDMIA_UPD || 338 Opc == ARM::t2LDMDB_UPD); 339 bool isLROk = (Opc == ARM::t2STMDB_UPD); 340 bool isSPOk = isPCOk || isLROk; 341 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { 342 const MachineOperand &MO = MI->getOperand(i); 343 if (!MO.isReg() || MO.isImplicit()) 344 continue; 345 unsigned Reg = MO.getReg(); 346 if (Reg == 0 || Reg == ARM::CPSR) 347 continue; 348 if (isPCOk && Reg == ARM::PC) 349 continue; 350 if (isLROk && Reg == ARM::LR) 351 continue; 352 if (Reg == ARM::SP) { 353 if (isSPOk) 354 continue; 355 if (i == 1 && (Opc == ARM::t2LDRi12 || Opc == ARM::t2STRi12)) 356 // Special case for these ldr / str with sp as base register. 357 continue; 358 } 359 if (!isARMLowRegister(Reg)) 360 return false; 361 } 362 return true; 363 } 364 365 bool 366 Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, 367 const ReduceEntry &Entry) { 368 if (ReduceLimitLdSt != -1 && ((int)NumLdSts >= ReduceLimitLdSt)) 369 return false; 370 371 unsigned Scale = 1; 372 bool HasImmOffset = false; 373 bool HasShift = false; 374 bool HasOffReg = true; 375 bool isLdStMul = false; 376 unsigned Opc = Entry.NarrowOpc1; 377 unsigned OpNum = 3; // First 'rest' of operands. 378 uint8_t ImmLimit = Entry.Imm1Limit; 379 380 switch (Entry.WideOpc) { 381 default: 382 llvm_unreachable("Unexpected Thumb2 load / store opcode!"); 383 case ARM::t2LDRi12: 384 case ARM::t2STRi12: 385 if (MI->getOperand(1).getReg() == ARM::SP) { 386 Opc = Entry.NarrowOpc2; 387 ImmLimit = Entry.Imm2Limit; 388 } 389 390 Scale = 4; 391 HasImmOffset = true; 392 HasOffReg = false; 393 break; 394 case ARM::t2LDRBi12: 395 case ARM::t2STRBi12: 396 HasImmOffset = true; 397 HasOffReg = false; 398 break; 399 case ARM::t2LDRHi12: 400 case ARM::t2STRHi12: 401 Scale = 2; 402 HasImmOffset = true; 403 HasOffReg = false; 404 break; 405 case ARM::t2LDRs: 406 case ARM::t2LDRBs: 407 case ARM::t2LDRHs: 408 case ARM::t2LDRSBs: 409 case ARM::t2LDRSHs: 410 case ARM::t2STRs: 411 case ARM::t2STRBs: 412 case ARM::t2STRHs: 413 HasShift = true; 414 OpNum = 4; 415 break; 416 case ARM::t2LDMIA: 417 case ARM::t2LDMDB: { 418 unsigned BaseReg = MI->getOperand(0).getReg(); 419 if (!isARMLowRegister(BaseReg) || Entry.WideOpc != ARM::t2LDMIA) 420 return false; 421 422 // For the non-writeback version (this one), the base register must be 423 // one of the registers being loaded. 424 bool isOK = false; 425 for (unsigned i = 4; i < MI->getNumOperands(); ++i) { 426 if (MI->getOperand(i).getReg() == BaseReg) { 427 isOK = true; 428 break; 429 } 430 } 431 432 if (!isOK) 433 return false; 434 435 OpNum = 0; 436 isLdStMul = true; 437 break; 438 } 439 case ARM::t2LDMIA_RET: { 440 unsigned BaseReg = MI->getOperand(1).getReg(); 441 if (BaseReg != ARM::SP) 442 return false; 443 Opc = Entry.NarrowOpc2; // tPOP_RET 444 OpNum = 2; 445 isLdStMul = true; 446 break; 447 } 448 case ARM::t2LDMIA_UPD: 449 case ARM::t2LDMDB_UPD: 450 case ARM::t2STMIA_UPD: 451 case ARM::t2STMDB_UPD: { 452 OpNum = 0; 453 454 unsigned BaseReg = MI->getOperand(1).getReg(); 455 if (BaseReg == ARM::SP && 456 (Entry.WideOpc == ARM::t2LDMIA_UPD || 457 Entry.WideOpc == ARM::t2STMDB_UPD)) { 458 Opc = Entry.NarrowOpc2; // tPOP or tPUSH 459 OpNum = 2; 460 } else if (!isARMLowRegister(BaseReg) || 461 (Entry.WideOpc != ARM::t2LDMIA_UPD && 462 Entry.WideOpc != ARM::t2STMIA_UPD)) { 463 return false; 464 } 465 466 isLdStMul = true; 467 break; 468 } 469 } 470 471 unsigned OffsetReg = 0; 472 bool OffsetKill = false; 473 bool OffsetInternal = false; 474 if (HasShift) { 475 OffsetReg = MI->getOperand(2).getReg(); 476 OffsetKill = MI->getOperand(2).isKill(); 477 OffsetInternal = MI->getOperand(2).isInternalRead(); 478 479 if (MI->getOperand(3).getImm()) 480 // Thumb1 addressing mode doesn't support shift. 481 return false; 482 } 483 484 unsigned OffsetImm = 0; 485 if (HasImmOffset) { 486 OffsetImm = MI->getOperand(2).getImm(); 487 unsigned MaxOffset = ((1 << ImmLimit) - 1) * Scale; 488 489 if ((OffsetImm & (Scale - 1)) || OffsetImm > MaxOffset) 490 // Make sure the immediate field fits. 491 return false; 492 } 493 494 // Add the 16-bit load / store instruction. 495 DebugLoc dl = MI->getDebugLoc(); 496 MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc)); 497 if (!isLdStMul) { 498 MIB.addOperand(MI->getOperand(0)); 499 MIB.addOperand(MI->getOperand(1)); 500 501 if (HasImmOffset) 502 MIB.addImm(OffsetImm / Scale); 503 504 assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!"); 505 506 if (HasOffReg) 507 MIB.addReg(OffsetReg, getKillRegState(OffsetKill) | 508 getInternalReadRegState(OffsetInternal)); 509 } 510 511 // Transfer the rest of operands. 512 for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum) 513 MIB.addOperand(MI->getOperand(OpNum)); 514 515 // Transfer memoperands. 516 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); 517 518 // Transfer MI flags. 519 MIB.setMIFlags(MI->getFlags()); 520 521 DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); 522 523 MBB.erase_instr(MI); 524 ++NumLdSts; 525 return true; 526 } 527 528 bool 529 Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, 530 const ReduceEntry &Entry, 531 bool LiveCPSR, bool IsSelfLoop) { 532 unsigned Opc = MI->getOpcode(); 533 if (Opc == ARM::t2ADDri) { 534 // If the source register is SP, try to reduce to tADDrSPi, otherwise 535 // it's a normal reduce. 536 if (MI->getOperand(1).getReg() != ARM::SP) { 537 if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop)) 538 return true; 539 return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); 540 } 541 // Try to reduce to tADDrSPi. 542 unsigned Imm = MI->getOperand(2).getImm(); 543 // The immediate must be in range, the destination register must be a low 544 // reg, the predicate must be "always" and the condition flags must not 545 // be being set. 546 if (Imm & 3 || Imm > 1020) 547 return false; 548 if (!isARMLowRegister(MI->getOperand(0).getReg())) 549 return false; 550 if (MI->getOperand(3).getImm() != ARMCC::AL) 551 return false; 552 const MCInstrDesc &MCID = MI->getDesc(); 553 if (MCID.hasOptionalDef() && 554 MI->getOperand(MCID.getNumOperands()-1).getReg() == ARM::CPSR) 555 return false; 556 557 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), 558 TII->get(ARM::tADDrSPi)) 559 .addOperand(MI->getOperand(0)) 560 .addOperand(MI->getOperand(1)) 561 .addImm(Imm / 4); // The tADDrSPi has an implied scale by four. 562 AddDefaultPred(MIB); 563 564 // Transfer MI flags. 565 MIB.setMIFlags(MI->getFlags()); 566 567 DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " <<*MIB); 568 569 MBB.erase_instr(MI); 570 ++NumNarrows; 571 return true; 572 } 573 574 if (Entry.LowRegs1 && !VerifyLowRegs(MI)) 575 return false; 576 577 if (MI->mayLoad() || MI->mayStore()) 578 return ReduceLoadStore(MBB, MI, Entry); 579 580 switch (Opc) { 581 default: break; 582 case ARM::t2ADDSri: 583 case ARM::t2ADDSrr: { 584 unsigned PredReg = 0; 585 if (getInstrPredicate(MI, PredReg) == ARMCC::AL) { 586 switch (Opc) { 587 default: break; 588 case ARM::t2ADDSri: { 589 if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop)) 590 return true; 591 // fallthrough 592 } 593 case ARM::t2ADDSrr: 594 return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); 595 } 596 } 597 break; 598 } 599 case ARM::t2RSBri: 600 case ARM::t2RSBSri: 601 case ARM::t2SXTB: 602 case ARM::t2SXTH: 603 case ARM::t2UXTB: 604 case ARM::t2UXTH: 605 if (MI->getOperand(2).getImm() == 0) 606 return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); 607 break; 608 case ARM::t2MOVi16: 609 // Can convert only 'pure' immediate operands, not immediates obtained as 610 // globals' addresses. 611 if (MI->getOperand(1).isImm()) 612 return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); 613 break; 614 case ARM::t2CMPrr: { 615 // Try to reduce to the lo-reg only version first. Why there are two 616 // versions of the instruction is a mystery. 617 // It would be nice to just have two entries in the master table that 618 // are prioritized, but the table assumes a unique entry for each 619 // source insn opcode. So for now, we hack a local entry record to use. 620 static const ReduceEntry NarrowEntry = 621 { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1,0 }; 622 if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, IsSelfLoop)) 623 return true; 624 return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); 625 } 626 } 627 return false; 628 } 629 630 bool 631 Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, 632 const ReduceEntry &Entry, 633 bool LiveCPSR, bool IsSelfLoop) { 634 635 if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr)) 636 return false; 637 638 if (!MinimizeSize && !OptimizeSize && Entry.AvoidMovs && 639 STI->avoidMOVsShifterOperand()) 640 // Don't issue movs with shifter operand for some CPUs unless we 641 // are optimizing / minimizing for size. 642 return false; 643 644 unsigned Reg0 = MI->getOperand(0).getReg(); 645 unsigned Reg1 = MI->getOperand(1).getReg(); 646 // t2MUL is "special". The tied source operand is second, not first. 647 if (MI->getOpcode() == ARM::t2MUL) { 648 unsigned Reg2 = MI->getOperand(2).getReg(); 649 // Early exit if the regs aren't all low regs. 650 if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1) 651 || !isARMLowRegister(Reg2)) 652 return false; 653 if (Reg0 != Reg2) { 654 // If the other operand also isn't the same as the destination, we 655 // can't reduce. 656 if (Reg1 != Reg0) 657 return false; 658 // Try to commute the operands to make it a 2-address instruction. 659 MachineInstr *CommutedMI = TII->commuteInstruction(MI); 660 if (!CommutedMI) 661 return false; 662 } 663 } else if (Reg0 != Reg1) { 664 // Try to commute the operands to make it a 2-address instruction. 665 unsigned CommOpIdx1, CommOpIdx2; 666 if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) || 667 CommOpIdx1 != 1 || MI->getOperand(CommOpIdx2).getReg() != Reg0) 668 return false; 669 MachineInstr *CommutedMI = TII->commuteInstruction(MI); 670 if (!CommutedMI) 671 return false; 672 } 673 if (Entry.LowRegs2 && !isARMLowRegister(Reg0)) 674 return false; 675 if (Entry.Imm2Limit) { 676 unsigned Imm = MI->getOperand(2).getImm(); 677 unsigned Limit = (1 << Entry.Imm2Limit) - 1; 678 if (Imm > Limit) 679 return false; 680 } else { 681 unsigned Reg2 = MI->getOperand(2).getReg(); 682 if (Entry.LowRegs2 && !isARMLowRegister(Reg2)) 683 return false; 684 } 685 686 // Check if it's possible / necessary to transfer the predicate. 687 const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc2); 688 unsigned PredReg = 0; 689 ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); 690 bool SkipPred = false; 691 if (Pred != ARMCC::AL) { 692 if (!NewMCID.isPredicable()) 693 // Can't transfer predicate, fail. 694 return false; 695 } else { 696 SkipPred = !NewMCID.isPredicable(); 697 } 698 699 bool HasCC = false; 700 bool CCDead = false; 701 const MCInstrDesc &MCID = MI->getDesc(); 702 if (MCID.hasOptionalDef()) { 703 unsigned NumOps = MCID.getNumOperands(); 704 HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR); 705 if (HasCC && MI->getOperand(NumOps-1).isDead()) 706 CCDead = true; 707 } 708 if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead)) 709 return false; 710 711 // Avoid adding a false dependency on partial flag update by some 16-bit 712 // instructions which has the 's' bit set. 713 if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC && 714 canAddPseudoFlagDep(MI, IsSelfLoop)) 715 return false; 716 717 // Add the 16-bit instruction. 718 DebugLoc dl = MI->getDebugLoc(); 719 MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID); 720 MIB.addOperand(MI->getOperand(0)); 721 if (NewMCID.hasOptionalDef()) { 722 if (HasCC) 723 AddDefaultT1CC(MIB, CCDead); 724 else 725 AddNoT1CC(MIB); 726 } 727 728 // Transfer the rest of operands. 729 unsigned NumOps = MCID.getNumOperands(); 730 for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) { 731 if (i < NumOps && MCID.OpInfo[i].isOptionalDef()) 732 continue; 733 if (SkipPred && MCID.OpInfo[i].isPredicate()) 734 continue; 735 MIB.addOperand(MI->getOperand(i)); 736 } 737 738 // Transfer MI flags. 739 MIB.setMIFlags(MI->getFlags()); 740 741 DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); 742 743 MBB.erase_instr(MI); 744 ++Num2Addrs; 745 return true; 746 } 747 748 bool 749 Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, 750 const ReduceEntry &Entry, 751 bool LiveCPSR, bool IsSelfLoop) { 752 if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit)) 753 return false; 754 755 if (!MinimizeSize && !OptimizeSize && Entry.AvoidMovs && 756 STI->avoidMOVsShifterOperand()) 757 // Don't issue movs with shifter operand for some CPUs unless we 758 // are optimizing / minimizing for size. 759 return false; 760 761 unsigned Limit = ~0U; 762 if (Entry.Imm1Limit) 763 Limit = (1 << Entry.Imm1Limit) - 1; 764 765 const MCInstrDesc &MCID = MI->getDesc(); 766 for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) { 767 if (MCID.OpInfo[i].isPredicate()) 768 continue; 769 const MachineOperand &MO = MI->getOperand(i); 770 if (MO.isReg()) { 771 unsigned Reg = MO.getReg(); 772 if (!Reg || Reg == ARM::CPSR) 773 continue; 774 if (Entry.LowRegs1 && !isARMLowRegister(Reg)) 775 return false; 776 } else if (MO.isImm() && 777 !MCID.OpInfo[i].isPredicate()) { 778 if (((unsigned)MO.getImm()) > Limit) 779 return false; 780 } 781 } 782 783 // Check if it's possible / necessary to transfer the predicate. 784 const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc1); 785 unsigned PredReg = 0; 786 ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); 787 bool SkipPred = false; 788 if (Pred != ARMCC::AL) { 789 if (!NewMCID.isPredicable()) 790 // Can't transfer predicate, fail. 791 return false; 792 } else { 793 SkipPred = !NewMCID.isPredicable(); 794 } 795 796 bool HasCC = false; 797 bool CCDead = false; 798 if (MCID.hasOptionalDef()) { 799 unsigned NumOps = MCID.getNumOperands(); 800 HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR); 801 if (HasCC && MI->getOperand(NumOps-1).isDead()) 802 CCDead = true; 803 } 804 if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead)) 805 return false; 806 807 // Avoid adding a false dependency on partial flag update by some 16-bit 808 // instructions which has the 's' bit set. 809 if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC && 810 canAddPseudoFlagDep(MI, IsSelfLoop)) 811 return false; 812 813 // Add the 16-bit instruction. 814 DebugLoc dl = MI->getDebugLoc(); 815 MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID); 816 MIB.addOperand(MI->getOperand(0)); 817 if (NewMCID.hasOptionalDef()) { 818 if (HasCC) 819 AddDefaultT1CC(MIB, CCDead); 820 else 821 AddNoT1CC(MIB); 822 } 823 824 // Transfer the rest of operands. 825 unsigned NumOps = MCID.getNumOperands(); 826 for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) { 827 if (i < NumOps && MCID.OpInfo[i].isOptionalDef()) 828 continue; 829 if ((MCID.getOpcode() == ARM::t2RSBSri || 830 MCID.getOpcode() == ARM::t2RSBri || 831 MCID.getOpcode() == ARM::t2SXTB || 832 MCID.getOpcode() == ARM::t2SXTH || 833 MCID.getOpcode() == ARM::t2UXTB || 834 MCID.getOpcode() == ARM::t2UXTH) && i == 2) 835 // Skip the zero immediate operand, it's now implicit. 836 continue; 837 bool isPred = (i < NumOps && MCID.OpInfo[i].isPredicate()); 838 if (SkipPred && isPred) 839 continue; 840 const MachineOperand &MO = MI->getOperand(i); 841 if (MO.isReg() && MO.isImplicit() && MO.getReg() == ARM::CPSR) 842 // Skip implicit def of CPSR. Either it's modeled as an optional 843 // def now or it's already an implicit def on the new instruction. 844 continue; 845 MIB.addOperand(MO); 846 } 847 if (!MCID.isPredicable() && NewMCID.isPredicable()) 848 AddDefaultPred(MIB); 849 850 // Transfer MI flags. 851 MIB.setMIFlags(MI->getFlags()); 852 853 DEBUG(errs() << "Converted 32-bit: " << *MI << " to 16-bit: " << *MIB); 854 855 MBB.erase_instr(MI); 856 ++NumNarrows; 857 return true; 858 } 859 860 static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) { 861 bool HasDef = false; 862 for (const MachineOperand &MO : MI.operands()) { 863 if (!MO.isReg() || MO.isUndef() || MO.isUse()) 864 continue; 865 if (MO.getReg() != ARM::CPSR) 866 continue; 867 868 DefCPSR = true; 869 if (!MO.isDead()) 870 HasDef = true; 871 } 872 873 return HasDef || LiveCPSR; 874 } 875 876 static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) { 877 for (const MachineOperand &MO : MI.operands()) { 878 if (!MO.isReg() || MO.isUndef() || MO.isDef()) 879 continue; 880 if (MO.getReg() != ARM::CPSR) 881 continue; 882 assert(LiveCPSR && "CPSR liveness tracking is wrong!"); 883 if (MO.isKill()) { 884 LiveCPSR = false; 885 break; 886 } 887 } 888 889 return LiveCPSR; 890 } 891 892 bool Thumb2SizeReduce::ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI, 893 bool LiveCPSR, bool IsSelfLoop) { 894 unsigned Opcode = MI->getOpcode(); 895 DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode); 896 if (OPI == ReduceOpcodeMap.end()) 897 return false; 898 const ReduceEntry &Entry = ReduceTable[OPI->second]; 899 900 // Don't attempt normal reductions on "special" cases for now. 901 if (Entry.Special) 902 return ReduceSpecial(MBB, MI, Entry, LiveCPSR, IsSelfLoop); 903 904 // Try to transform to a 16-bit two-address instruction. 905 if (Entry.NarrowOpc2 && 906 ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop)) 907 return true; 908 909 // Try to transform to a 16-bit non-two-address instruction. 910 if (Entry.NarrowOpc1 && 911 ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop)) 912 return true; 913 914 return false; 915 } 916 917 bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { 918 bool Modified = false; 919 920 // Yes, CPSR could be livein. 921 bool LiveCPSR = MBB.isLiveIn(ARM::CPSR); 922 MachineInstr *BundleMI = nullptr; 923 924 CPSRDef = nullptr; 925 HighLatencyCPSR = false; 926 927 // Check predecessors for the latest CPSRDef. 928 for (auto *Pred : MBB.predecessors()) { 929 const MBBInfo &PInfo = BlockInfo[Pred->getNumber()]; 930 if (!PInfo.Visited) { 931 // Since blocks are visited in RPO, this must be a back-edge. 932 continue; 933 } 934 if (PInfo.HighLatencyCPSR) { 935 HighLatencyCPSR = true; 936 break; 937 } 938 } 939 940 // If this BB loops back to itself, conservatively avoid narrowing the 941 // first instruction that does partial flag update. 942 bool IsSelfLoop = MBB.isSuccessor(&MBB); 943 MachineBasicBlock::instr_iterator MII = MBB.instr_begin(),E = MBB.instr_end(); 944 MachineBasicBlock::instr_iterator NextMII; 945 for (; MII != E; MII = NextMII) { 946 NextMII = std::next(MII); 947 948 MachineInstr *MI = &*MII; 949 if (MI->isBundle()) { 950 BundleMI = MI; 951 continue; 952 } 953 if (MI->isDebugValue()) 954 continue; 955 956 LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR); 957 958 // Does NextMII belong to the same bundle as MI? 959 bool NextInSameBundle = NextMII != E && NextMII->isBundledWithPred(); 960 961 if (ReduceMI(MBB, MI, LiveCPSR, IsSelfLoop)) { 962 Modified = true; 963 MachineBasicBlock::instr_iterator I = std::prev(NextMII); 964 MI = &*I; 965 // Removing and reinserting the first instruction in a bundle will break 966 // up the bundle. Fix the bundling if it was broken. 967 if (NextInSameBundle && !NextMII->isBundledWithPred()) 968 NextMII->bundleWithPred(); 969 } 970 971 if (!NextInSameBundle && MI->isInsideBundle()) { 972 // FIXME: Since post-ra scheduler operates on bundles, the CPSR kill 973 // marker is only on the BUNDLE instruction. Process the BUNDLE 974 // instruction as we finish with the bundled instruction to work around 975 // the inconsistency. 976 if (BundleMI->killsRegister(ARM::CPSR)) 977 LiveCPSR = false; 978 MachineOperand *MO = BundleMI->findRegisterDefOperand(ARM::CPSR); 979 if (MO && !MO->isDead()) 980 LiveCPSR = true; 981 MO = BundleMI->findRegisterUseOperand(ARM::CPSR); 982 if (MO && !MO->isKill()) 983 LiveCPSR = true; 984 } 985 986 bool DefCPSR = false; 987 LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR); 988 if (MI->isCall()) { 989 // Calls don't really set CPSR. 990 CPSRDef = nullptr; 991 HighLatencyCPSR = false; 992 IsSelfLoop = false; 993 } else if (DefCPSR) { 994 // This is the last CPSR defining instruction. 995 CPSRDef = MI; 996 HighLatencyCPSR = isHighLatencyCPSR(CPSRDef); 997 IsSelfLoop = false; 998 } 999 } 1000 1001 MBBInfo &Info = BlockInfo[MBB.getNumber()]; 1002 Info.HighLatencyCPSR = HighLatencyCPSR; 1003 Info.Visited = true; 1004 return Modified; 1005 } 1006 1007 bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { 1008 STI = &static_cast<const ARMSubtarget &>(MF.getSubtarget()); 1009 if (STI->isThumb1Only() || STI->prefers32BitThumb()) 1010 return false; 1011 1012 TII = static_cast<const Thumb2InstrInfo *>(STI->getInstrInfo()); 1013 1014 // Optimizing / minimizing size? 1015 OptimizeSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); 1016 MinimizeSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); 1017 1018 BlockInfo.clear(); 1019 BlockInfo.resize(MF.getNumBlockIDs()); 1020 1021 // Visit blocks in reverse post-order so LastCPSRDef is known for all 1022 // predecessors. 1023 ReversePostOrderTraversal<MachineFunction*> RPOT(&MF); 1024 bool Modified = false; 1025 for (ReversePostOrderTraversal<MachineFunction*>::rpo_iterator 1026 I = RPOT.begin(), E = RPOT.end(); I != E; ++I) 1027 Modified |= ReduceMBB(**I); 1028 return Modified; 1029 } 1030 1031 /// createThumb2SizeReductionPass - Returns an instance of the Thumb2 size 1032 /// reduction pass. 1033 FunctionPass *llvm::createThumb2SizeReductionPass() { 1034 return new Thumb2SizeReduce(); 1035 } 1036