1 //===-- PPCISelDAGToDAG.cpp - PPC --pattern matching inst selector --------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file defines a pattern matching instruction selector for PowerPC, 11 // converting from a legalized dag to a PPC dag. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "MCTargetDesc/PPCMCTargetDesc.h" 16 #include "MCTargetDesc/PPCPredicates.h" 17 #include "PPC.h" 18 #include "PPCISelLowering.h" 19 #include "PPCMachineFunctionInfo.h" 20 #include "PPCSubtarget.h" 21 #include "PPCTargetMachine.h" 22 #include "llvm/ADT/APInt.h" 23 #include "llvm/ADT/DenseMap.h" 24 #include "llvm/ADT/SmallPtrSet.h" 25 #include "llvm/ADT/SmallVector.h" 26 #include "llvm/ADT/STLExtras.h" 27 #include "llvm/Analysis/BranchProbabilityInfo.h" 28 #include "llvm/CodeGen/FunctionLoweringInfo.h" 29 #include "llvm/CodeGen/ISDOpcodes.h" 30 #include "llvm/CodeGen/MachineBasicBlock.h" 31 #include "llvm/CodeGen/MachineFunction.h" 32 #include "llvm/CodeGen/MachineInstrBuilder.h" 33 #include "llvm/CodeGen/MachineRegisterInfo.h" 34 #include "llvm/CodeGen/MachineValueType.h" 35 #include "llvm/CodeGen/SelectionDAG.h" 36 #include "llvm/CodeGen/SelectionDAGISel.h" 37 #include "llvm/CodeGen/SelectionDAGNodes.h" 38 #include "llvm/CodeGen/ValueTypes.h" 39 #include "llvm/IR/BasicBlock.h" 40 #include "llvm/IR/DebugLoc.h" 41 #include "llvm/IR/Function.h" 42 #include "llvm/IR/GlobalValue.h" 43 #include "llvm/IR/InlineAsm.h" 44 #include "llvm/IR/InstrTypes.h" 45 #include "llvm/IR/Module.h" 46 #include "llvm/Support/Casting.h" 47 #include "llvm/Support/CodeGen.h" 48 #include "llvm/Support/CommandLine.h" 49 #include "llvm/Support/Compiler.h" 50 #include "llvm/Support/Debug.h" 51 #include "llvm/Support/ErrorHandling.h" 52 #include "llvm/Support/MathExtras.h" 53 #include "llvm/Support/raw_ostream.h" 54 #include "llvm/Target/TargetInstrInfo.h" 55 #include "llvm/Target/TargetRegisterInfo.h" 56 #include <algorithm> 57 #include <cassert> 58 #include <cstdint> 59 #include <iterator> 60 #include <limits> 61 #include <memory> 62 #include <new> 63 #include <tuple> 64 #include <utility> 65 66 using namespace llvm; 67 68 #define DEBUG_TYPE "ppc-codegen" 69 70 // FIXME: Remove this once the bug has been fixed! 71 cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug", 72 cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden); 73 74 static cl::opt<bool> 75 UseBitPermRewriter("ppc-use-bit-perm-rewriter", cl::init(true), 76 cl::desc("use aggressive ppc isel for bit permutations"), 77 cl::Hidden); 78 static cl::opt<bool> BPermRewriterNoMasking( 79 "ppc-bit-perm-rewriter-stress-rotates", 80 cl::desc("stress rotate selection in aggressive ppc isel for " 81 "bit permutations"), 82 cl::Hidden); 83 84 static cl::opt<bool> EnableBranchHint( 85 "ppc-use-branch-hint", cl::init(true), 86 cl::desc("Enable static hinting of branches on ppc"), 87 cl::Hidden); 88 89 namespace { 90 91 //===--------------------------------------------------------------------===// 92 /// PPCDAGToDAGISel - PPC specific code to select PPC machine 93 /// instructions for SelectionDAG operations. 94 /// 95 class PPCDAGToDAGISel : public SelectionDAGISel { 96 const PPCTargetMachine &TM; 97 const PPCSubtarget *PPCSubTarget; 98 const PPCTargetLowering *PPCLowering; 99 unsigned GlobalBaseReg; 100 101 public: 102 explicit PPCDAGToDAGISel(PPCTargetMachine &tm) 103 : SelectionDAGISel(tm), TM(tm) {} 104 105 bool runOnMachineFunction(MachineFunction &MF) override { 106 // Make sure we re-emit a set of the global base reg if necessary 107 GlobalBaseReg = 0; 108 PPCSubTarget = &MF.getSubtarget<PPCSubtarget>(); 109 PPCLowering = PPCSubTarget->getTargetLowering(); 110 SelectionDAGISel::runOnMachineFunction(MF); 111 112 if (!PPCSubTarget->isSVR4ABI()) 113 InsertVRSaveCode(MF); 114 115 return true; 116 } 117 118 void PreprocessISelDAG() override; 119 void PostprocessISelDAG() override; 120 121 /// getI32Imm - Return a target constant with the specified value, of type 122 /// i32. 123 inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) { 124 return CurDAG->getTargetConstant(Imm, dl, MVT::i32); 125 } 126 127 /// getI64Imm - Return a target constant with the specified value, of type 128 /// i64. 129 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &dl) { 130 return CurDAG->getTargetConstant(Imm, dl, MVT::i64); 131 } 132 133 /// getSmallIPtrImm - Return a target constant of pointer type. 134 inline SDValue getSmallIPtrImm(unsigned Imm, const SDLoc &dl) { 135 return CurDAG->getTargetConstant( 136 Imm, dl, PPCLowering->getPointerTy(CurDAG->getDataLayout())); 137 } 138 139 /// isRotateAndMask - Returns true if Mask and Shift can be folded into a 140 /// rotate and mask opcode and mask operation. 141 static bool isRotateAndMask(SDNode *N, unsigned Mask, bool isShiftMask, 142 unsigned &SH, unsigned &MB, unsigned &ME); 143 144 /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC 145 /// base register. Return the virtual register that holds this value. 146 SDNode *getGlobalBaseReg(); 147 148 void selectFrameIndex(SDNode *SN, SDNode *N, unsigned Offset = 0); 149 150 // Select - Convert the specified operand from a target-independent to a 151 // target-specific node if it hasn't already been changed. 152 void Select(SDNode *N) override; 153 154 bool tryBitfieldInsert(SDNode *N); 155 bool tryBitPermutation(SDNode *N); 156 157 /// SelectCC - Select a comparison of the specified values with the 158 /// specified condition code, returning the CR# of the expression. 159 SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, 160 const SDLoc &dl); 161 162 /// SelectAddrImm - Returns true if the address N can be represented by 163 /// a base register plus a signed 16-bit displacement [r+imm]. 164 bool SelectAddrImm(SDValue N, SDValue &Disp, 165 SDValue &Base) { 166 return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false); 167 } 168 169 /// SelectAddrImmOffs - Return true if the operand is valid for a preinc 170 /// immediate field. Note that the operand at this point is already the 171 /// result of a prior SelectAddressRegImm call. 172 bool SelectAddrImmOffs(SDValue N, SDValue &Out) const { 173 if (N.getOpcode() == ISD::TargetConstant || 174 N.getOpcode() == ISD::TargetGlobalAddress) { 175 Out = N; 176 return true; 177 } 178 179 return false; 180 } 181 182 /// SelectAddrIdx - Given the specified addressed, check to see if it can be 183 /// represented as an indexed [r+r] operation. Returns false if it can 184 /// be represented by [r+imm], which are preferred. 185 bool SelectAddrIdx(SDValue N, SDValue &Base, SDValue &Index) { 186 return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG); 187 } 188 189 /// SelectAddrIdxOnly - Given the specified addressed, force it to be 190 /// represented as an indexed [r+r] operation. 191 bool SelectAddrIdxOnly(SDValue N, SDValue &Base, SDValue &Index) { 192 return PPCLowering->SelectAddressRegRegOnly(N, Base, Index, *CurDAG); 193 } 194 195 /// SelectAddrImmX4 - Returns true if the address N can be represented by 196 /// a base register plus a signed 16-bit displacement that is a multiple of 4. 197 /// Suitable for use by STD and friends. 198 bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) { 199 return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true); 200 } 201 202 // Select an address into a single register. 203 bool SelectAddr(SDValue N, SDValue &Base) { 204 Base = N; 205 return true; 206 } 207 208 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for 209 /// inline asm expressions. It is always correct to compute the value into 210 /// a register. The case of adding a (possibly relocatable) constant to a 211 /// register can be improved, but it is wrong to substitute Reg+Reg for 212 /// Reg in an asm, because the load or store opcode would have to change. 213 bool SelectInlineAsmMemoryOperand(const SDValue &Op, 214 unsigned ConstraintID, 215 std::vector<SDValue> &OutOps) override { 216 switch(ConstraintID) { 217 default: 218 errs() << "ConstraintID: " << ConstraintID << "\n"; 219 llvm_unreachable("Unexpected asm memory constraint"); 220 case InlineAsm::Constraint_es: 221 case InlineAsm::Constraint_i: 222 case InlineAsm::Constraint_m: 223 case InlineAsm::Constraint_o: 224 case InlineAsm::Constraint_Q: 225 case InlineAsm::Constraint_Z: 226 case InlineAsm::Constraint_Zy: 227 // We need to make sure that this one operand does not end up in r0 228 // (because we might end up lowering this as 0(%op)). 229 const TargetRegisterInfo *TRI = PPCSubTarget->getRegisterInfo(); 230 const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF, /*Kind=*/1); 231 SDLoc dl(Op); 232 SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32); 233 SDValue NewOp = 234 SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 235 dl, Op.getValueType(), 236 Op, RC), 0); 237 238 OutOps.push_back(NewOp); 239 return false; 240 } 241 return true; 242 } 243 244 void InsertVRSaveCode(MachineFunction &MF); 245 246 StringRef getPassName() const override { 247 return "PowerPC DAG->DAG Pattern Instruction Selection"; 248 } 249 250 // Include the pieces autogenerated from the target description. 251 #include "PPCGenDAGISel.inc" 252 253 private: 254 bool trySETCC(SDNode *N); 255 256 void PeepholePPC64(); 257 void PeepholePPC64ZExt(); 258 void PeepholeCROps(); 259 260 SDValue combineToCMPB(SDNode *N); 261 void foldBoolExts(SDValue &Res, SDNode *&N); 262 263 bool AllUsersSelectZero(SDNode *N); 264 void SwapAllSelectUsers(SDNode *N); 265 266 void transferMemOperands(SDNode *N, SDNode *Result); 267 }; 268 269 } // end anonymous namespace 270 271 /// InsertVRSaveCode - Once the entire function has been instruction selected, 272 /// all virtual registers are created and all machine instructions are built, 273 /// check to see if we need to save/restore VRSAVE. If so, do it. 274 void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) { 275 // Check to see if this function uses vector registers, which means we have to 276 // save and restore the VRSAVE register and update it with the regs we use. 277 // 278 // In this case, there will be virtual registers of vector type created 279 // by the scheduler. Detect them now. 280 bool HasVectorVReg = false; 281 for (unsigned i = 0, e = RegInfo->getNumVirtRegs(); i != e; ++i) { 282 unsigned Reg = TargetRegisterInfo::index2VirtReg(i); 283 if (RegInfo->getRegClass(Reg) == &PPC::VRRCRegClass) { 284 HasVectorVReg = true; 285 break; 286 } 287 } 288 if (!HasVectorVReg) return; // nothing to do. 289 290 // If we have a vector register, we want to emit code into the entry and exit 291 // blocks to save and restore the VRSAVE register. We do this here (instead 292 // of marking all vector instructions as clobbering VRSAVE) for two reasons: 293 // 294 // 1. This (trivially) reduces the load on the register allocator, by not 295 // having to represent the live range of the VRSAVE register. 296 // 2. This (more significantly) allows us to create a temporary virtual 297 // register to hold the saved VRSAVE value, allowing this temporary to be 298 // register allocated, instead of forcing it to be spilled to the stack. 299 300 // Create two vregs - one to hold the VRSAVE register that is live-in to the 301 // function and one for the value after having bits or'd into it. 302 unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); 303 unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); 304 305 const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo(); 306 MachineBasicBlock &EntryBB = *Fn.begin(); 307 DebugLoc dl; 308 // Emit the following code into the entry block: 309 // InVRSAVE = MFVRSAVE 310 // UpdatedVRSAVE = UPDATE_VRSAVE InVRSAVE 311 // MTVRSAVE UpdatedVRSAVE 312 MachineBasicBlock::iterator IP = EntryBB.begin(); // Insert Point 313 BuildMI(EntryBB, IP, dl, TII.get(PPC::MFVRSAVE), InVRSAVE); 314 BuildMI(EntryBB, IP, dl, TII.get(PPC::UPDATE_VRSAVE), 315 UpdatedVRSAVE).addReg(InVRSAVE); 316 BuildMI(EntryBB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(UpdatedVRSAVE); 317 318 // Find all return blocks, outputting a restore in each epilog. 319 for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { 320 if (BB->isReturnBlock()) { 321 IP = BB->end(); --IP; 322 323 // Skip over all terminator instructions, which are part of the return 324 // sequence. 325 MachineBasicBlock::iterator I2 = IP; 326 while (I2 != BB->begin() && (--I2)->isTerminator()) 327 IP = I2; 328 329 // Emit: MTVRSAVE InVRSave 330 BuildMI(*BB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(InVRSAVE); 331 } 332 } 333 } 334 335 /// getGlobalBaseReg - Output the instructions required to put the 336 /// base address to use for accessing globals into a register. 337 /// 338 SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { 339 if (!GlobalBaseReg) { 340 const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo(); 341 // Insert the set of GlobalBaseReg into the first MBB of the function 342 MachineBasicBlock &FirstMBB = MF->front(); 343 MachineBasicBlock::iterator MBBI = FirstMBB.begin(); 344 const Module *M = MF->getFunction()->getParent(); 345 DebugLoc dl; 346 347 if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) == MVT::i32) { 348 if (PPCSubTarget->isTargetELF()) { 349 GlobalBaseReg = PPC::R30; 350 if (M->getPICLevel() == PICLevel::SmallPIC) { 351 BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MoveGOTtoLR)); 352 BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); 353 MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true); 354 } else { 355 BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR)); 356 BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); 357 unsigned TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); 358 BuildMI(FirstMBB, MBBI, dl, 359 TII.get(PPC::UpdateGBR), GlobalBaseReg) 360 .addReg(TempReg, RegState::Define).addReg(GlobalBaseReg); 361 MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true); 362 } 363 } else { 364 GlobalBaseReg = 365 RegInfo->createVirtualRegister(&PPC::GPRC_and_GPRC_NOR0RegClass); 366 BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR)); 367 BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); 368 } 369 } else { 370 GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RC_and_G8RC_NOX0RegClass); 371 BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8)); 372 BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg); 373 } 374 } 375 return CurDAG->getRegister(GlobalBaseReg, 376 PPCLowering->getPointerTy(CurDAG->getDataLayout())) 377 .getNode(); 378 } 379 380 /// isIntS16Immediate - This method tests to see if the node is either a 32-bit 381 /// or 64-bit immediate, and if the value can be accurately represented as a 382 /// sign extension from a 16-bit value. If so, this returns true and the 383 /// immediate. 384 static bool isIntS16Immediate(SDNode *N, short &Imm) { 385 if (N->getOpcode() != ISD::Constant) 386 return false; 387 388 Imm = (short)cast<ConstantSDNode>(N)->getZExtValue(); 389 if (N->getValueType(0) == MVT::i32) 390 return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue(); 391 else 392 return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); 393 } 394 395 static bool isIntS16Immediate(SDValue Op, short &Imm) { 396 return isIntS16Immediate(Op.getNode(), Imm); 397 } 398 399 /// isInt32Immediate - This method tests to see if the node is a 32-bit constant 400 /// operand. If so Imm will receive the 32-bit value. 401 static bool isInt32Immediate(SDNode *N, unsigned &Imm) { 402 if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) { 403 Imm = cast<ConstantSDNode>(N)->getZExtValue(); 404 return true; 405 } 406 return false; 407 } 408 409 /// isInt64Immediate - This method tests to see if the node is a 64-bit constant 410 /// operand. If so Imm will receive the 64-bit value. 411 static bool isInt64Immediate(SDNode *N, uint64_t &Imm) { 412 if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i64) { 413 Imm = cast<ConstantSDNode>(N)->getZExtValue(); 414 return true; 415 } 416 return false; 417 } 418 419 // isInt32Immediate - This method tests to see if a constant operand. 420 // If so Imm will receive the 32 bit value. 421 static bool isInt32Immediate(SDValue N, unsigned &Imm) { 422 return isInt32Immediate(N.getNode(), Imm); 423 } 424 425 static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo, 426 const SDValue &DestMBB) { 427 assert(isa<BasicBlockSDNode>(DestMBB)); 428 429 if (!FuncInfo->BPI) return PPC::BR_NO_HINT; 430 431 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock(); 432 const TerminatorInst *BBTerm = BB->getTerminator(); 433 434 if (BBTerm->getNumSuccessors() != 2) return PPC::BR_NO_HINT; 435 436 const BasicBlock *TBB = BBTerm->getSuccessor(0); 437 const BasicBlock *FBB = BBTerm->getSuccessor(1); 438 439 auto TProb = FuncInfo->BPI->getEdgeProbability(BB, TBB); 440 auto FProb = FuncInfo->BPI->getEdgeProbability(BB, FBB); 441 442 // We only want to handle cases which are easy to predict at static time, e.g. 443 // C++ throw statement, that is very likely not taken, or calling never 444 // returned function, e.g. stdlib exit(). So we set Threshold to filter 445 // unwanted cases. 446 // 447 // Below is LLVM branch weight table, we only want to handle case 1, 2 448 // 449 // Case Taken:Nontaken Example 450 // 1. Unreachable 1048575:1 C++ throw, stdlib exit(), 451 // 2. Invoke-terminating 1:1048575 452 // 3. Coldblock 4:64 __builtin_expect 453 // 4. Loop Branch 124:4 For loop 454 // 5. PH/ZH/FPH 20:12 455 const uint32_t Threshold = 10000; 456 457 if (std::max(TProb, FProb) / Threshold < std::min(TProb, FProb)) 458 return PPC::BR_NO_HINT; 459 460 DEBUG(dbgs() << "Use branch hint for '" << FuncInfo->Fn->getName() << "::" 461 << BB->getName() << "'\n" 462 << " -> " << TBB->getName() << ": " << TProb << "\n" 463 << " -> " << FBB->getName() << ": " << FProb << "\n"); 464 465 const BasicBlockSDNode *BBDN = cast<BasicBlockSDNode>(DestMBB); 466 467 // If Dest BasicBlock is False-BasicBlock (FBB), swap branch probabilities, 468 // because we want 'TProb' stands for 'branch probability' to Dest BasicBlock 469 if (BBDN->getBasicBlock()->getBasicBlock() != TBB) 470 std::swap(TProb, FProb); 471 472 return (TProb > FProb) ? PPC::BR_TAKEN_HINT : PPC::BR_NONTAKEN_HINT; 473 } 474 475 // isOpcWithIntImmediate - This method tests to see if the node is a specific 476 // opcode and that it has a immediate integer right operand. 477 // If so Imm will receive the 32 bit value. 478 static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) { 479 return N->getOpcode() == Opc 480 && isInt32Immediate(N->getOperand(1).getNode(), Imm); 481 } 482 483 void PPCDAGToDAGISel::selectFrameIndex(SDNode *SN, SDNode *N, unsigned Offset) { 484 SDLoc dl(SN); 485 int FI = cast<FrameIndexSDNode>(N)->getIndex(); 486 SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0)); 487 unsigned Opc = N->getValueType(0) == MVT::i32 ? PPC::ADDI : PPC::ADDI8; 488 if (SN->hasOneUse()) 489 CurDAG->SelectNodeTo(SN, Opc, N->getValueType(0), TFI, 490 getSmallIPtrImm(Offset, dl)); 491 else 492 ReplaceNode(SN, CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI, 493 getSmallIPtrImm(Offset, dl))); 494 } 495 496 bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask, 497 bool isShiftMask, unsigned &SH, 498 unsigned &MB, unsigned &ME) { 499 // Don't even go down this path for i64, since different logic will be 500 // necessary for rldicl/rldicr/rldimi. 501 if (N->getValueType(0) != MVT::i32) 502 return false; 503 504 unsigned Shift = 32; 505 unsigned Indeterminant = ~0; // bit mask marking indeterminant results 506 unsigned Opcode = N->getOpcode(); 507 if (N->getNumOperands() != 2 || 508 !isInt32Immediate(N->getOperand(1).getNode(), Shift) || (Shift > 31)) 509 return false; 510 511 if (Opcode == ISD::SHL) { 512 // apply shift left to mask if it comes first 513 if (isShiftMask) Mask = Mask << Shift; 514 // determine which bits are made indeterminant by shift 515 Indeterminant = ~(0xFFFFFFFFu << Shift); 516 } else if (Opcode == ISD::SRL) { 517 // apply shift right to mask if it comes first 518 if (isShiftMask) Mask = Mask >> Shift; 519 // determine which bits are made indeterminant by shift 520 Indeterminant = ~(0xFFFFFFFFu >> Shift); 521 // adjust for the left rotate 522 Shift = 32 - Shift; 523 } else if (Opcode == ISD::ROTL) { 524 Indeterminant = 0; 525 } else { 526 return false; 527 } 528 529 // if the mask doesn't intersect any Indeterminant bits 530 if (Mask && !(Mask & Indeterminant)) { 531 SH = Shift & 31; 532 // make sure the mask is still a mask (wrap arounds may not be) 533 return isRunOfOnes(Mask, MB, ME); 534 } 535 return false; 536 } 537 538 /// Turn an or of two masked values into the rotate left word immediate then 539 /// mask insert (rlwimi) instruction. 540 bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) { 541 SDValue Op0 = N->getOperand(0); 542 SDValue Op1 = N->getOperand(1); 543 SDLoc dl(N); 544 545 APInt LKZ, LKO, RKZ, RKO; 546 CurDAG->computeKnownBits(Op0, LKZ, LKO); 547 CurDAG->computeKnownBits(Op1, RKZ, RKO); 548 549 unsigned TargetMask = LKZ.getZExtValue(); 550 unsigned InsertMask = RKZ.getZExtValue(); 551 552 if ((TargetMask | InsertMask) == 0xFFFFFFFF) { 553 unsigned Op0Opc = Op0.getOpcode(); 554 unsigned Op1Opc = Op1.getOpcode(); 555 unsigned Value, SH = 0; 556 TargetMask = ~TargetMask; 557 InsertMask = ~InsertMask; 558 559 // If the LHS has a foldable shift and the RHS does not, then swap it to the 560 // RHS so that we can fold the shift into the insert. 561 if (Op0Opc == ISD::AND && Op1Opc == ISD::AND) { 562 if (Op0.getOperand(0).getOpcode() == ISD::SHL || 563 Op0.getOperand(0).getOpcode() == ISD::SRL) { 564 if (Op1.getOperand(0).getOpcode() != ISD::SHL && 565 Op1.getOperand(0).getOpcode() != ISD::SRL) { 566 std::swap(Op0, Op1); 567 std::swap(Op0Opc, Op1Opc); 568 std::swap(TargetMask, InsertMask); 569 } 570 } 571 } else if (Op0Opc == ISD::SHL || Op0Opc == ISD::SRL) { 572 if (Op1Opc == ISD::AND && Op1.getOperand(0).getOpcode() != ISD::SHL && 573 Op1.getOperand(0).getOpcode() != ISD::SRL) { 574 std::swap(Op0, Op1); 575 std::swap(Op0Opc, Op1Opc); 576 std::swap(TargetMask, InsertMask); 577 } 578 } 579 580 unsigned MB, ME; 581 if (isRunOfOnes(InsertMask, MB, ME)) { 582 SDValue Tmp1, Tmp2; 583 584 if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) && 585 isInt32Immediate(Op1.getOperand(1), Value)) { 586 Op1 = Op1.getOperand(0); 587 SH = (Op1Opc == ISD::SHL) ? Value : 32 - Value; 588 } 589 if (Op1Opc == ISD::AND) { 590 // The AND mask might not be a constant, and we need to make sure that 591 // if we're going to fold the masking with the insert, all bits not 592 // know to be zero in the mask are known to be one. 593 APInt MKZ, MKO; 594 CurDAG->computeKnownBits(Op1.getOperand(1), MKZ, MKO); 595 bool CanFoldMask = InsertMask == MKO.getZExtValue(); 596 597 unsigned SHOpc = Op1.getOperand(0).getOpcode(); 598 if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) && CanFoldMask && 599 isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) { 600 // Note that Value must be in range here (less than 32) because 601 // otherwise there would not be any bits set in InsertMask. 602 Op1 = Op1.getOperand(0).getOperand(0); 603 SH = (SHOpc == ISD::SHL) ? Value : 32 - Value; 604 } 605 } 606 607 SH &= 31; 608 SDValue Ops[] = { Op0, Op1, getI32Imm(SH, dl), getI32Imm(MB, dl), 609 getI32Imm(ME, dl) }; 610 ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops)); 611 return true; 612 } 613 } 614 return false; 615 } 616 617 // Predict the number of instructions that would be generated by calling 618 // getInt64(N). 619 static unsigned getInt64CountDirect(int64_t Imm) { 620 // Assume no remaining bits. 621 unsigned Remainder = 0; 622 // Assume no shift required. 623 unsigned Shift = 0; 624 625 // If it can't be represented as a 32 bit value. 626 if (!isInt<32>(Imm)) { 627 Shift = countTrailingZeros<uint64_t>(Imm); 628 int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift; 629 630 // If the shifted value fits 32 bits. 631 if (isInt<32>(ImmSh)) { 632 // Go with the shifted value. 633 Imm = ImmSh; 634 } else { 635 // Still stuck with a 64 bit value. 636 Remainder = Imm; 637 Shift = 32; 638 Imm >>= 32; 639 } 640 } 641 642 // Intermediate operand. 643 unsigned Result = 0; 644 645 // Handle first 32 bits. 646 unsigned Lo = Imm & 0xFFFF; 647 648 // Simple value. 649 if (isInt<16>(Imm)) { 650 // Just the Lo bits. 651 ++Result; 652 } else if (Lo) { 653 // Handle the Hi bits and Lo bits. 654 Result += 2; 655 } else { 656 // Just the Hi bits. 657 ++Result; 658 } 659 660 // If no shift, we're done. 661 if (!Shift) return Result; 662 663 // If Hi word == Lo word, 664 // we can use rldimi to insert the Lo word into Hi word. 665 if ((unsigned)(Imm & 0xFFFFFFFF) == Remainder) { 666 ++Result; 667 return Result; 668 } 669 670 // Shift for next step if the upper 32-bits were not zero. 671 if (Imm) 672 ++Result; 673 674 // Add in the last bits as required. 675 if ((Remainder >> 16) & 0xFFFF) 676 ++Result; 677 if (Remainder & 0xFFFF) 678 ++Result; 679 680 return Result; 681 } 682 683 static uint64_t Rot64(uint64_t Imm, unsigned R) { 684 return (Imm << R) | (Imm >> (64 - R)); 685 } 686 687 static unsigned getInt64Count(int64_t Imm) { 688 unsigned Count = getInt64CountDirect(Imm); 689 if (Count == 1) 690 return Count; 691 692 for (unsigned r = 1; r < 63; ++r) { 693 uint64_t RImm = Rot64(Imm, r); 694 unsigned RCount = getInt64CountDirect(RImm) + 1; 695 Count = std::min(Count, RCount); 696 697 // See comments in getInt64 for an explanation of the logic below. 698 unsigned LS = findLastSet(RImm); 699 if (LS != r-1) 700 continue; 701 702 uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1)); 703 uint64_t RImmWithOnes = RImm | OnesMask; 704 705 RCount = getInt64CountDirect(RImmWithOnes) + 1; 706 Count = std::min(Count, RCount); 707 } 708 709 return Count; 710 } 711 712 // Select a 64-bit constant. For cost-modeling purposes, getInt64Count 713 // (above) needs to be kept in sync with this function. 714 static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl, 715 int64_t Imm) { 716 // Assume no remaining bits. 717 unsigned Remainder = 0; 718 // Assume no shift required. 719 unsigned Shift = 0; 720 721 // If it can't be represented as a 32 bit value. 722 if (!isInt<32>(Imm)) { 723 Shift = countTrailingZeros<uint64_t>(Imm); 724 int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift; 725 726 // If the shifted value fits 32 bits. 727 if (isInt<32>(ImmSh)) { 728 // Go with the shifted value. 729 Imm = ImmSh; 730 } else { 731 // Still stuck with a 64 bit value. 732 Remainder = Imm; 733 Shift = 32; 734 Imm >>= 32; 735 } 736 } 737 738 // Intermediate operand. 739 SDNode *Result; 740 741 // Handle first 32 bits. 742 unsigned Lo = Imm & 0xFFFF; 743 unsigned Hi = (Imm >> 16) & 0xFFFF; 744 745 auto getI32Imm = [CurDAG, dl](unsigned Imm) { 746 return CurDAG->getTargetConstant(Imm, dl, MVT::i32); 747 }; 748 749 // Simple value. 750 if (isInt<16>(Imm)) { 751 // Just the Lo bits. 752 Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo)); 753 } else if (Lo) { 754 // Handle the Hi bits. 755 unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8; 756 Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi)); 757 // And Lo bits. 758 Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, 759 SDValue(Result, 0), getI32Imm(Lo)); 760 } else { 761 // Just the Hi bits. 762 Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi)); 763 } 764 765 // If no shift, we're done. 766 if (!Shift) return Result; 767 768 // If Hi word == Lo word, 769 // we can use rldimi to insert the Lo word into Hi word. 770 if ((unsigned)(Imm & 0xFFFFFFFF) == Remainder) { 771 SDValue Ops[] = 772 { SDValue(Result, 0), SDValue(Result, 0), getI32Imm(Shift), getI32Imm(0)}; 773 return CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops); 774 } 775 776 // Shift for next step if the upper 32-bits were not zero. 777 if (Imm) { 778 Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, 779 SDValue(Result, 0), 780 getI32Imm(Shift), 781 getI32Imm(63 - Shift)); 782 } 783 784 // Add in the last bits as required. 785 if ((Hi = (Remainder >> 16) & 0xFFFF)) { 786 Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64, 787 SDValue(Result, 0), getI32Imm(Hi)); 788 } 789 if ((Lo = Remainder & 0xFFFF)) { 790 Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, 791 SDValue(Result, 0), getI32Imm(Lo)); 792 } 793 794 return Result; 795 } 796 797 static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) { 798 unsigned Count = getInt64CountDirect(Imm); 799 if (Count == 1) 800 return getInt64Direct(CurDAG, dl, Imm); 801 802 unsigned RMin = 0; 803 804 int64_t MatImm; 805 unsigned MaskEnd; 806 807 for (unsigned r = 1; r < 63; ++r) { 808 uint64_t RImm = Rot64(Imm, r); 809 unsigned RCount = getInt64CountDirect(RImm) + 1; 810 if (RCount < Count) { 811 Count = RCount; 812 RMin = r; 813 MatImm = RImm; 814 MaskEnd = 63; 815 } 816 817 // If the immediate to generate has many trailing zeros, it might be 818 // worthwhile to generate a rotated value with too many leading ones 819 // (because that's free with li/lis's sign-extension semantics), and then 820 // mask them off after rotation. 821 822 unsigned LS = findLastSet(RImm); 823 // We're adding (63-LS) higher-order ones, and we expect to mask them off 824 // after performing the inverse rotation by (64-r). So we need that: 825 // 63-LS == 64-r => LS == r-1 826 if (LS != r-1) 827 continue; 828 829 uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1)); 830 uint64_t RImmWithOnes = RImm | OnesMask; 831 832 RCount = getInt64CountDirect(RImmWithOnes) + 1; 833 if (RCount < Count) { 834 Count = RCount; 835 RMin = r; 836 MatImm = RImmWithOnes; 837 MaskEnd = LS; 838 } 839 } 840 841 if (!RMin) 842 return getInt64Direct(CurDAG, dl, Imm); 843 844 auto getI32Imm = [CurDAG, dl](unsigned Imm) { 845 return CurDAG->getTargetConstant(Imm, dl, MVT::i32); 846 }; 847 848 SDValue Val = SDValue(getInt64Direct(CurDAG, dl, MatImm), 0); 849 return CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Val, 850 getI32Imm(64 - RMin), getI32Imm(MaskEnd)); 851 } 852 853 // Select a 64-bit constant. 854 static SDNode *getInt64(SelectionDAG *CurDAG, SDNode *N) { 855 SDLoc dl(N); 856 857 // Get 64 bit value. 858 int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue(); 859 return getInt64(CurDAG, dl, Imm); 860 } 861 862 namespace { 863 864 class BitPermutationSelector { 865 struct ValueBit { 866 SDValue V; 867 868 // The bit number in the value, using a convention where bit 0 is the 869 // lowest-order bit. 870 unsigned Idx; 871 872 enum Kind { 873 ConstZero, 874 Variable 875 } K; 876 877 ValueBit(SDValue V, unsigned I, Kind K = Variable) 878 : V(V), Idx(I), K(K) {} 879 ValueBit(Kind K = Variable) 880 : V(SDValue(nullptr, 0)), Idx(UINT32_MAX), K(K) {} 881 882 bool isZero() const { 883 return K == ConstZero; 884 } 885 886 bool hasValue() const { 887 return K == Variable; 888 } 889 890 SDValue getValue() const { 891 assert(hasValue() && "Cannot get the value of a constant bit"); 892 return V; 893 } 894 895 unsigned getValueBitIndex() const { 896 assert(hasValue() && "Cannot get the value bit index of a constant bit"); 897 return Idx; 898 } 899 }; 900 901 // A bit group has the same underlying value and the same rotate factor. 902 struct BitGroup { 903 SDValue V; 904 unsigned RLAmt; 905 unsigned StartIdx, EndIdx; 906 907 // This rotation amount assumes that the lower 32 bits of the quantity are 908 // replicated in the high 32 bits by the rotation operator (which is done 909 // by rlwinm and friends in 64-bit mode). 910 bool Repl32; 911 // Did converting to Repl32 == true change the rotation factor? If it did, 912 // it decreased it by 32. 913 bool Repl32CR; 914 // Was this group coalesced after setting Repl32 to true? 915 bool Repl32Coalesced; 916 917 BitGroup(SDValue V, unsigned R, unsigned S, unsigned E) 918 : V(V), RLAmt(R), StartIdx(S), EndIdx(E), Repl32(false), Repl32CR(false), 919 Repl32Coalesced(false) { 920 DEBUG(dbgs() << "\tbit group for " << V.getNode() << " RLAmt = " << R << 921 " [" << S << ", " << E << "]\n"); 922 } 923 }; 924 925 // Information on each (Value, RLAmt) pair (like the number of groups 926 // associated with each) used to choose the lowering method. 927 struct ValueRotInfo { 928 SDValue V; 929 unsigned RLAmt = std::numeric_limits<unsigned>::max(); 930 unsigned NumGroups = 0; 931 unsigned FirstGroupStartIdx = std::numeric_limits<unsigned>::max(); 932 bool Repl32 = false; 933 934 ValueRotInfo() = default; 935 936 // For sorting (in reverse order) by NumGroups, and then by 937 // FirstGroupStartIdx. 938 bool operator < (const ValueRotInfo &Other) const { 939 // We need to sort so that the non-Repl32 come first because, when we're 940 // doing masking, the Repl32 bit groups might be subsumed into the 64-bit 941 // masking operation. 942 if (Repl32 < Other.Repl32) 943 return true; 944 else if (Repl32 > Other.Repl32) 945 return false; 946 else if (NumGroups > Other.NumGroups) 947 return true; 948 else if (NumGroups < Other.NumGroups) 949 return false; 950 else if (FirstGroupStartIdx < Other.FirstGroupStartIdx) 951 return true; 952 return false; 953 } 954 }; 955 956 using ValueBitsMemoizedValue = std::pair<bool, SmallVector<ValueBit, 64>>; 957 using ValueBitsMemoizer = 958 DenseMap<SDValue, std::unique_ptr<ValueBitsMemoizedValue>>; 959 ValueBitsMemoizer Memoizer; 960 961 // Return a pair of bool and a SmallVector pointer to a memoization entry. 962 // The bool is true if something interesting was deduced, otherwise if we're 963 // providing only a generic representation of V (or something else likewise 964 // uninteresting for instruction selection) through the SmallVector. 965 std::pair<bool, SmallVector<ValueBit, 64> *> getValueBits(SDValue V, 966 unsigned NumBits) { 967 auto &ValueEntry = Memoizer[V]; 968 if (ValueEntry) 969 return std::make_pair(ValueEntry->first, &ValueEntry->second); 970 ValueEntry.reset(new ValueBitsMemoizedValue()); 971 bool &Interesting = ValueEntry->first; 972 SmallVector<ValueBit, 64> &Bits = ValueEntry->second; 973 Bits.resize(NumBits); 974 975 switch (V.getOpcode()) { 976 default: break; 977 case ISD::ROTL: 978 if (isa<ConstantSDNode>(V.getOperand(1))) { 979 unsigned RotAmt = V.getConstantOperandVal(1); 980 981 const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second; 982 983 for (unsigned i = 0; i < NumBits; ++i) 984 Bits[i] = LHSBits[i < RotAmt ? i + (NumBits - RotAmt) : i - RotAmt]; 985 986 return std::make_pair(Interesting = true, &Bits); 987 } 988 break; 989 case ISD::SHL: 990 if (isa<ConstantSDNode>(V.getOperand(1))) { 991 unsigned ShiftAmt = V.getConstantOperandVal(1); 992 993 const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second; 994 995 for (unsigned i = ShiftAmt; i < NumBits; ++i) 996 Bits[i] = LHSBits[i - ShiftAmt]; 997 998 for (unsigned i = 0; i < ShiftAmt; ++i) 999 Bits[i] = ValueBit(ValueBit::ConstZero); 1000 1001 return std::make_pair(Interesting = true, &Bits); 1002 } 1003 break; 1004 case ISD::SRL: 1005 if (isa<ConstantSDNode>(V.getOperand(1))) { 1006 unsigned ShiftAmt = V.getConstantOperandVal(1); 1007 1008 const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second; 1009 1010 for (unsigned i = 0; i < NumBits - ShiftAmt; ++i) 1011 Bits[i] = LHSBits[i + ShiftAmt]; 1012 1013 for (unsigned i = NumBits - ShiftAmt; i < NumBits; ++i) 1014 Bits[i] = ValueBit(ValueBit::ConstZero); 1015 1016 return std::make_pair(Interesting = true, &Bits); 1017 } 1018 break; 1019 case ISD::AND: 1020 if (isa<ConstantSDNode>(V.getOperand(1))) { 1021 uint64_t Mask = V.getConstantOperandVal(1); 1022 1023 const SmallVector<ValueBit, 64> *LHSBits; 1024 // Mark this as interesting, only if the LHS was also interesting. This 1025 // prevents the overall procedure from matching a single immediate 'and' 1026 // (which is non-optimal because such an and might be folded with other 1027 // things if we don't select it here). 1028 std::tie(Interesting, LHSBits) = getValueBits(V.getOperand(0), NumBits); 1029 1030 for (unsigned i = 0; i < NumBits; ++i) 1031 if (((Mask >> i) & 1) == 1) 1032 Bits[i] = (*LHSBits)[i]; 1033 else 1034 Bits[i] = ValueBit(ValueBit::ConstZero); 1035 1036 return std::make_pair(Interesting, &Bits); 1037 } 1038 break; 1039 case ISD::OR: { 1040 const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second; 1041 const auto &RHSBits = *getValueBits(V.getOperand(1), NumBits).second; 1042 1043 bool AllDisjoint = true; 1044 for (unsigned i = 0; i < NumBits; ++i) 1045 if (LHSBits[i].isZero()) 1046 Bits[i] = RHSBits[i]; 1047 else if (RHSBits[i].isZero()) 1048 Bits[i] = LHSBits[i]; 1049 else { 1050 AllDisjoint = false; 1051 break; 1052 } 1053 1054 if (!AllDisjoint) 1055 break; 1056 1057 return std::make_pair(Interesting = true, &Bits); 1058 } 1059 } 1060 1061 for (unsigned i = 0; i < NumBits; ++i) 1062 Bits[i] = ValueBit(V, i); 1063 1064 return std::make_pair(Interesting = false, &Bits); 1065 } 1066 1067 // For each value (except the constant ones), compute the left-rotate amount 1068 // to get it from its original to final position. 1069 void computeRotationAmounts() { 1070 HasZeros = false; 1071 RLAmt.resize(Bits.size()); 1072 for (unsigned i = 0; i < Bits.size(); ++i) 1073 if (Bits[i].hasValue()) { 1074 unsigned VBI = Bits[i].getValueBitIndex(); 1075 if (i >= VBI) 1076 RLAmt[i] = i - VBI; 1077 else 1078 RLAmt[i] = Bits.size() - (VBI - i); 1079 } else if (Bits[i].isZero()) { 1080 HasZeros = true; 1081 RLAmt[i] = UINT32_MAX; 1082 } else { 1083 llvm_unreachable("Unknown value bit type"); 1084 } 1085 } 1086 1087 // Collect groups of consecutive bits with the same underlying value and 1088 // rotation factor. If we're doing late masking, we ignore zeros, otherwise 1089 // they break up groups. 1090 void collectBitGroups(bool LateMask) { 1091 BitGroups.clear(); 1092 1093 unsigned LastRLAmt = RLAmt[0]; 1094 SDValue LastValue = Bits[0].hasValue() ? Bits[0].getValue() : SDValue(); 1095 unsigned LastGroupStartIdx = 0; 1096 for (unsigned i = 1; i < Bits.size(); ++i) { 1097 unsigned ThisRLAmt = RLAmt[i]; 1098 SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue(); 1099 if (LateMask && !ThisValue) { 1100 ThisValue = LastValue; 1101 ThisRLAmt = LastRLAmt; 1102 // If we're doing late masking, then the first bit group always starts 1103 // at zero (even if the first bits were zero). 1104 if (BitGroups.empty()) 1105 LastGroupStartIdx = 0; 1106 } 1107 1108 // If this bit has the same underlying value and the same rotate factor as 1109 // the last one, then they're part of the same group. 1110 if (ThisRLAmt == LastRLAmt && ThisValue == LastValue) 1111 continue; 1112 1113 if (LastValue.getNode()) 1114 BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx, 1115 i-1)); 1116 LastRLAmt = ThisRLAmt; 1117 LastValue = ThisValue; 1118 LastGroupStartIdx = i; 1119 } 1120 if (LastValue.getNode()) 1121 BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx, 1122 Bits.size()-1)); 1123 1124 if (BitGroups.empty()) 1125 return; 1126 1127 // We might be able to combine the first and last groups. 1128 if (BitGroups.size() > 1) { 1129 // If the first and last groups are the same, then remove the first group 1130 // in favor of the last group, making the ending index of the last group 1131 // equal to the ending index of the to-be-removed first group. 1132 if (BitGroups[0].StartIdx == 0 && 1133 BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 && 1134 BitGroups[0].V == BitGroups[BitGroups.size()-1].V && 1135 BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) { 1136 DEBUG(dbgs() << "\tcombining final bit group with initial one\n"); 1137 BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx; 1138 BitGroups.erase(BitGroups.begin()); 1139 } 1140 } 1141 } 1142 1143 // Take all (SDValue, RLAmt) pairs and sort them by the number of groups 1144 // associated with each. If there is a degeneracy, pick the one that occurs 1145 // first (in the final value). 1146 void collectValueRotInfo() { 1147 ValueRots.clear(); 1148 1149 for (auto &BG : BitGroups) { 1150 unsigned RLAmtKey = BG.RLAmt + (BG.Repl32 ? 64 : 0); 1151 ValueRotInfo &VRI = ValueRots[std::make_pair(BG.V, RLAmtKey)]; 1152 VRI.V = BG.V; 1153 VRI.RLAmt = BG.RLAmt; 1154 VRI.Repl32 = BG.Repl32; 1155 VRI.NumGroups += 1; 1156 VRI.FirstGroupStartIdx = std::min(VRI.FirstGroupStartIdx, BG.StartIdx); 1157 } 1158 1159 // Now that we've collected the various ValueRotInfo instances, we need to 1160 // sort them. 1161 ValueRotsVec.clear(); 1162 for (auto &I : ValueRots) { 1163 ValueRotsVec.push_back(I.second); 1164 } 1165 std::sort(ValueRotsVec.begin(), ValueRotsVec.end()); 1166 } 1167 1168 // In 64-bit mode, rlwinm and friends have a rotation operator that 1169 // replicates the low-order 32 bits into the high-order 32-bits. The mask 1170 // indices of these instructions can only be in the lower 32 bits, so they 1171 // can only represent some 64-bit bit groups. However, when they can be used, 1172 // the 32-bit replication can be used to represent, as a single bit group, 1173 // otherwise separate bit groups. We'll convert to replicated-32-bit bit 1174 // groups when possible. Returns true if any of the bit groups were 1175 // converted. 1176 void assignRepl32BitGroups() { 1177 // If we have bits like this: 1178 // 1179 // Indices: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 1180 // V bits: ... 7 6 5 4 3 2 1 0 31 30 29 28 27 26 25 24 1181 // Groups: | RLAmt = 8 | RLAmt = 40 | 1182 // 1183 // But, making use of a 32-bit operation that replicates the low-order 32 1184 // bits into the high-order 32 bits, this can be one bit group with a RLAmt 1185 // of 8. 1186 1187 auto IsAllLow32 = [this](BitGroup & BG) { 1188 if (BG.StartIdx <= BG.EndIdx) { 1189 for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i) { 1190 if (!Bits[i].hasValue()) 1191 continue; 1192 if (Bits[i].getValueBitIndex() >= 32) 1193 return false; 1194 } 1195 } else { 1196 for (unsigned i = BG.StartIdx; i < Bits.size(); ++i) { 1197 if (!Bits[i].hasValue()) 1198 continue; 1199 if (Bits[i].getValueBitIndex() >= 32) 1200 return false; 1201 } 1202 for (unsigned i = 0; i <= BG.EndIdx; ++i) { 1203 if (!Bits[i].hasValue()) 1204 continue; 1205 if (Bits[i].getValueBitIndex() >= 32) 1206 return false; 1207 } 1208 } 1209 1210 return true; 1211 }; 1212 1213 for (auto &BG : BitGroups) { 1214 if (BG.StartIdx < 32 && BG.EndIdx < 32) { 1215 if (IsAllLow32(BG)) { 1216 if (BG.RLAmt >= 32) { 1217 BG.RLAmt -= 32; 1218 BG.Repl32CR = true; 1219 } 1220 1221 BG.Repl32 = true; 1222 1223 DEBUG(dbgs() << "\t32-bit replicated bit group for " << 1224 BG.V.getNode() << " RLAmt = " << BG.RLAmt << 1225 " [" << BG.StartIdx << ", " << BG.EndIdx << "]\n"); 1226 } 1227 } 1228 } 1229 1230 // Now walk through the bit groups, consolidating where possible. 1231 for (auto I = BitGroups.begin(); I != BitGroups.end();) { 1232 // We might want to remove this bit group by merging it with the previous 1233 // group (which might be the ending group). 1234 auto IP = (I == BitGroups.begin()) ? 1235 std::prev(BitGroups.end()) : std::prev(I); 1236 if (I->Repl32 && IP->Repl32 && I->V == IP->V && I->RLAmt == IP->RLAmt && 1237 I->StartIdx == (IP->EndIdx + 1) % 64 && I != IP) { 1238 1239 DEBUG(dbgs() << "\tcombining 32-bit replicated bit group for " << 1240 I->V.getNode() << " RLAmt = " << I->RLAmt << 1241 " [" << I->StartIdx << ", " << I->EndIdx << 1242 "] with group with range [" << 1243 IP->StartIdx << ", " << IP->EndIdx << "]\n"); 1244 1245 IP->EndIdx = I->EndIdx; 1246 IP->Repl32CR = IP->Repl32CR || I->Repl32CR; 1247 IP->Repl32Coalesced = true; 1248 I = BitGroups.erase(I); 1249 continue; 1250 } else { 1251 // There is a special case worth handling: If there is a single group 1252 // covering the entire upper 32 bits, and it can be merged with both 1253 // the next and previous groups (which might be the same group), then 1254 // do so. If it is the same group (so there will be only one group in 1255 // total), then we need to reverse the order of the range so that it 1256 // covers the entire 64 bits. 1257 if (I->StartIdx == 32 && I->EndIdx == 63) { 1258 assert(std::next(I) == BitGroups.end() && 1259 "bit group ends at index 63 but there is another?"); 1260 auto IN = BitGroups.begin(); 1261 1262 if (IP->Repl32 && IN->Repl32 && I->V == IP->V && I->V == IN->V && 1263 (I->RLAmt % 32) == IP->RLAmt && (I->RLAmt % 32) == IN->RLAmt && 1264 IP->EndIdx == 31 && IN->StartIdx == 0 && I != IP && 1265 IsAllLow32(*I)) { 1266 1267 DEBUG(dbgs() << "\tcombining bit group for " << 1268 I->V.getNode() << " RLAmt = " << I->RLAmt << 1269 " [" << I->StartIdx << ", " << I->EndIdx << 1270 "] with 32-bit replicated groups with ranges [" << 1271 IP->StartIdx << ", " << IP->EndIdx << "] and [" << 1272 IN->StartIdx << ", " << IN->EndIdx << "]\n"); 1273 1274 if (IP == IN) { 1275 // There is only one other group; change it to cover the whole 1276 // range (backward, so that it can still be Repl32 but cover the 1277 // whole 64-bit range). 1278 IP->StartIdx = 31; 1279 IP->EndIdx = 30; 1280 IP->Repl32CR = IP->Repl32CR || I->RLAmt >= 32; 1281 IP->Repl32Coalesced = true; 1282 I = BitGroups.erase(I); 1283 } else { 1284 // There are two separate groups, one before this group and one 1285 // after us (at the beginning). We're going to remove this group, 1286 // but also the group at the very beginning. 1287 IP->EndIdx = IN->EndIdx; 1288 IP->Repl32CR = IP->Repl32CR || IN->Repl32CR || I->RLAmt >= 32; 1289 IP->Repl32Coalesced = true; 1290 I = BitGroups.erase(I); 1291 BitGroups.erase(BitGroups.begin()); 1292 } 1293 1294 // This must be the last group in the vector (and we might have 1295 // just invalidated the iterator above), so break here. 1296 break; 1297 } 1298 } 1299 } 1300 1301 ++I; 1302 } 1303 } 1304 1305 SDValue getI32Imm(unsigned Imm, const SDLoc &dl) { 1306 return CurDAG->getTargetConstant(Imm, dl, MVT::i32); 1307 } 1308 1309 uint64_t getZerosMask() { 1310 uint64_t Mask = 0; 1311 for (unsigned i = 0; i < Bits.size(); ++i) { 1312 if (Bits[i].hasValue()) 1313 continue; 1314 Mask |= (UINT64_C(1) << i); 1315 } 1316 1317 return ~Mask; 1318 } 1319 1320 // Depending on the number of groups for a particular value, it might be 1321 // better to rotate, mask explicitly (using andi/andis), and then or the 1322 // result. Select this part of the result first. 1323 void SelectAndParts32(const SDLoc &dl, SDValue &Res, unsigned *InstCnt) { 1324 if (BPermRewriterNoMasking) 1325 return; 1326 1327 for (ValueRotInfo &VRI : ValueRotsVec) { 1328 unsigned Mask = 0; 1329 for (unsigned i = 0; i < Bits.size(); ++i) { 1330 if (!Bits[i].hasValue() || Bits[i].getValue() != VRI.V) 1331 continue; 1332 if (RLAmt[i] != VRI.RLAmt) 1333 continue; 1334 Mask |= (1u << i); 1335 } 1336 1337 // Compute the masks for andi/andis that would be necessary. 1338 unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16; 1339 assert((ANDIMask != 0 || ANDISMask != 0) && 1340 "No set bits in mask for value bit groups"); 1341 bool NeedsRotate = VRI.RLAmt != 0; 1342 1343 // We're trying to minimize the number of instructions. If we have one 1344 // group, using one of andi/andis can break even. If we have three 1345 // groups, we can use both andi and andis and break even (to use both 1346 // andi and andis we also need to or the results together). We need four 1347 // groups if we also need to rotate. To use andi/andis we need to do more 1348 // than break even because rotate-and-mask instructions tend to be easier 1349 // to schedule. 1350 1351 // FIXME: We've biased here against using andi/andis, which is right for 1352 // POWER cores, but not optimal everywhere. For example, on the A2, 1353 // andi/andis have single-cycle latency whereas the rotate-and-mask 1354 // instructions take two cycles, and it would be better to bias toward 1355 // andi/andis in break-even cases. 1356 1357 unsigned NumAndInsts = (unsigned) NeedsRotate + 1358 (unsigned) (ANDIMask != 0) + 1359 (unsigned) (ANDISMask != 0) + 1360 (unsigned) (ANDIMask != 0 && ANDISMask != 0) + 1361 (unsigned) (bool) Res; 1362 1363 DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() << 1364 " RL: " << VRI.RLAmt << ":" << 1365 "\n\t\t\tisel using masking: " << NumAndInsts << 1366 " using rotates: " << VRI.NumGroups << "\n"); 1367 1368 if (NumAndInsts >= VRI.NumGroups) 1369 continue; 1370 1371 DEBUG(dbgs() << "\t\t\t\tusing masking\n"); 1372 1373 if (InstCnt) *InstCnt += NumAndInsts; 1374 1375 SDValue VRot; 1376 if (VRI.RLAmt) { 1377 SDValue Ops[] = 1378 { VRI.V, getI32Imm(VRI.RLAmt, dl), getI32Imm(0, dl), 1379 getI32Imm(31, dl) }; 1380 VRot = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, 1381 Ops), 0); 1382 } else { 1383 VRot = VRI.V; 1384 } 1385 1386 SDValue ANDIVal, ANDISVal; 1387 if (ANDIMask != 0) 1388 ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32, 1389 VRot, getI32Imm(ANDIMask, dl)), 0); 1390 if (ANDISMask != 0) 1391 ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32, 1392 VRot, getI32Imm(ANDISMask, dl)), 0); 1393 1394 SDValue TotalVal; 1395 if (!ANDIVal) 1396 TotalVal = ANDISVal; 1397 else if (!ANDISVal) 1398 TotalVal = ANDIVal; 1399 else 1400 TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32, 1401 ANDIVal, ANDISVal), 0); 1402 1403 if (!Res) 1404 Res = TotalVal; 1405 else 1406 Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32, 1407 Res, TotalVal), 0); 1408 1409 // Now, remove all groups with this underlying value and rotation 1410 // factor. 1411 eraseMatchingBitGroups([VRI](const BitGroup &BG) { 1412 return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt; 1413 }); 1414 } 1415 } 1416 1417 // Instruction selection for the 32-bit case. 1418 SDNode *Select32(SDNode *N, bool LateMask, unsigned *InstCnt) { 1419 SDLoc dl(N); 1420 SDValue Res; 1421 1422 if (InstCnt) *InstCnt = 0; 1423 1424 // Take care of cases that should use andi/andis first. 1425 SelectAndParts32(dl, Res, InstCnt); 1426 1427 // If we've not yet selected a 'starting' instruction, and we have no zeros 1428 // to fill in, select the (Value, RLAmt) with the highest priority (largest 1429 // number of groups), and start with this rotated value. 1430 if ((!HasZeros || LateMask) && !Res) { 1431 ValueRotInfo &VRI = ValueRotsVec[0]; 1432 if (VRI.RLAmt) { 1433 if (InstCnt) *InstCnt += 1; 1434 SDValue Ops[] = 1435 { VRI.V, getI32Imm(VRI.RLAmt, dl), getI32Imm(0, dl), 1436 getI32Imm(31, dl) }; 1437 Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 1438 0); 1439 } else { 1440 Res = VRI.V; 1441 } 1442 1443 // Now, remove all groups with this underlying value and rotation factor. 1444 eraseMatchingBitGroups([VRI](const BitGroup &BG) { 1445 return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt; 1446 }); 1447 } 1448 1449 if (InstCnt) *InstCnt += BitGroups.size(); 1450 1451 // Insert the other groups (one at a time). 1452 for (auto &BG : BitGroups) { 1453 if (!Res) { 1454 SDValue Ops[] = 1455 { BG.V, getI32Imm(BG.RLAmt, dl), 1456 getI32Imm(Bits.size() - BG.EndIdx - 1, dl), 1457 getI32Imm(Bits.size() - BG.StartIdx - 1, dl) }; 1458 Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); 1459 } else { 1460 SDValue Ops[] = 1461 { Res, BG.V, getI32Imm(BG.RLAmt, dl), 1462 getI32Imm(Bits.size() - BG.EndIdx - 1, dl), 1463 getI32Imm(Bits.size() - BG.StartIdx - 1, dl) }; 1464 Res = SDValue(CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops), 0); 1465 } 1466 } 1467 1468 if (LateMask) { 1469 unsigned Mask = (unsigned) getZerosMask(); 1470 1471 unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16; 1472 assert((ANDIMask != 0 || ANDISMask != 0) && 1473 "No set bits in zeros mask?"); 1474 1475 if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) + 1476 (unsigned) (ANDISMask != 0) + 1477 (unsigned) (ANDIMask != 0 && ANDISMask != 0); 1478 1479 SDValue ANDIVal, ANDISVal; 1480 if (ANDIMask != 0) 1481 ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32, 1482 Res, getI32Imm(ANDIMask, dl)), 0); 1483 if (ANDISMask != 0) 1484 ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32, 1485 Res, getI32Imm(ANDISMask, dl)), 0); 1486 1487 if (!ANDIVal) 1488 Res = ANDISVal; 1489 else if (!ANDISVal) 1490 Res = ANDIVal; 1491 else 1492 Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32, 1493 ANDIVal, ANDISVal), 0); 1494 } 1495 1496 return Res.getNode(); 1497 } 1498 1499 unsigned SelectRotMask64Count(unsigned RLAmt, bool Repl32, 1500 unsigned MaskStart, unsigned MaskEnd, 1501 bool IsIns) { 1502 // In the notation used by the instructions, 'start' and 'end' are reversed 1503 // because bits are counted from high to low order. 1504 unsigned InstMaskStart = 64 - MaskEnd - 1, 1505 InstMaskEnd = 64 - MaskStart - 1; 1506 1507 if (Repl32) 1508 return 1; 1509 1510 if ((!IsIns && (InstMaskEnd == 63 || InstMaskStart == 0)) || 1511 InstMaskEnd == 63 - RLAmt) 1512 return 1; 1513 1514 return 2; 1515 } 1516 1517 // For 64-bit values, not all combinations of rotates and masks are 1518 // available. Produce one if it is available. 1519 SDValue SelectRotMask64(SDValue V, const SDLoc &dl, unsigned RLAmt, 1520 bool Repl32, unsigned MaskStart, unsigned MaskEnd, 1521 unsigned *InstCnt = nullptr) { 1522 // In the notation used by the instructions, 'start' and 'end' are reversed 1523 // because bits are counted from high to low order. 1524 unsigned InstMaskStart = 64 - MaskEnd - 1, 1525 InstMaskEnd = 64 - MaskStart - 1; 1526 1527 if (InstCnt) *InstCnt += 1; 1528 1529 if (Repl32) { 1530 // This rotation amount assumes that the lower 32 bits of the quantity 1531 // are replicated in the high 32 bits by the rotation operator (which is 1532 // done by rlwinm and friends). 1533 assert(InstMaskStart >= 32 && "Mask cannot start out of range"); 1534 assert(InstMaskEnd >= 32 && "Mask cannot end out of range"); 1535 SDValue Ops[] = 1536 { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart - 32, dl), 1537 getI32Imm(InstMaskEnd - 32, dl) }; 1538 return SDValue(CurDAG->getMachineNode(PPC::RLWINM8, dl, MVT::i64, 1539 Ops), 0); 1540 } 1541 1542 if (InstMaskEnd == 63) { 1543 SDValue Ops[] = 1544 { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) }; 1545 return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Ops), 0); 1546 } 1547 1548 if (InstMaskStart == 0) { 1549 SDValue Ops[] = 1550 { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskEnd, dl) }; 1551 return SDValue(CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Ops), 0); 1552 } 1553 1554 if (InstMaskEnd == 63 - RLAmt) { 1555 SDValue Ops[] = 1556 { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) }; 1557 return SDValue(CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, Ops), 0); 1558 } 1559 1560 // We cannot do this with a single instruction, so we'll use two. The 1561 // problem is that we're not free to choose both a rotation amount and mask 1562 // start and end independently. We can choose an arbitrary mask start and 1563 // end, but then the rotation amount is fixed. Rotation, however, can be 1564 // inverted, and so by applying an "inverse" rotation first, we can get the 1565 // desired result. 1566 if (InstCnt) *InstCnt += 1; 1567 1568 // The rotation mask for the second instruction must be MaskStart. 1569 unsigned RLAmt2 = MaskStart; 1570 // The first instruction must rotate V so that the overall rotation amount 1571 // is RLAmt. 1572 unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64; 1573 if (RLAmt1) 1574 V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63); 1575 return SelectRotMask64(V, dl, RLAmt2, false, MaskStart, MaskEnd); 1576 } 1577 1578 // For 64-bit values, not all combinations of rotates and masks are 1579 // available. Produce a rotate-mask-and-insert if one is available. 1580 SDValue SelectRotMaskIns64(SDValue Base, SDValue V, const SDLoc &dl, 1581 unsigned RLAmt, bool Repl32, unsigned MaskStart, 1582 unsigned MaskEnd, unsigned *InstCnt = nullptr) { 1583 // In the notation used by the instructions, 'start' and 'end' are reversed 1584 // because bits are counted from high to low order. 1585 unsigned InstMaskStart = 64 - MaskEnd - 1, 1586 InstMaskEnd = 64 - MaskStart - 1; 1587 1588 if (InstCnt) *InstCnt += 1; 1589 1590 if (Repl32) { 1591 // This rotation amount assumes that the lower 32 bits of the quantity 1592 // are replicated in the high 32 bits by the rotation operator (which is 1593 // done by rlwinm and friends). 1594 assert(InstMaskStart >= 32 && "Mask cannot start out of range"); 1595 assert(InstMaskEnd >= 32 && "Mask cannot end out of range"); 1596 SDValue Ops[] = 1597 { Base, V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart - 32, dl), 1598 getI32Imm(InstMaskEnd - 32, dl) }; 1599 return SDValue(CurDAG->getMachineNode(PPC::RLWIMI8, dl, MVT::i64, 1600 Ops), 0); 1601 } 1602 1603 if (InstMaskEnd == 63 - RLAmt) { 1604 SDValue Ops[] = 1605 { Base, V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) }; 1606 return SDValue(CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops), 0); 1607 } 1608 1609 // We cannot do this with a single instruction, so we'll use two. The 1610 // problem is that we're not free to choose both a rotation amount and mask 1611 // start and end independently. We can choose an arbitrary mask start and 1612 // end, but then the rotation amount is fixed. Rotation, however, can be 1613 // inverted, and so by applying an "inverse" rotation first, we can get the 1614 // desired result. 1615 if (InstCnt) *InstCnt += 1; 1616 1617 // The rotation mask for the second instruction must be MaskStart. 1618 unsigned RLAmt2 = MaskStart; 1619 // The first instruction must rotate V so that the overall rotation amount 1620 // is RLAmt. 1621 unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64; 1622 if (RLAmt1) 1623 V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63); 1624 return SelectRotMaskIns64(Base, V, dl, RLAmt2, false, MaskStart, MaskEnd); 1625 } 1626 1627 void SelectAndParts64(const SDLoc &dl, SDValue &Res, unsigned *InstCnt) { 1628 if (BPermRewriterNoMasking) 1629 return; 1630 1631 // The idea here is the same as in the 32-bit version, but with additional 1632 // complications from the fact that Repl32 might be true. Because we 1633 // aggressively convert bit groups to Repl32 form (which, for small 1634 // rotation factors, involves no other change), and then coalesce, it might 1635 // be the case that a single 64-bit masking operation could handle both 1636 // some Repl32 groups and some non-Repl32 groups. If converting to Repl32 1637 // form allowed coalescing, then we must use a 32-bit rotaton in order to 1638 // completely capture the new combined bit group. 1639 1640 for (ValueRotInfo &VRI : ValueRotsVec) { 1641 uint64_t Mask = 0; 1642 1643 // We need to add to the mask all bits from the associated bit groups. 1644 // If Repl32 is false, we need to add bits from bit groups that have 1645 // Repl32 true, but are trivially convertable to Repl32 false. Such a 1646 // group is trivially convertable if it overlaps only with the lower 32 1647 // bits, and the group has not been coalesced. 1648 auto MatchingBG = [VRI](const BitGroup &BG) { 1649 if (VRI.V != BG.V) 1650 return false; 1651 1652 unsigned EffRLAmt = BG.RLAmt; 1653 if (!VRI.Repl32 && BG.Repl32) { 1654 if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx <= BG.EndIdx && 1655 !BG.Repl32Coalesced) { 1656 if (BG.Repl32CR) 1657 EffRLAmt += 32; 1658 } else { 1659 return false; 1660 } 1661 } else if (VRI.Repl32 != BG.Repl32) { 1662 return false; 1663 } 1664 1665 return VRI.RLAmt == EffRLAmt; 1666 }; 1667 1668 for (auto &BG : BitGroups) { 1669 if (!MatchingBG(BG)) 1670 continue; 1671 1672 if (BG.StartIdx <= BG.EndIdx) { 1673 for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i) 1674 Mask |= (UINT64_C(1) << i); 1675 } else { 1676 for (unsigned i = BG.StartIdx; i < Bits.size(); ++i) 1677 Mask |= (UINT64_C(1) << i); 1678 for (unsigned i = 0; i <= BG.EndIdx; ++i) 1679 Mask |= (UINT64_C(1) << i); 1680 } 1681 } 1682 1683 // We can use the 32-bit andi/andis technique if the mask does not 1684 // require any higher-order bits. This can save an instruction compared 1685 // to always using the general 64-bit technique. 1686 bool Use32BitInsts = isUInt<32>(Mask); 1687 // Compute the masks for andi/andis that would be necessary. 1688 unsigned ANDIMask = (Mask & UINT16_MAX), 1689 ANDISMask = (Mask >> 16) & UINT16_MAX; 1690 1691 bool NeedsRotate = VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask)); 1692 1693 unsigned NumAndInsts = (unsigned) NeedsRotate + 1694 (unsigned) (bool) Res; 1695 if (Use32BitInsts) 1696 NumAndInsts += (unsigned) (ANDIMask != 0) + (unsigned) (ANDISMask != 0) + 1697 (unsigned) (ANDIMask != 0 && ANDISMask != 0); 1698 else 1699 NumAndInsts += getInt64Count(Mask) + /* and */ 1; 1700 1701 unsigned NumRLInsts = 0; 1702 bool FirstBG = true; 1703 bool MoreBG = false; 1704 for (auto &BG : BitGroups) { 1705 if (!MatchingBG(BG)) { 1706 MoreBG = true; 1707 continue; 1708 } 1709 NumRLInsts += 1710 SelectRotMask64Count(BG.RLAmt, BG.Repl32, BG.StartIdx, BG.EndIdx, 1711 !FirstBG); 1712 FirstBG = false; 1713 } 1714 1715 DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() << 1716 " RL: " << VRI.RLAmt << (VRI.Repl32 ? " (32):" : ":") << 1717 "\n\t\t\tisel using masking: " << NumAndInsts << 1718 " using rotates: " << NumRLInsts << "\n"); 1719 1720 // When we'd use andi/andis, we bias toward using the rotates (andi only 1721 // has a record form, and is cracked on POWER cores). However, when using 1722 // general 64-bit constant formation, bias toward the constant form, 1723 // because that exposes more opportunities for CSE. 1724 if (NumAndInsts > NumRLInsts) 1725 continue; 1726 // When merging multiple bit groups, instruction or is used. 1727 // But when rotate is used, rldimi can inert the rotated value into any 1728 // register, so instruction or can be avoided. 1729 if ((Use32BitInsts || MoreBG) && NumAndInsts == NumRLInsts) 1730 continue; 1731 1732 DEBUG(dbgs() << "\t\t\t\tusing masking\n"); 1733 1734 if (InstCnt) *InstCnt += NumAndInsts; 1735 1736 SDValue VRot; 1737 // We actually need to generate a rotation if we have a non-zero rotation 1738 // factor or, in the Repl32 case, if we care about any of the 1739 // higher-order replicated bits. In the latter case, we generate a mask 1740 // backward so that it actually includes the entire 64 bits. 1741 if (VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask))) 1742 VRot = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32, 1743 VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63); 1744 else 1745 VRot = VRI.V; 1746 1747 SDValue TotalVal; 1748 if (Use32BitInsts) { 1749 assert((ANDIMask != 0 || ANDISMask != 0) && 1750 "No set bits in mask when using 32-bit ands for 64-bit value"); 1751 1752 SDValue ANDIVal, ANDISVal; 1753 if (ANDIMask != 0) 1754 ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64, 1755 VRot, getI32Imm(ANDIMask, dl)), 0); 1756 if (ANDISMask != 0) 1757 ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64, 1758 VRot, getI32Imm(ANDISMask, dl)), 0); 1759 1760 if (!ANDIVal) 1761 TotalVal = ANDISVal; 1762 else if (!ANDISVal) 1763 TotalVal = ANDIVal; 1764 else 1765 TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, 1766 ANDIVal, ANDISVal), 0); 1767 } else { 1768 TotalVal = SDValue(getInt64(CurDAG, dl, Mask), 0); 1769 TotalVal = 1770 SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, 1771 VRot, TotalVal), 0); 1772 } 1773 1774 if (!Res) 1775 Res = TotalVal; 1776 else 1777 Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, 1778 Res, TotalVal), 0); 1779 1780 // Now, remove all groups with this underlying value and rotation 1781 // factor. 1782 eraseMatchingBitGroups(MatchingBG); 1783 } 1784 } 1785 1786 // Instruction selection for the 64-bit case. 1787 SDNode *Select64(SDNode *N, bool LateMask, unsigned *InstCnt) { 1788 SDLoc dl(N); 1789 SDValue Res; 1790 1791 if (InstCnt) *InstCnt = 0; 1792 1793 // Take care of cases that should use andi/andis first. 1794 SelectAndParts64(dl, Res, InstCnt); 1795 1796 // If we've not yet selected a 'starting' instruction, and we have no zeros 1797 // to fill in, select the (Value, RLAmt) with the highest priority (largest 1798 // number of groups), and start with this rotated value. 1799 if ((!HasZeros || LateMask) && !Res) { 1800 // If we have both Repl32 groups and non-Repl32 groups, the non-Repl32 1801 // groups will come first, and so the VRI representing the largest number 1802 // of groups might not be first (it might be the first Repl32 groups). 1803 unsigned MaxGroupsIdx = 0; 1804 if (!ValueRotsVec[0].Repl32) { 1805 for (unsigned i = 0, ie = ValueRotsVec.size(); i < ie; ++i) 1806 if (ValueRotsVec[i].Repl32) { 1807 if (ValueRotsVec[i].NumGroups > ValueRotsVec[0].NumGroups) 1808 MaxGroupsIdx = i; 1809 break; 1810 } 1811 } 1812 1813 ValueRotInfo &VRI = ValueRotsVec[MaxGroupsIdx]; 1814 bool NeedsRotate = false; 1815 if (VRI.RLAmt) { 1816 NeedsRotate = true; 1817 } else if (VRI.Repl32) { 1818 for (auto &BG : BitGroups) { 1819 if (BG.V != VRI.V || BG.RLAmt != VRI.RLAmt || 1820 BG.Repl32 != VRI.Repl32) 1821 continue; 1822 1823 // We don't need a rotate if the bit group is confined to the lower 1824 // 32 bits. 1825 if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx < BG.EndIdx) 1826 continue; 1827 1828 NeedsRotate = true; 1829 break; 1830 } 1831 } 1832 1833 if (NeedsRotate) 1834 Res = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32, 1835 VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63, 1836 InstCnt); 1837 else 1838 Res = VRI.V; 1839 1840 // Now, remove all groups with this underlying value and rotation factor. 1841 if (Res) 1842 eraseMatchingBitGroups([VRI](const BitGroup &BG) { 1843 return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt && 1844 BG.Repl32 == VRI.Repl32; 1845 }); 1846 } 1847 1848 // Because 64-bit rotates are more flexible than inserts, we might have a 1849 // preference regarding which one we do first (to save one instruction). 1850 if (!Res) 1851 for (auto I = BitGroups.begin(), IE = BitGroups.end(); I != IE; ++I) { 1852 if (SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx, 1853 false) < 1854 SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx, 1855 true)) { 1856 if (I != BitGroups.begin()) { 1857 BitGroup BG = *I; 1858 BitGroups.erase(I); 1859 BitGroups.insert(BitGroups.begin(), BG); 1860 } 1861 1862 break; 1863 } 1864 } 1865 1866 // Insert the other groups (one at a time). 1867 for (auto &BG : BitGroups) { 1868 if (!Res) 1869 Res = SelectRotMask64(BG.V, dl, BG.RLAmt, BG.Repl32, BG.StartIdx, 1870 BG.EndIdx, InstCnt); 1871 else 1872 Res = SelectRotMaskIns64(Res, BG.V, dl, BG.RLAmt, BG.Repl32, 1873 BG.StartIdx, BG.EndIdx, InstCnt); 1874 } 1875 1876 if (LateMask) { 1877 uint64_t Mask = getZerosMask(); 1878 1879 // We can use the 32-bit andi/andis technique if the mask does not 1880 // require any higher-order bits. This can save an instruction compared 1881 // to always using the general 64-bit technique. 1882 bool Use32BitInsts = isUInt<32>(Mask); 1883 // Compute the masks for andi/andis that would be necessary. 1884 unsigned ANDIMask = (Mask & UINT16_MAX), 1885 ANDISMask = (Mask >> 16) & UINT16_MAX; 1886 1887 if (Use32BitInsts) { 1888 assert((ANDIMask != 0 || ANDISMask != 0) && 1889 "No set bits in mask when using 32-bit ands for 64-bit value"); 1890 1891 if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) + 1892 (unsigned) (ANDISMask != 0) + 1893 (unsigned) (ANDIMask != 0 && ANDISMask != 0); 1894 1895 SDValue ANDIVal, ANDISVal; 1896 if (ANDIMask != 0) 1897 ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64, 1898 Res, getI32Imm(ANDIMask, dl)), 0); 1899 if (ANDISMask != 0) 1900 ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64, 1901 Res, getI32Imm(ANDISMask, dl)), 0); 1902 1903 if (!ANDIVal) 1904 Res = ANDISVal; 1905 else if (!ANDISVal) 1906 Res = ANDIVal; 1907 else 1908 Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, 1909 ANDIVal, ANDISVal), 0); 1910 } else { 1911 if (InstCnt) *InstCnt += getInt64Count(Mask) + /* and */ 1; 1912 1913 SDValue MaskVal = SDValue(getInt64(CurDAG, dl, Mask), 0); 1914 Res = 1915 SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, 1916 Res, MaskVal), 0); 1917 } 1918 } 1919 1920 return Res.getNode(); 1921 } 1922 1923 SDNode *Select(SDNode *N, bool LateMask, unsigned *InstCnt = nullptr) { 1924 // Fill in BitGroups. 1925 collectBitGroups(LateMask); 1926 if (BitGroups.empty()) 1927 return nullptr; 1928 1929 // For 64-bit values, figure out when we can use 32-bit instructions. 1930 if (Bits.size() == 64) 1931 assignRepl32BitGroups(); 1932 1933 // Fill in ValueRotsVec. 1934 collectValueRotInfo(); 1935 1936 if (Bits.size() == 32) { 1937 return Select32(N, LateMask, InstCnt); 1938 } else { 1939 assert(Bits.size() == 64 && "Not 64 bits here?"); 1940 return Select64(N, LateMask, InstCnt); 1941 } 1942 1943 return nullptr; 1944 } 1945 1946 void eraseMatchingBitGroups(function_ref<bool(const BitGroup &)> F) { 1947 BitGroups.erase(remove_if(BitGroups, F), BitGroups.end()); 1948 } 1949 1950 SmallVector<ValueBit, 64> Bits; 1951 1952 bool HasZeros; 1953 SmallVector<unsigned, 64> RLAmt; 1954 1955 SmallVector<BitGroup, 16> BitGroups; 1956 1957 DenseMap<std::pair<SDValue, unsigned>, ValueRotInfo> ValueRots; 1958 SmallVector<ValueRotInfo, 16> ValueRotsVec; 1959 1960 SelectionDAG *CurDAG; 1961 1962 public: 1963 BitPermutationSelector(SelectionDAG *DAG) 1964 : CurDAG(DAG) {} 1965 1966 // Here we try to match complex bit permutations into a set of 1967 // rotate-and-shift/shift/and/or instructions, using a set of heuristics 1968 // known to produce optimial code for common cases (like i32 byte swapping). 1969 SDNode *Select(SDNode *N) { 1970 Memoizer.clear(); 1971 auto Result = 1972 getValueBits(SDValue(N, 0), N->getValueType(0).getSizeInBits()); 1973 if (!Result.first) 1974 return nullptr; 1975 Bits = std::move(*Result.second); 1976 1977 DEBUG(dbgs() << "Considering bit-permutation-based instruction" 1978 " selection for: "); 1979 DEBUG(N->dump(CurDAG)); 1980 1981 // Fill it RLAmt and set HasZeros. 1982 computeRotationAmounts(); 1983 1984 if (!HasZeros) 1985 return Select(N, false); 1986 1987 // We currently have two techniques for handling results with zeros: early 1988 // masking (the default) and late masking. Late masking is sometimes more 1989 // efficient, but because the structure of the bit groups is different, it 1990 // is hard to tell without generating both and comparing the results. With 1991 // late masking, we ignore zeros in the resulting value when inserting each 1992 // set of bit groups, and then mask in the zeros at the end. With early 1993 // masking, we only insert the non-zero parts of the result at every step. 1994 1995 unsigned InstCnt, InstCntLateMask; 1996 DEBUG(dbgs() << "\tEarly masking:\n"); 1997 SDNode *RN = Select(N, false, &InstCnt); 1998 DEBUG(dbgs() << "\t\tisel would use " << InstCnt << " instructions\n"); 1999 2000 DEBUG(dbgs() << "\tLate masking:\n"); 2001 SDNode *RNLM = Select(N, true, &InstCntLateMask); 2002 DEBUG(dbgs() << "\t\tisel would use " << InstCntLateMask << 2003 " instructions\n"); 2004 2005 if (InstCnt <= InstCntLateMask) { 2006 DEBUG(dbgs() << "\tUsing early-masking for isel\n"); 2007 return RN; 2008 } 2009 2010 DEBUG(dbgs() << "\tUsing late-masking for isel\n"); 2011 return RNLM; 2012 } 2013 }; 2014 2015 } // end anonymous namespace 2016 2017 bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) { 2018 if (N->getValueType(0) != MVT::i32 && 2019 N->getValueType(0) != MVT::i64) 2020 return false; 2021 2022 if (!UseBitPermRewriter) 2023 return false; 2024 2025 switch (N->getOpcode()) { 2026 default: break; 2027 case ISD::ROTL: 2028 case ISD::SHL: 2029 case ISD::SRL: 2030 case ISD::AND: 2031 case ISD::OR: { 2032 BitPermutationSelector BPS(CurDAG); 2033 if (SDNode *New = BPS.Select(N)) { 2034 ReplaceNode(N, New); 2035 return true; 2036 } 2037 return false; 2038 } 2039 } 2040 2041 return false; 2042 } 2043 2044 /// SelectCC - Select a comparison of the specified values with the specified 2045 /// condition code, returning the CR# of the expression. 2046 SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, 2047 const SDLoc &dl) { 2048 // Always select the LHS. 2049 unsigned Opc; 2050 2051 if (LHS.getValueType() == MVT::i32) { 2052 unsigned Imm; 2053 if (CC == ISD::SETEQ || CC == ISD::SETNE) { 2054 if (isInt32Immediate(RHS, Imm)) { 2055 // SETEQ/SETNE comparison with 16-bit immediate, fold it. 2056 if (isUInt<16>(Imm)) 2057 return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, 2058 getI32Imm(Imm & 0xFFFF, dl)), 2059 0); 2060 // If this is a 16-bit signed immediate, fold it. 2061 if (isInt<16>((int)Imm)) 2062 return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, 2063 getI32Imm(Imm & 0xFFFF, dl)), 2064 0); 2065 2066 // For non-equality comparisons, the default code would materialize the 2067 // constant, then compare against it, like this: 2068 // lis r2, 4660 2069 // ori r2, r2, 22136 2070 // cmpw cr0, r3, r2 2071 // Since we are just comparing for equality, we can emit this instead: 2072 // xoris r0,r3,0x1234 2073 // cmplwi cr0,r0,0x5678 2074 // beq cr0,L6 2075 SDValue Xor(CurDAG->getMachineNode(PPC::XORIS, dl, MVT::i32, LHS, 2076 getI32Imm(Imm >> 16, dl)), 0); 2077 return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, Xor, 2078 getI32Imm(Imm & 0xFFFF, dl)), 0); 2079 } 2080 Opc = PPC::CMPLW; 2081 } else if (ISD::isUnsignedIntSetCC(CC)) { 2082 if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm)) 2083 return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, 2084 getI32Imm(Imm & 0xFFFF, dl)), 0); 2085 Opc = PPC::CMPLW; 2086 } else { 2087 short SImm; 2088 if (isIntS16Immediate(RHS, SImm)) 2089 return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, 2090 getI32Imm((int)SImm & 0xFFFF, 2091 dl)), 2092 0); 2093 Opc = PPC::CMPW; 2094 } 2095 } else if (LHS.getValueType() == MVT::i64) { 2096 uint64_t Imm; 2097 if (CC == ISD::SETEQ || CC == ISD::SETNE) { 2098 if (isInt64Immediate(RHS.getNode(), Imm)) { 2099 // SETEQ/SETNE comparison with 16-bit immediate, fold it. 2100 if (isUInt<16>(Imm)) 2101 return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, 2102 getI32Imm(Imm & 0xFFFF, dl)), 2103 0); 2104 // If this is a 16-bit signed immediate, fold it. 2105 if (isInt<16>(Imm)) 2106 return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, 2107 getI32Imm(Imm & 0xFFFF, dl)), 2108 0); 2109 2110 // For non-equality comparisons, the default code would materialize the 2111 // constant, then compare against it, like this: 2112 // lis r2, 4660 2113 // ori r2, r2, 22136 2114 // cmpd cr0, r3, r2 2115 // Since we are just comparing for equality, we can emit this instead: 2116 // xoris r0,r3,0x1234 2117 // cmpldi cr0,r0,0x5678 2118 // beq cr0,L6 2119 if (isUInt<32>(Imm)) { 2120 SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS, 2121 getI64Imm(Imm >> 16, dl)), 0); 2122 return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor, 2123 getI64Imm(Imm & 0xFFFF, dl)), 2124 0); 2125 } 2126 } 2127 Opc = PPC::CMPLD; 2128 } else if (ISD::isUnsignedIntSetCC(CC)) { 2129 if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm)) 2130 return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, 2131 getI64Imm(Imm & 0xFFFF, dl)), 0); 2132 Opc = PPC::CMPLD; 2133 } else { 2134 short SImm; 2135 if (isIntS16Immediate(RHS, SImm)) 2136 return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, 2137 getI64Imm(SImm & 0xFFFF, dl)), 2138 0); 2139 Opc = PPC::CMPD; 2140 } 2141 } else if (LHS.getValueType() == MVT::f32) { 2142 Opc = PPC::FCMPUS; 2143 } else { 2144 assert(LHS.getValueType() == MVT::f64 && "Unknown vt!"); 2145 Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD; 2146 } 2147 return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0); 2148 } 2149 2150 static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) { 2151 switch (CC) { 2152 case ISD::SETUEQ: 2153 case ISD::SETONE: 2154 case ISD::SETOLE: 2155 case ISD::SETOGE: 2156 llvm_unreachable("Should be lowered by legalize!"); 2157 default: llvm_unreachable("Unknown condition!"); 2158 case ISD::SETOEQ: 2159 case ISD::SETEQ: return PPC::PRED_EQ; 2160 case ISD::SETUNE: 2161 case ISD::SETNE: return PPC::PRED_NE; 2162 case ISD::SETOLT: 2163 case ISD::SETLT: return PPC::PRED_LT; 2164 case ISD::SETULE: 2165 case ISD::SETLE: return PPC::PRED_LE; 2166 case ISD::SETOGT: 2167 case ISD::SETGT: return PPC::PRED_GT; 2168 case ISD::SETUGE: 2169 case ISD::SETGE: return PPC::PRED_GE; 2170 case ISD::SETO: return PPC::PRED_NU; 2171 case ISD::SETUO: return PPC::PRED_UN; 2172 // These two are invalid for floating point. Assume we have int. 2173 case ISD::SETULT: return PPC::PRED_LT; 2174 case ISD::SETUGT: return PPC::PRED_GT; 2175 } 2176 } 2177 2178 /// getCRIdxForSetCC - Return the index of the condition register field 2179 /// associated with the SetCC condition, and whether or not the field is 2180 /// treated as inverted. That is, lt = 0; ge = 0 inverted. 2181 static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) { 2182 Invert = false; 2183 switch (CC) { 2184 default: llvm_unreachable("Unknown condition!"); 2185 case ISD::SETOLT: 2186 case ISD::SETLT: return 0; // Bit #0 = SETOLT 2187 case ISD::SETOGT: 2188 case ISD::SETGT: return 1; // Bit #1 = SETOGT 2189 case ISD::SETOEQ: 2190 case ISD::SETEQ: return 2; // Bit #2 = SETOEQ 2191 case ISD::SETUO: return 3; // Bit #3 = SETUO 2192 case ISD::SETUGE: 2193 case ISD::SETGE: Invert = true; return 0; // !Bit #0 = SETUGE 2194 case ISD::SETULE: 2195 case ISD::SETLE: Invert = true; return 1; // !Bit #1 = SETULE 2196 case ISD::SETUNE: 2197 case ISD::SETNE: Invert = true; return 2; // !Bit #2 = SETUNE 2198 case ISD::SETO: Invert = true; return 3; // !Bit #3 = SETO 2199 case ISD::SETUEQ: 2200 case ISD::SETOGE: 2201 case ISD::SETOLE: 2202 case ISD::SETONE: 2203 llvm_unreachable("Invalid branch code: should be expanded by legalize"); 2204 // These are invalid for floating point. Assume integer. 2205 case ISD::SETULT: return 0; 2206 case ISD::SETUGT: return 1; 2207 } 2208 } 2209 2210 // getVCmpInst: return the vector compare instruction for the specified 2211 // vector type and condition code. Since this is for altivec specific code, 2212 // only support the altivec types (v16i8, v8i16, v4i32, v2i64, and v4f32). 2213 static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC, 2214 bool HasVSX, bool &Swap, bool &Negate) { 2215 Swap = false; 2216 Negate = false; 2217 2218 if (VecVT.isFloatingPoint()) { 2219 /* Handle some cases by swapping input operands. */ 2220 switch (CC) { 2221 case ISD::SETLE: CC = ISD::SETGE; Swap = true; break; 2222 case ISD::SETLT: CC = ISD::SETGT; Swap = true; break; 2223 case ISD::SETOLE: CC = ISD::SETOGE; Swap = true; break; 2224 case ISD::SETOLT: CC = ISD::SETOGT; Swap = true; break; 2225 case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break; 2226 case ISD::SETUGT: CC = ISD::SETULT; Swap = true; break; 2227 default: break; 2228 } 2229 /* Handle some cases by negating the result. */ 2230 switch (CC) { 2231 case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break; 2232 case ISD::SETUNE: CC = ISD::SETOEQ; Negate = true; break; 2233 case ISD::SETULE: CC = ISD::SETOGT; Negate = true; break; 2234 case ISD::SETULT: CC = ISD::SETOGE; Negate = true; break; 2235 default: break; 2236 } 2237 /* We have instructions implementing the remaining cases. */ 2238 switch (CC) { 2239 case ISD::SETEQ: 2240 case ISD::SETOEQ: 2241 if (VecVT == MVT::v4f32) 2242 return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP; 2243 else if (VecVT == MVT::v2f64) 2244 return PPC::XVCMPEQDP; 2245 break; 2246 case ISD::SETGT: 2247 case ISD::SETOGT: 2248 if (VecVT == MVT::v4f32) 2249 return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP; 2250 else if (VecVT == MVT::v2f64) 2251 return PPC::XVCMPGTDP; 2252 break; 2253 case ISD::SETGE: 2254 case ISD::SETOGE: 2255 if (VecVT == MVT::v4f32) 2256 return HasVSX ? PPC::XVCMPGESP : PPC::VCMPGEFP; 2257 else if (VecVT == MVT::v2f64) 2258 return PPC::XVCMPGEDP; 2259 break; 2260 default: 2261 break; 2262 } 2263 llvm_unreachable("Invalid floating-point vector compare condition"); 2264 } else { 2265 /* Handle some cases by swapping input operands. */ 2266 switch (CC) { 2267 case ISD::SETGE: CC = ISD::SETLE; Swap = true; break; 2268 case ISD::SETLT: CC = ISD::SETGT; Swap = true; break; 2269 case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break; 2270 case ISD::SETULT: CC = ISD::SETUGT; Swap = true; break; 2271 default: break; 2272 } 2273 /* Handle some cases by negating the result. */ 2274 switch (CC) { 2275 case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break; 2276 case ISD::SETUNE: CC = ISD::SETUEQ; Negate = true; break; 2277 case ISD::SETLE: CC = ISD::SETGT; Negate = true; break; 2278 case ISD::SETULE: CC = ISD::SETUGT; Negate = true; break; 2279 default: break; 2280 } 2281 /* We have instructions implementing the remaining cases. */ 2282 switch (CC) { 2283 case ISD::SETEQ: 2284 case ISD::SETUEQ: 2285 if (VecVT == MVT::v16i8) 2286 return PPC::VCMPEQUB; 2287 else if (VecVT == MVT::v8i16) 2288 return PPC::VCMPEQUH; 2289 else if (VecVT == MVT::v4i32) 2290 return PPC::VCMPEQUW; 2291 else if (VecVT == MVT::v2i64) 2292 return PPC::VCMPEQUD; 2293 break; 2294 case ISD::SETGT: 2295 if (VecVT == MVT::v16i8) 2296 return PPC::VCMPGTSB; 2297 else if (VecVT == MVT::v8i16) 2298 return PPC::VCMPGTSH; 2299 else if (VecVT == MVT::v4i32) 2300 return PPC::VCMPGTSW; 2301 else if (VecVT == MVT::v2i64) 2302 return PPC::VCMPGTSD; 2303 break; 2304 case ISD::SETUGT: 2305 if (VecVT == MVT::v16i8) 2306 return PPC::VCMPGTUB; 2307 else if (VecVT == MVT::v8i16) 2308 return PPC::VCMPGTUH; 2309 else if (VecVT == MVT::v4i32) 2310 return PPC::VCMPGTUW; 2311 else if (VecVT == MVT::v2i64) 2312 return PPC::VCMPGTUD; 2313 break; 2314 default: 2315 break; 2316 } 2317 llvm_unreachable("Invalid integer vector compare condition"); 2318 } 2319 } 2320 2321 bool PPCDAGToDAGISel::trySETCC(SDNode *N) { 2322 SDLoc dl(N); 2323 unsigned Imm; 2324 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 2325 EVT PtrVT = 2326 CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout()); 2327 bool isPPC64 = (PtrVT == MVT::i64); 2328 2329 if (!PPCSubTarget->useCRBits() && 2330 isInt32Immediate(N->getOperand(1), Imm)) { 2331 // We can codegen setcc op, imm very efficiently compared to a brcond. 2332 // Check for those cases here. 2333 // setcc op, 0 2334 if (Imm == 0) { 2335 SDValue Op = N->getOperand(0); 2336 switch (CC) { 2337 default: break; 2338 case ISD::SETEQ: { 2339 Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0); 2340 SDValue Ops[] = { Op, getI32Imm(27, dl), getI32Imm(5, dl), 2341 getI32Imm(31, dl) }; 2342 CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); 2343 return true; 2344 } 2345 case ISD::SETNE: { 2346 if (isPPC64) break; 2347 SDValue AD = 2348 SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, 2349 Op, getI32Imm(~0U, dl)), 0); 2350 CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op, AD.getValue(1)); 2351 return true; 2352 } 2353 case ISD::SETLT: { 2354 SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl), 2355 getI32Imm(31, dl) }; 2356 CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); 2357 return true; 2358 } 2359 case ISD::SETGT: { 2360 SDValue T = 2361 SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0); 2362 T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0); 2363 SDValue Ops[] = { T, getI32Imm(1, dl), getI32Imm(31, dl), 2364 getI32Imm(31, dl) }; 2365 CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); 2366 return true; 2367 } 2368 } 2369 } else if (Imm == ~0U) { // setcc op, -1 2370 SDValue Op = N->getOperand(0); 2371 switch (CC) { 2372 default: break; 2373 case ISD::SETEQ: 2374 if (isPPC64) break; 2375 Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, 2376 Op, getI32Imm(1, dl)), 0); 2377 CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, 2378 SDValue(CurDAG->getMachineNode(PPC::LI, dl, 2379 MVT::i32, 2380 getI32Imm(0, dl)), 2381 0), Op.getValue(1)); 2382 return true; 2383 case ISD::SETNE: { 2384 if (isPPC64) break; 2385 Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0); 2386 SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, 2387 Op, getI32Imm(~0U, dl)); 2388 CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0), Op, 2389 SDValue(AD, 1)); 2390 return true; 2391 } 2392 case ISD::SETLT: { 2393 SDValue AD = SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op, 2394 getI32Imm(1, dl)), 0); 2395 SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD, 2396 Op), 0); 2397 SDValue Ops[] = { AN, getI32Imm(1, dl), getI32Imm(31, dl), 2398 getI32Imm(31, dl) }; 2399 CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); 2400 return true; 2401 } 2402 case ISD::SETGT: { 2403 SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl), 2404 getI32Imm(31, dl) }; 2405 Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); 2406 CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op, getI32Imm(1, dl)); 2407 return true; 2408 } 2409 } 2410 } 2411 } 2412 2413 SDValue LHS = N->getOperand(0); 2414 SDValue RHS = N->getOperand(1); 2415 2416 // Altivec Vector compare instructions do not set any CR register by default and 2417 // vector compare operations return the same type as the operands. 2418 if (LHS.getValueType().isVector()) { 2419 if (PPCSubTarget->hasQPX()) 2420 return false; 2421 2422 EVT VecVT = LHS.getValueType(); 2423 bool Swap, Negate; 2424 unsigned int VCmpInst = getVCmpInst(VecVT.getSimpleVT(), CC, 2425 PPCSubTarget->hasVSX(), Swap, Negate); 2426 if (Swap) 2427 std::swap(LHS, RHS); 2428 2429 EVT ResVT = VecVT.changeVectorElementTypeToInteger(); 2430 if (Negate) { 2431 SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, ResVT, LHS, RHS), 0); 2432 CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : PPC::VNOR, 2433 ResVT, VCmp, VCmp); 2434 return true; 2435 } 2436 2437 CurDAG->SelectNodeTo(N, VCmpInst, ResVT, LHS, RHS); 2438 return true; 2439 } 2440 2441 if (PPCSubTarget->useCRBits()) 2442 return false; 2443 2444 bool Inv; 2445 unsigned Idx = getCRIdxForSetCC(CC, Inv); 2446 SDValue CCReg = SelectCC(LHS, RHS, CC, dl); 2447 SDValue IntCR; 2448 2449 // Force the ccreg into CR7. 2450 SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32); 2451 2452 SDValue InFlag(nullptr, 0); // Null incoming flag value. 2453 CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg, 2454 InFlag).getValue(1); 2455 2456 IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg, 2457 CCReg), 0); 2458 2459 SDValue Ops[] = { IntCR, getI32Imm((32 - (3 - Idx)) & 31, dl), 2460 getI32Imm(31, dl), getI32Imm(31, dl) }; 2461 if (!Inv) { 2462 CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); 2463 return true; 2464 } 2465 2466 // Get the specified bit. 2467 SDValue Tmp = 2468 SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); 2469 CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1, dl)); 2470 return true; 2471 } 2472 2473 void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { 2474 // Transfer memoperands. 2475 MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); 2476 MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); 2477 cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1); 2478 } 2479 2480 // Select - Convert the specified operand from a target-independent to a 2481 // target-specific node if it hasn't already been changed. 2482 void PPCDAGToDAGISel::Select(SDNode *N) { 2483 SDLoc dl(N); 2484 if (N->isMachineOpcode()) { 2485 N->setNodeId(-1); 2486 return; // Already selected. 2487 } 2488 2489 // In case any misguided DAG-level optimizations form an ADD with a 2490 // TargetConstant operand, crash here instead of miscompiling (by selecting 2491 // an r+r add instead of some kind of r+i add). 2492 if (N->getOpcode() == ISD::ADD && 2493 N->getOperand(1).getOpcode() == ISD::TargetConstant) 2494 llvm_unreachable("Invalid ADD with TargetConstant operand"); 2495 2496 // Try matching complex bit permutations before doing anything else. 2497 if (tryBitPermutation(N)) 2498 return; 2499 2500 switch (N->getOpcode()) { 2501 default: break; 2502 2503 case ISD::Constant: 2504 if (N->getValueType(0) == MVT::i64) { 2505 ReplaceNode(N, getInt64(CurDAG, N)); 2506 return; 2507 } 2508 break; 2509 2510 case ISD::SETCC: 2511 if (trySETCC(N)) 2512 return; 2513 break; 2514 2515 case PPCISD::GlobalBaseReg: 2516 ReplaceNode(N, getGlobalBaseReg()); 2517 return; 2518 2519 case ISD::FrameIndex: 2520 selectFrameIndex(N, N); 2521 return; 2522 2523 case PPCISD::MFOCRF: { 2524 SDValue InFlag = N->getOperand(1); 2525 ReplaceNode(N, CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, 2526 N->getOperand(0), InFlag)); 2527 return; 2528 } 2529 2530 case PPCISD::READ_TIME_BASE: 2531 ReplaceNode(N, CurDAG->getMachineNode(PPC::ReadTB, dl, MVT::i32, MVT::i32, 2532 MVT::Other, N->getOperand(0))); 2533 return; 2534 2535 case PPCISD::SRA_ADDZE: { 2536 SDValue N0 = N->getOperand(0); 2537 SDValue ShiftAmt = 2538 CurDAG->getTargetConstant(*cast<ConstantSDNode>(N->getOperand(1))-> 2539 getConstantIntValue(), dl, 2540 N->getValueType(0)); 2541 if (N->getValueType(0) == MVT::i64) { 2542 SDNode *Op = 2543 CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, MVT::Glue, 2544 N0, ShiftAmt); 2545 CurDAG->SelectNodeTo(N, PPC::ADDZE8, MVT::i64, SDValue(Op, 0), 2546 SDValue(Op, 1)); 2547 return; 2548 } else { 2549 assert(N->getValueType(0) == MVT::i32 && 2550 "Expecting i64 or i32 in PPCISD::SRA_ADDZE"); 2551 SDNode *Op = 2552 CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue, 2553 N0, ShiftAmt); 2554 CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, SDValue(Op, 0), 2555 SDValue(Op, 1)); 2556 return; 2557 } 2558 } 2559 2560 case ISD::LOAD: { 2561 // Handle preincrement loads. 2562 LoadSDNode *LD = cast<LoadSDNode>(N); 2563 EVT LoadedVT = LD->getMemoryVT(); 2564 2565 // Normal loads are handled by code generated from the .td file. 2566 if (LD->getAddressingMode() != ISD::PRE_INC) 2567 break; 2568 2569 SDValue Offset = LD->getOffset(); 2570 if (Offset.getOpcode() == ISD::TargetConstant || 2571 Offset.getOpcode() == ISD::TargetGlobalAddress) { 2572 2573 unsigned Opcode; 2574 bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD; 2575 if (LD->getValueType(0) != MVT::i64) { 2576 // Handle PPC32 integer and normal FP loads. 2577 assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); 2578 switch (LoadedVT.getSimpleVT().SimpleTy) { 2579 default: llvm_unreachable("Invalid PPC load type!"); 2580 case MVT::f64: Opcode = PPC::LFDU; break; 2581 case MVT::f32: Opcode = PPC::LFSU; break; 2582 case MVT::i32: Opcode = PPC::LWZU; break; 2583 case MVT::i16: Opcode = isSExt ? PPC::LHAU : PPC::LHZU; break; 2584 case MVT::i1: 2585 case MVT::i8: Opcode = PPC::LBZU; break; 2586 } 2587 } else { 2588 assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!"); 2589 assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); 2590 switch (LoadedVT.getSimpleVT().SimpleTy) { 2591 default: llvm_unreachable("Invalid PPC load type!"); 2592 case MVT::i64: Opcode = PPC::LDU; break; 2593 case MVT::i32: Opcode = PPC::LWZU8; break; 2594 case MVT::i16: Opcode = isSExt ? PPC::LHAU8 : PPC::LHZU8; break; 2595 case MVT::i1: 2596 case MVT::i8: Opcode = PPC::LBZU8; break; 2597 } 2598 } 2599 2600 SDValue Chain = LD->getChain(); 2601 SDValue Base = LD->getBasePtr(); 2602 SDValue Ops[] = { Offset, Base, Chain }; 2603 SDNode *MN = CurDAG->getMachineNode( 2604 Opcode, dl, LD->getValueType(0), 2605 PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, Ops); 2606 transferMemOperands(N, MN); 2607 ReplaceNode(N, MN); 2608 return; 2609 } else { 2610 unsigned Opcode; 2611 bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD; 2612 if (LD->getValueType(0) != MVT::i64) { 2613 // Handle PPC32 integer and normal FP loads. 2614 assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); 2615 switch (LoadedVT.getSimpleVT().SimpleTy) { 2616 default: llvm_unreachable("Invalid PPC load type!"); 2617 case MVT::v4f64: Opcode = PPC::QVLFDUX; break; // QPX 2618 case MVT::v4f32: Opcode = PPC::QVLFSUX; break; // QPX 2619 case MVT::f64: Opcode = PPC::LFDUX; break; 2620 case MVT::f32: Opcode = PPC::LFSUX; break; 2621 case MVT::i32: Opcode = PPC::LWZUX; break; 2622 case MVT::i16: Opcode = isSExt ? PPC::LHAUX : PPC::LHZUX; break; 2623 case MVT::i1: 2624 case MVT::i8: Opcode = PPC::LBZUX; break; 2625 } 2626 } else { 2627 assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!"); 2628 assert((!isSExt || LoadedVT == MVT::i16 || LoadedVT == MVT::i32) && 2629 "Invalid sext update load"); 2630 switch (LoadedVT.getSimpleVT().SimpleTy) { 2631 default: llvm_unreachable("Invalid PPC load type!"); 2632 case MVT::i64: Opcode = PPC::LDUX; break; 2633 case MVT::i32: Opcode = isSExt ? PPC::LWAUX : PPC::LWZUX8; break; 2634 case MVT::i16: Opcode = isSExt ? PPC::LHAUX8 : PPC::LHZUX8; break; 2635 case MVT::i1: 2636 case MVT::i8: Opcode = PPC::LBZUX8; break; 2637 } 2638 } 2639 2640 SDValue Chain = LD->getChain(); 2641 SDValue Base = LD->getBasePtr(); 2642 SDValue Ops[] = { Base, Offset, Chain }; 2643 SDNode *MN = CurDAG->getMachineNode( 2644 Opcode, dl, LD->getValueType(0), 2645 PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, Ops); 2646 transferMemOperands(N, MN); 2647 ReplaceNode(N, MN); 2648 return; 2649 } 2650 } 2651 2652 case ISD::AND: { 2653 unsigned Imm, Imm2, SH, MB, ME; 2654 uint64_t Imm64; 2655 2656 // If this is an and of a value rotated between 0 and 31 bits and then and'd 2657 // with a mask, emit rlwinm 2658 if (isInt32Immediate(N->getOperand(1), Imm) && 2659 isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) { 2660 SDValue Val = N->getOperand(0).getOperand(0); 2661 SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl), 2662 getI32Imm(ME, dl) }; 2663 CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); 2664 return; 2665 } 2666 // If this is just a masked value where the input is not handled above, and 2667 // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm 2668 if (isInt32Immediate(N->getOperand(1), Imm) && 2669 isRunOfOnes(Imm, MB, ME) && 2670 N->getOperand(0).getOpcode() != ISD::ROTL) { 2671 SDValue Val = N->getOperand(0); 2672 SDValue Ops[] = { Val, getI32Imm(0, dl), getI32Imm(MB, dl), 2673 getI32Imm(ME, dl) }; 2674 CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); 2675 return; 2676 } 2677 // If this is a 64-bit zero-extension mask, emit rldicl. 2678 if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) && 2679 isMask_64(Imm64)) { 2680 SDValue Val = N->getOperand(0); 2681 MB = 64 - countTrailingOnes(Imm64); 2682 SH = 0; 2683 2684 if (Val.getOpcode() == ISD::ANY_EXTEND) { 2685 auto Op0 = Val.getOperand(0); 2686 if ( Op0.getOpcode() == ISD::SRL && 2687 isInt32Immediate(Op0.getOperand(1).getNode(), Imm) && Imm <= MB) { 2688 2689 auto ResultType = Val.getNode()->getValueType(0); 2690 auto ImDef = CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, 2691 ResultType); 2692 SDValue IDVal (ImDef, 0); 2693 2694 Val = SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, 2695 ResultType, IDVal, Op0.getOperand(0), 2696 getI32Imm(1, dl)), 0); 2697 SH = 64 - Imm; 2698 } 2699 } 2700 2701 // If the operand is a logical right shift, we can fold it into this 2702 // instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb) 2703 // for n <= mb. The right shift is really a left rotate followed by a 2704 // mask, and this mask is a more-restrictive sub-mask of the mask implied 2705 // by the shift. 2706 if (Val.getOpcode() == ISD::SRL && 2707 isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) { 2708 assert(Imm < 64 && "Illegal shift amount"); 2709 Val = Val.getOperand(0); 2710 SH = 64 - Imm; 2711 } 2712 2713 SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) }; 2714 CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops); 2715 return; 2716 } 2717 // If this is a negated 64-bit zero-extension mask, 2718 // i.e. the immediate is a sequence of ones from most significant side 2719 // and all zero for reminder, we should use rldicr. 2720 if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) && 2721 isMask_64(~Imm64)) { 2722 SDValue Val = N->getOperand(0); 2723 MB = 63 - countTrailingOnes(~Imm64); 2724 SH = 0; 2725 SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) }; 2726 CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops); 2727 return; 2728 } 2729 2730 // AND X, 0 -> 0, not "rlwinm 32". 2731 if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) { 2732 ReplaceUses(SDValue(N, 0), N->getOperand(1)); 2733 return; 2734 } 2735 // ISD::OR doesn't get all the bitfield insertion fun. 2736 // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) might be a 2737 // bitfield insert. 2738 if (isInt32Immediate(N->getOperand(1), Imm) && 2739 N->getOperand(0).getOpcode() == ISD::OR && 2740 isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) { 2741 // The idea here is to check whether this is equivalent to: 2742 // (c1 & m) | (x & ~m) 2743 // where m is a run-of-ones mask. The logic here is that, for each bit in 2744 // c1 and c2: 2745 // - if both are 1, then the output will be 1. 2746 // - if both are 0, then the output will be 0. 2747 // - if the bit in c1 is 0, and the bit in c2 is 1, then the output will 2748 // come from x. 2749 // - if the bit in c1 is 1, and the bit in c2 is 0, then the output will 2750 // be 0. 2751 // If that last condition is never the case, then we can form m from the 2752 // bits that are the same between c1 and c2. 2753 unsigned MB, ME; 2754 if (isRunOfOnes(~(Imm^Imm2), MB, ME) && !(~Imm & Imm2)) { 2755 SDValue Ops[] = { N->getOperand(0).getOperand(0), 2756 N->getOperand(0).getOperand(1), 2757 getI32Imm(0, dl), getI32Imm(MB, dl), 2758 getI32Imm(ME, dl) }; 2759 ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops)); 2760 return; 2761 } 2762 } 2763 2764 // Other cases are autogenerated. 2765 break; 2766 } 2767 case ISD::OR: { 2768 if (N->getValueType(0) == MVT::i32) 2769 if (tryBitfieldInsert(N)) 2770 return; 2771 2772 short Imm; 2773 if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && 2774 isIntS16Immediate(N->getOperand(1), Imm)) { 2775 APInt LHSKnownZero, LHSKnownOne; 2776 CurDAG->computeKnownBits(N->getOperand(0), LHSKnownZero, LHSKnownOne); 2777 2778 // If this is equivalent to an add, then we can fold it with the 2779 // FrameIndex calculation. 2780 if ((LHSKnownZero.getZExtValue()|~(uint64_t)Imm) == ~0ULL) { 2781 selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm); 2782 return; 2783 } 2784 } 2785 2786 // Other cases are autogenerated. 2787 break; 2788 } 2789 case ISD::ADD: { 2790 short Imm; 2791 if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && 2792 isIntS16Immediate(N->getOperand(1), Imm)) { 2793 selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm); 2794 return; 2795 } 2796 2797 break; 2798 } 2799 case ISD::SHL: { 2800 unsigned Imm, SH, MB, ME; 2801 if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) && 2802 isRotateAndMask(N, Imm, true, SH, MB, ME)) { 2803 SDValue Ops[] = { N->getOperand(0).getOperand(0), 2804 getI32Imm(SH, dl), getI32Imm(MB, dl), 2805 getI32Imm(ME, dl) }; 2806 CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); 2807 return; 2808 } 2809 2810 // Other cases are autogenerated. 2811 break; 2812 } 2813 case ISD::SRL: { 2814 unsigned Imm, SH, MB, ME; 2815 if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) && 2816 isRotateAndMask(N, Imm, true, SH, MB, ME)) { 2817 SDValue Ops[] = { N->getOperand(0).getOperand(0), 2818 getI32Imm(SH, dl), getI32Imm(MB, dl), 2819 getI32Imm(ME, dl) }; 2820 CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); 2821 return; 2822 } 2823 2824 // Other cases are autogenerated. 2825 break; 2826 } 2827 // FIXME: Remove this once the ANDI glue bug is fixed: 2828 case PPCISD::ANDIo_1_EQ_BIT: 2829 case PPCISD::ANDIo_1_GT_BIT: { 2830 if (!ANDIGlueBug) 2831 break; 2832 2833 EVT InVT = N->getOperand(0).getValueType(); 2834 assert((InVT == MVT::i64 || InVT == MVT::i32) && 2835 "Invalid input type for ANDIo_1_EQ_BIT"); 2836 2837 unsigned Opcode = (InVT == MVT::i64) ? PPC::ANDIo8 : PPC::ANDIo; 2838 SDValue AndI(CurDAG->getMachineNode(Opcode, dl, InVT, MVT::Glue, 2839 N->getOperand(0), 2840 CurDAG->getTargetConstant(1, dl, InVT)), 2841 0); 2842 SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32); 2843 SDValue SRIdxVal = 2844 CurDAG->getTargetConstant(N->getOpcode() == PPCISD::ANDIo_1_EQ_BIT ? 2845 PPC::sub_eq : PPC::sub_gt, dl, MVT::i32); 2846 2847 CurDAG->SelectNodeTo(N, TargetOpcode::EXTRACT_SUBREG, MVT::i1, CR0Reg, 2848 SRIdxVal, SDValue(AndI.getNode(), 1) /* glue */); 2849 return; 2850 } 2851 case ISD::SELECT_CC: { 2852 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); 2853 EVT PtrVT = 2854 CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout()); 2855 bool isPPC64 = (PtrVT == MVT::i64); 2856 2857 // If this is a select of i1 operands, we'll pattern match it. 2858 if (PPCSubTarget->useCRBits() && 2859 N->getOperand(0).getValueType() == MVT::i1) 2860 break; 2861 2862 // Handle the setcc cases here. select_cc lhs, 0, 1, 0, cc 2863 if (!isPPC64) 2864 if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1))) 2865 if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N->getOperand(2))) 2866 if (ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N->getOperand(3))) 2867 if (N1C->isNullValue() && N3C->isNullValue() && 2868 N2C->getZExtValue() == 1ULL && CC == ISD::SETNE && 2869 // FIXME: Implement this optzn for PPC64. 2870 N->getValueType(0) == MVT::i32) { 2871 SDNode *Tmp = 2872 CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, 2873 N->getOperand(0), getI32Imm(~0U, dl)); 2874 CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(Tmp, 0), 2875 N->getOperand(0), SDValue(Tmp, 1)); 2876 return; 2877 } 2878 2879 SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl); 2880 2881 if (N->getValueType(0) == MVT::i1) { 2882 // An i1 select is: (c & t) | (!c & f). 2883 bool Inv; 2884 unsigned Idx = getCRIdxForSetCC(CC, Inv); 2885 2886 unsigned SRI; 2887 switch (Idx) { 2888 default: llvm_unreachable("Invalid CC index"); 2889 case 0: SRI = PPC::sub_lt; break; 2890 case 1: SRI = PPC::sub_gt; break; 2891 case 2: SRI = PPC::sub_eq; break; 2892 case 3: SRI = PPC::sub_un; break; 2893 } 2894 2895 SDValue CCBit = CurDAG->getTargetExtractSubreg(SRI, dl, MVT::i1, CCReg); 2896 2897 SDValue NotCCBit(CurDAG->getMachineNode(PPC::CRNOR, dl, MVT::i1, 2898 CCBit, CCBit), 0); 2899 SDValue C = Inv ? NotCCBit : CCBit, 2900 NotC = Inv ? CCBit : NotCCBit; 2901 2902 SDValue CAndT(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1, 2903 C, N->getOperand(2)), 0); 2904 SDValue NotCAndF(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1, 2905 NotC, N->getOperand(3)), 0); 2906 2907 CurDAG->SelectNodeTo(N, PPC::CROR, MVT::i1, CAndT, NotCAndF); 2908 return; 2909 } 2910 2911 unsigned BROpc = getPredicateForSetCC(CC); 2912 2913 unsigned SelectCCOp; 2914 if (N->getValueType(0) == MVT::i32) 2915 SelectCCOp = PPC::SELECT_CC_I4; 2916 else if (N->getValueType(0) == MVT::i64) 2917 SelectCCOp = PPC::SELECT_CC_I8; 2918 else if (N->getValueType(0) == MVT::f32) 2919 if (PPCSubTarget->hasP8Vector()) 2920 SelectCCOp = PPC::SELECT_CC_VSSRC; 2921 else 2922 SelectCCOp = PPC::SELECT_CC_F4; 2923 else if (N->getValueType(0) == MVT::f64) 2924 if (PPCSubTarget->hasVSX()) 2925 SelectCCOp = PPC::SELECT_CC_VSFRC; 2926 else 2927 SelectCCOp = PPC::SELECT_CC_F8; 2928 else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f64) 2929 SelectCCOp = PPC::SELECT_CC_QFRC; 2930 else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f32) 2931 SelectCCOp = PPC::SELECT_CC_QSRC; 2932 else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4i1) 2933 SelectCCOp = PPC::SELECT_CC_QBRC; 2934 else if (N->getValueType(0) == MVT::v2f64 || 2935 N->getValueType(0) == MVT::v2i64) 2936 SelectCCOp = PPC::SELECT_CC_VSRC; 2937 else 2938 SelectCCOp = PPC::SELECT_CC_VRRC; 2939 2940 SDValue Ops[] = { CCReg, N->getOperand(2), N->getOperand(3), 2941 getI32Imm(BROpc, dl) }; 2942 CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops); 2943 return; 2944 } 2945 case ISD::VSELECT: 2946 if (PPCSubTarget->hasVSX()) { 2947 SDValue Ops[] = { N->getOperand(2), N->getOperand(1), N->getOperand(0) }; 2948 CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops); 2949 return; 2950 } 2951 break; 2952 2953 case ISD::VECTOR_SHUFFLE: 2954 if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 || 2955 N->getValueType(0) == MVT::v2i64)) { 2956 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 2957 2958 SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 0 : 1), 2959 Op2 = N->getOperand(SVN->getMaskElt(1) < 2 ? 0 : 1); 2960 unsigned DM[2]; 2961 2962 for (int i = 0; i < 2; ++i) 2963 if (SVN->getMaskElt(i) <= 0 || SVN->getMaskElt(i) == 2) 2964 DM[i] = 0; 2965 else 2966 DM[i] = 1; 2967 2968 if (Op1 == Op2 && DM[0] == 0 && DM[1] == 0 && 2969 Op1.getOpcode() == ISD::SCALAR_TO_VECTOR && 2970 isa<LoadSDNode>(Op1.getOperand(0))) { 2971 LoadSDNode *LD = cast<LoadSDNode>(Op1.getOperand(0)); 2972 SDValue Base, Offset; 2973 2974 if (LD->isUnindexed() && LD->hasOneUse() && Op1.hasOneUse() && 2975 (LD->getMemoryVT() == MVT::f64 || 2976 LD->getMemoryVT() == MVT::i64) && 2977 SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) { 2978 SDValue Chain = LD->getChain(); 2979 SDValue Ops[] = { Base, Offset, Chain }; 2980 MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); 2981 MemOp[0] = LD->getMemOperand(); 2982 SDNode *NewN = CurDAG->SelectNodeTo(N, PPC::LXVDSX, 2983 N->getValueType(0), Ops); 2984 cast<MachineSDNode>(NewN)->setMemRefs(MemOp, MemOp + 1); 2985 return; 2986 } 2987 } 2988 2989 // For little endian, we must swap the input operands and adjust 2990 // the mask elements (reverse and invert them). 2991 if (PPCSubTarget->isLittleEndian()) { 2992 std::swap(Op1, Op2); 2993 unsigned tmp = DM[0]; 2994 DM[0] = 1 - DM[1]; 2995 DM[1] = 1 - tmp; 2996 } 2997 2998 SDValue DMV = CurDAG->getTargetConstant(DM[1] | (DM[0] << 1), dl, 2999 MVT::i32); 3000 SDValue Ops[] = { Op1, Op2, DMV }; 3001 CurDAG->SelectNodeTo(N, PPC::XXPERMDI, N->getValueType(0), Ops); 3002 return; 3003 } 3004 3005 break; 3006 case PPCISD::BDNZ: 3007 case PPCISD::BDZ: { 3008 bool IsPPC64 = PPCSubTarget->isPPC64(); 3009 SDValue Ops[] = { N->getOperand(1), N->getOperand(0) }; 3010 CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ 3011 ? (IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ) 3012 : (IsPPC64 ? PPC::BDZ8 : PPC::BDZ), 3013 MVT::Other, Ops); 3014 return; 3015 } 3016 case PPCISD::COND_BRANCH: { 3017 // Op #0 is the Chain. 3018 // Op #1 is the PPC::PRED_* number. 3019 // Op #2 is the CR# 3020 // Op #3 is the Dest MBB 3021 // Op #4 is the Flag. 3022 // Prevent PPC::PRED_* from being selected into LI. 3023 unsigned PCC = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 3024 if (EnableBranchHint) 3025 PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(3)); 3026 3027 SDValue Pred = getI32Imm(PCC, dl); 3028 SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3), 3029 N->getOperand(0), N->getOperand(4) }; 3030 CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops); 3031 return; 3032 } 3033 case ISD::BR_CC: { 3034 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); 3035 unsigned PCC = getPredicateForSetCC(CC); 3036 3037 if (N->getOperand(2).getValueType() == MVT::i1) { 3038 unsigned Opc; 3039 bool Swap; 3040 switch (PCC) { 3041 default: llvm_unreachable("Unexpected Boolean-operand predicate"); 3042 case PPC::PRED_LT: Opc = PPC::CRANDC; Swap = true; break; 3043 case PPC::PRED_LE: Opc = PPC::CRORC; Swap = true; break; 3044 case PPC::PRED_EQ: Opc = PPC::CREQV; Swap = false; break; 3045 case PPC::PRED_GE: Opc = PPC::CRORC; Swap = false; break; 3046 case PPC::PRED_GT: Opc = PPC::CRANDC; Swap = false; break; 3047 case PPC::PRED_NE: Opc = PPC::CRXOR; Swap = false; break; 3048 } 3049 3050 SDValue BitComp(CurDAG->getMachineNode(Opc, dl, MVT::i1, 3051 N->getOperand(Swap ? 3 : 2), 3052 N->getOperand(Swap ? 2 : 3)), 0); 3053 CurDAG->SelectNodeTo(N, PPC::BC, MVT::Other, BitComp, N->getOperand(4), 3054 N->getOperand(0)); 3055 return; 3056 } 3057 3058 if (EnableBranchHint) 3059 PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(4)); 3060 3061 SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl); 3062 SDValue Ops[] = { getI32Imm(PCC, dl), CondCode, 3063 N->getOperand(4), N->getOperand(0) }; 3064 CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops); 3065 return; 3066 } 3067 case ISD::BRIND: { 3068 // FIXME: Should custom lower this. 3069 SDValue Chain = N->getOperand(0); 3070 SDValue Target = N->getOperand(1); 3071 unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8; 3072 unsigned Reg = Target.getValueType() == MVT::i32 ? PPC::BCTR : PPC::BCTR8; 3073 Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Target, 3074 Chain), 0); 3075 CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain); 3076 return; 3077 } 3078 case PPCISD::TOC_ENTRY: { 3079 assert ((PPCSubTarget->isPPC64() || PPCSubTarget->isSVR4ABI()) && 3080 "Only supported for 64-bit ABI and 32-bit SVR4"); 3081 if (PPCSubTarget->isSVR4ABI() && !PPCSubTarget->isPPC64()) { 3082 SDValue GA = N->getOperand(0); 3083 SDNode *MN = CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA, 3084 N->getOperand(1)); 3085 transferMemOperands(N, MN); 3086 ReplaceNode(N, MN); 3087 return; 3088 } 3089 3090 // For medium and large code model, we generate two instructions as 3091 // described below. Otherwise we allow SelectCodeCommon to handle this, 3092 // selecting one of LDtoc, LDtocJTI, LDtocCPT, and LDtocBA. 3093 CodeModel::Model CModel = TM.getCodeModel(); 3094 if (CModel != CodeModel::Medium && CModel != CodeModel::Large) 3095 break; 3096 3097 // The first source operand is a TargetGlobalAddress or a TargetJumpTable. 3098 // If it must be toc-referenced according to PPCSubTarget, we generate: 3099 // LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>)) 3100 // Otherwise we generate: 3101 // ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>) 3102 SDValue GA = N->getOperand(0); 3103 SDValue TOCbase = N->getOperand(1); 3104 SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64, 3105 TOCbase, GA); 3106 3107 if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA) || 3108 CModel == CodeModel::Large) { 3109 SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA, 3110 SDValue(Tmp, 0)); 3111 transferMemOperands(N, MN); 3112 ReplaceNode(N, MN); 3113 return; 3114 } 3115 3116 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) { 3117 const GlobalValue *GV = G->getGlobal(); 3118 unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV); 3119 if (GVFlags & PPCII::MO_NLP_FLAG) { 3120 SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA, 3121 SDValue(Tmp, 0)); 3122 transferMemOperands(N, MN); 3123 ReplaceNode(N, MN); 3124 return; 3125 } 3126 } 3127 3128 ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64, 3129 SDValue(Tmp, 0), GA)); 3130 return; 3131 } 3132 case PPCISD::PPC32_PICGOT: 3133 // Generate a PIC-safe GOT reference. 3134 assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() && 3135 "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4"); 3136 CurDAG->SelectNodeTo(N, PPC::PPC32PICGOT, 3137 PPCLowering->getPointerTy(CurDAG->getDataLayout()), 3138 MVT::i32); 3139 return; 3140 3141 case PPCISD::VADD_SPLAT: { 3142 // This expands into one of three sequences, depending on whether 3143 // the first operand is odd or even, positive or negative. 3144 assert(isa<ConstantSDNode>(N->getOperand(0)) && 3145 isa<ConstantSDNode>(N->getOperand(1)) && 3146 "Invalid operand on VADD_SPLAT!"); 3147 3148 int Elt = N->getConstantOperandVal(0); 3149 int EltSize = N->getConstantOperandVal(1); 3150 unsigned Opc1, Opc2, Opc3; 3151 EVT VT; 3152 3153 if (EltSize == 1) { 3154 Opc1 = PPC::VSPLTISB; 3155 Opc2 = PPC::VADDUBM; 3156 Opc3 = PPC::VSUBUBM; 3157 VT = MVT::v16i8; 3158 } else if (EltSize == 2) { 3159 Opc1 = PPC::VSPLTISH; 3160 Opc2 = PPC::VADDUHM; 3161 Opc3 = PPC::VSUBUHM; 3162 VT = MVT::v8i16; 3163 } else { 3164 assert(EltSize == 4 && "Invalid element size on VADD_SPLAT!"); 3165 Opc1 = PPC::VSPLTISW; 3166 Opc2 = PPC::VADDUWM; 3167 Opc3 = PPC::VSUBUWM; 3168 VT = MVT::v4i32; 3169 } 3170 3171 if ((Elt & 1) == 0) { 3172 // Elt is even, in the range [-32,-18] + [16,30]. 3173 // 3174 // Convert: VADD_SPLAT elt, size 3175 // Into: tmp = VSPLTIS[BHW] elt 3176 // VADDU[BHW]M tmp, tmp 3177 // Where: [BHW] = B for size = 1, H for size = 2, W for size = 4 3178 SDValue EltVal = getI32Imm(Elt >> 1, dl); 3179 SDNode *Tmp = CurDAG->getMachineNode(Opc1, dl, VT, EltVal); 3180 SDValue TmpVal = SDValue(Tmp, 0); 3181 ReplaceNode(N, CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal)); 3182 return; 3183 } else if (Elt > 0) { 3184 // Elt is odd and positive, in the range [17,31]. 3185 // 3186 // Convert: VADD_SPLAT elt, size 3187 // Into: tmp1 = VSPLTIS[BHW] elt-16 3188 // tmp2 = VSPLTIS[BHW] -16 3189 // VSUBU[BHW]M tmp1, tmp2 3190 SDValue EltVal = getI32Imm(Elt - 16, dl); 3191 SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal); 3192 EltVal = getI32Imm(-16, dl); 3193 SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal); 3194 ReplaceNode(N, CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0), 3195 SDValue(Tmp2, 0))); 3196 return; 3197 } else { 3198 // Elt is odd and negative, in the range [-31,-17]. 3199 // 3200 // Convert: VADD_SPLAT elt, size 3201 // Into: tmp1 = VSPLTIS[BHW] elt+16 3202 // tmp2 = VSPLTIS[BHW] -16 3203 // VADDU[BHW]M tmp1, tmp2 3204 SDValue EltVal = getI32Imm(Elt + 16, dl); 3205 SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal); 3206 EltVal = getI32Imm(-16, dl); 3207 SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal); 3208 ReplaceNode(N, CurDAG->getMachineNode(Opc2, dl, VT, SDValue(Tmp1, 0), 3209 SDValue(Tmp2, 0))); 3210 return; 3211 } 3212 } 3213 } 3214 3215 SelectCode(N); 3216 } 3217 3218 // If the target supports the cmpb instruction, do the idiom recognition here. 3219 // We don't do this as a DAG combine because we don't want to do it as nodes 3220 // are being combined (because we might miss part of the eventual idiom). We 3221 // don't want to do it during instruction selection because we want to reuse 3222 // the logic for lowering the masking operations already part of the 3223 // instruction selector. 3224 SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) { 3225 SDLoc dl(N); 3226 3227 assert(N->getOpcode() == ISD::OR && 3228 "Only OR nodes are supported for CMPB"); 3229 3230 SDValue Res; 3231 if (!PPCSubTarget->hasCMPB()) 3232 return Res; 3233 3234 if (N->getValueType(0) != MVT::i32 && 3235 N->getValueType(0) != MVT::i64) 3236 return Res; 3237 3238 EVT VT = N->getValueType(0); 3239 3240 SDValue RHS, LHS; 3241 bool BytesFound[8] = {false, false, false, false, false, false, false, false}; 3242 uint64_t Mask = 0, Alt = 0; 3243 3244 auto IsByteSelectCC = [this](SDValue O, unsigned &b, 3245 uint64_t &Mask, uint64_t &Alt, 3246 SDValue &LHS, SDValue &RHS) { 3247 if (O.getOpcode() != ISD::SELECT_CC) 3248 return false; 3249 ISD::CondCode CC = cast<CondCodeSDNode>(O.getOperand(4))->get(); 3250 3251 if (!isa<ConstantSDNode>(O.getOperand(2)) || 3252 !isa<ConstantSDNode>(O.getOperand(3))) 3253 return false; 3254 3255 uint64_t PM = O.getConstantOperandVal(2); 3256 uint64_t PAlt = O.getConstantOperandVal(3); 3257 for (b = 0; b < 8; ++b) { 3258 uint64_t Mask = UINT64_C(0xFF) << (8*b); 3259 if (PM && (PM & Mask) == PM && (PAlt & Mask) == PAlt) 3260 break; 3261 } 3262 3263 if (b == 8) 3264 return false; 3265 Mask |= PM; 3266 Alt |= PAlt; 3267 3268 if (!isa<ConstantSDNode>(O.getOperand(1)) || 3269 O.getConstantOperandVal(1) != 0) { 3270 SDValue Op0 = O.getOperand(0), Op1 = O.getOperand(1); 3271 if (Op0.getOpcode() == ISD::TRUNCATE) 3272 Op0 = Op0.getOperand(0); 3273 if (Op1.getOpcode() == ISD::TRUNCATE) 3274 Op1 = Op1.getOperand(0); 3275 3276 if (Op0.getOpcode() == ISD::SRL && Op1.getOpcode() == ISD::SRL && 3277 Op0.getOperand(1) == Op1.getOperand(1) && CC == ISD::SETEQ && 3278 isa<ConstantSDNode>(Op0.getOperand(1))) { 3279 3280 unsigned Bits = Op0.getValueSizeInBits(); 3281 if (b != Bits/8-1) 3282 return false; 3283 if (Op0.getConstantOperandVal(1) != Bits-8) 3284 return false; 3285 3286 LHS = Op0.getOperand(0); 3287 RHS = Op1.getOperand(0); 3288 return true; 3289 } 3290 3291 // When we have small integers (i16 to be specific), the form present 3292 // post-legalization uses SETULT in the SELECT_CC for the 3293 // higher-order byte, depending on the fact that the 3294 // even-higher-order bytes are known to all be zero, for example: 3295 // select_cc (xor $lhs, $rhs), 256, 65280, 0, setult 3296 // (so when the second byte is the same, because all higher-order 3297 // bits from bytes 3 and 4 are known to be zero, the result of the 3298 // xor can be at most 255) 3299 if (Op0.getOpcode() == ISD::XOR && CC == ISD::SETULT && 3300 isa<ConstantSDNode>(O.getOperand(1))) { 3301 3302 uint64_t ULim = O.getConstantOperandVal(1); 3303 if (ULim != (UINT64_C(1) << b*8)) 3304 return false; 3305 3306 // Now we need to make sure that the upper bytes are known to be 3307 // zero. 3308 unsigned Bits = Op0.getValueSizeInBits(); 3309 if (!CurDAG->MaskedValueIsZero( 3310 Op0, APInt::getHighBitsSet(Bits, Bits - (b + 1) * 8))) 3311 return false; 3312 3313 LHS = Op0.getOperand(0); 3314 RHS = Op0.getOperand(1); 3315 return true; 3316 } 3317 3318 return false; 3319 } 3320 3321 if (CC != ISD::SETEQ) 3322 return false; 3323 3324 SDValue Op = O.getOperand(0); 3325 if (Op.getOpcode() == ISD::AND) { 3326 if (!isa<ConstantSDNode>(Op.getOperand(1))) 3327 return false; 3328 if (Op.getConstantOperandVal(1) != (UINT64_C(0xFF) << (8*b))) 3329 return false; 3330 3331 SDValue XOR = Op.getOperand(0); 3332 if (XOR.getOpcode() == ISD::TRUNCATE) 3333 XOR = XOR.getOperand(0); 3334 if (XOR.getOpcode() != ISD::XOR) 3335 return false; 3336 3337 LHS = XOR.getOperand(0); 3338 RHS = XOR.getOperand(1); 3339 return true; 3340 } else if (Op.getOpcode() == ISD::SRL) { 3341 if (!isa<ConstantSDNode>(Op.getOperand(1))) 3342 return false; 3343 unsigned Bits = Op.getValueSizeInBits(); 3344 if (b != Bits/8-1) 3345 return false; 3346 if (Op.getConstantOperandVal(1) != Bits-8) 3347 return false; 3348 3349 SDValue XOR = Op.getOperand(0); 3350 if (XOR.getOpcode() == ISD::TRUNCATE) 3351 XOR = XOR.getOperand(0); 3352 if (XOR.getOpcode() != ISD::XOR) 3353 return false; 3354 3355 LHS = XOR.getOperand(0); 3356 RHS = XOR.getOperand(1); 3357 return true; 3358 } 3359 3360 return false; 3361 }; 3362 3363 SmallVector<SDValue, 8> Queue(1, SDValue(N, 0)); 3364 while (!Queue.empty()) { 3365 SDValue V = Queue.pop_back_val(); 3366 3367 for (const SDValue &O : V.getNode()->ops()) { 3368 unsigned b; 3369 uint64_t M = 0, A = 0; 3370 SDValue OLHS, ORHS; 3371 if (O.getOpcode() == ISD::OR) { 3372 Queue.push_back(O); 3373 } else if (IsByteSelectCC(O, b, M, A, OLHS, ORHS)) { 3374 if (!LHS) { 3375 LHS = OLHS; 3376 RHS = ORHS; 3377 BytesFound[b] = true; 3378 Mask |= M; 3379 Alt |= A; 3380 } else if ((LHS == ORHS && RHS == OLHS) || 3381 (RHS == ORHS && LHS == OLHS)) { 3382 BytesFound[b] = true; 3383 Mask |= M; 3384 Alt |= A; 3385 } else { 3386 return Res; 3387 } 3388 } else { 3389 return Res; 3390 } 3391 } 3392 } 3393 3394 unsigned LastB = 0, BCnt = 0; 3395 for (unsigned i = 0; i < 8; ++i) 3396 if (BytesFound[LastB]) { 3397 ++BCnt; 3398 LastB = i; 3399 } 3400 3401 if (!LastB || BCnt < 2) 3402 return Res; 3403 3404 // Because we'll be zero-extending the output anyway if don't have a specific 3405 // value for each input byte (via the Mask), we can 'anyext' the inputs. 3406 if (LHS.getValueType() != VT) { 3407 LHS = CurDAG->getAnyExtOrTrunc(LHS, dl, VT); 3408 RHS = CurDAG->getAnyExtOrTrunc(RHS, dl, VT); 3409 } 3410 3411 Res = CurDAG->getNode(PPCISD::CMPB, dl, VT, LHS, RHS); 3412 3413 bool NonTrivialMask = ((int64_t) Mask) != INT64_C(-1); 3414 if (NonTrivialMask && !Alt) { 3415 // Res = Mask & CMPB 3416 Res = CurDAG->getNode(ISD::AND, dl, VT, Res, 3417 CurDAG->getConstant(Mask, dl, VT)); 3418 } else if (Alt) { 3419 // Res = (CMPB & Mask) | (~CMPB & Alt) 3420 // Which, as suggested here: 3421 // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge 3422 // can be written as: 3423 // Res = Alt ^ ((Alt ^ Mask) & CMPB) 3424 // useful because the (Alt ^ Mask) can be pre-computed. 3425 Res = CurDAG->getNode(ISD::AND, dl, VT, Res, 3426 CurDAG->getConstant(Mask ^ Alt, dl, VT)); 3427 Res = CurDAG->getNode(ISD::XOR, dl, VT, Res, 3428 CurDAG->getConstant(Alt, dl, VT)); 3429 } 3430 3431 return Res; 3432 } 3433 3434 // When CR bit registers are enabled, an extension of an i1 variable to a i32 3435 // or i64 value is lowered in terms of a SELECT_I[48] operation, and thus 3436 // involves constant materialization of a 0 or a 1 or both. If the result of 3437 // the extension is then operated upon by some operator that can be constant 3438 // folded with a constant 0 or 1, and that constant can be materialized using 3439 // only one instruction (like a zero or one), then we should fold in those 3440 // operations with the select. 3441 void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) { 3442 if (!PPCSubTarget->useCRBits()) 3443 return; 3444 3445 if (N->getOpcode() != ISD::ZERO_EXTEND && 3446 N->getOpcode() != ISD::SIGN_EXTEND && 3447 N->getOpcode() != ISD::ANY_EXTEND) 3448 return; 3449 3450 if (N->getOperand(0).getValueType() != MVT::i1) 3451 return; 3452 3453 if (!N->hasOneUse()) 3454 return; 3455 3456 SDLoc dl(N); 3457 EVT VT = N->getValueType(0); 3458 SDValue Cond = N->getOperand(0); 3459 SDValue ConstTrue = 3460 CurDAG->getConstant(N->getOpcode() == ISD::SIGN_EXTEND ? -1 : 1, dl, VT); 3461 SDValue ConstFalse = CurDAG->getConstant(0, dl, VT); 3462 3463 do { 3464 SDNode *User = *N->use_begin(); 3465 if (User->getNumOperands() != 2) 3466 break; 3467 3468 auto TryFold = [this, N, User, dl](SDValue Val) { 3469 SDValue UserO0 = User->getOperand(0), UserO1 = User->getOperand(1); 3470 SDValue O0 = UserO0.getNode() == N ? Val : UserO0; 3471 SDValue O1 = UserO1.getNode() == N ? Val : UserO1; 3472 3473 return CurDAG->FoldConstantArithmetic(User->getOpcode(), dl, 3474 User->getValueType(0), 3475 O0.getNode(), O1.getNode()); 3476 }; 3477 3478 SDValue TrueRes = TryFold(ConstTrue); 3479 if (!TrueRes) 3480 break; 3481 SDValue FalseRes = TryFold(ConstFalse); 3482 if (!FalseRes) 3483 break; 3484 3485 // For us to materialize these using one instruction, we must be able to 3486 // represent them as signed 16-bit integers. 3487 uint64_t True = cast<ConstantSDNode>(TrueRes)->getZExtValue(), 3488 False = cast<ConstantSDNode>(FalseRes)->getZExtValue(); 3489 if (!isInt<16>(True) || !isInt<16>(False)) 3490 break; 3491 3492 // We can replace User with a new SELECT node, and try again to see if we 3493 // can fold the select with its user. 3494 Res = CurDAG->getSelect(dl, User->getValueType(0), Cond, TrueRes, FalseRes); 3495 N = User; 3496 ConstTrue = TrueRes; 3497 ConstFalse = FalseRes; 3498 } while (N->hasOneUse()); 3499 } 3500 3501 void PPCDAGToDAGISel::PreprocessISelDAG() { 3502 SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode()); 3503 ++Position; 3504 3505 bool MadeChange = false; 3506 while (Position != CurDAG->allnodes_begin()) { 3507 SDNode *N = &*--Position; 3508 if (N->use_empty()) 3509 continue; 3510 3511 SDValue Res; 3512 switch (N->getOpcode()) { 3513 default: break; 3514 case ISD::OR: 3515 Res = combineToCMPB(N); 3516 break; 3517 } 3518 3519 if (!Res) 3520 foldBoolExts(Res, N); 3521 3522 if (Res) { 3523 DEBUG(dbgs() << "PPC DAG preprocessing replacing:\nOld: "); 3524 DEBUG(N->dump(CurDAG)); 3525 DEBUG(dbgs() << "\nNew: "); 3526 DEBUG(Res.getNode()->dump(CurDAG)); 3527 DEBUG(dbgs() << "\n"); 3528 3529 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 3530 MadeChange = true; 3531 } 3532 } 3533 3534 if (MadeChange) 3535 CurDAG->RemoveDeadNodes(); 3536 } 3537 3538 /// PostprocessISelDAG - Perform some late peephole optimizations 3539 /// on the DAG representation. 3540 void PPCDAGToDAGISel::PostprocessISelDAG() { 3541 // Skip peepholes at -O0. 3542 if (TM.getOptLevel() == CodeGenOpt::None) 3543 return; 3544 3545 PeepholePPC64(); 3546 PeepholeCROps(); 3547 PeepholePPC64ZExt(); 3548 } 3549 3550 // Check if all users of this node will become isel where the second operand 3551 // is the constant zero. If this is so, and if we can negate the condition, 3552 // then we can flip the true and false operands. This will allow the zero to 3553 // be folded with the isel so that we don't need to materialize a register 3554 // containing zero. 3555 bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) { 3556 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); 3557 UI != UE; ++UI) { 3558 SDNode *User = *UI; 3559 if (!User->isMachineOpcode()) 3560 return false; 3561 if (User->getMachineOpcode() != PPC::SELECT_I4 && 3562 User->getMachineOpcode() != PPC::SELECT_I8) 3563 return false; 3564 3565 SDNode *Op2 = User->getOperand(2).getNode(); 3566 if (!Op2->isMachineOpcode()) 3567 return false; 3568 3569 if (Op2->getMachineOpcode() != PPC::LI && 3570 Op2->getMachineOpcode() != PPC::LI8) 3571 return false; 3572 3573 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op2->getOperand(0)); 3574 if (!C) 3575 return false; 3576 3577 if (!C->isNullValue()) 3578 return false; 3579 } 3580 3581 return true; 3582 } 3583 3584 void PPCDAGToDAGISel::SwapAllSelectUsers(SDNode *N) { 3585 SmallVector<SDNode *, 4> ToReplace; 3586 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); 3587 UI != UE; ++UI) { 3588 SDNode *User = *UI; 3589 assert((User->getMachineOpcode() == PPC::SELECT_I4 || 3590 User->getMachineOpcode() == PPC::SELECT_I8) && 3591 "Must have all select users"); 3592 ToReplace.push_back(User); 3593 } 3594 3595 for (SmallVector<SDNode *, 4>::iterator UI = ToReplace.begin(), 3596 UE = ToReplace.end(); UI != UE; ++UI) { 3597 SDNode *User = *UI; 3598 SDNode *ResNode = 3599 CurDAG->getMachineNode(User->getMachineOpcode(), SDLoc(User), 3600 User->getValueType(0), User->getOperand(0), 3601 User->getOperand(2), 3602 User->getOperand(1)); 3603 3604 DEBUG(dbgs() << "CR Peephole replacing:\nOld: "); 3605 DEBUG(User->dump(CurDAG)); 3606 DEBUG(dbgs() << "\nNew: "); 3607 DEBUG(ResNode->dump(CurDAG)); 3608 DEBUG(dbgs() << "\n"); 3609 3610 ReplaceUses(User, ResNode); 3611 } 3612 } 3613 3614 void PPCDAGToDAGISel::PeepholeCROps() { 3615 bool IsModified; 3616 do { 3617 IsModified = false; 3618 for (SDNode &Node : CurDAG->allnodes()) { 3619 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node); 3620 if (!MachineNode || MachineNode->use_empty()) 3621 continue; 3622 SDNode *ResNode = MachineNode; 3623 3624 bool Op1Set = false, Op1Unset = false, 3625 Op1Not = false, 3626 Op2Set = false, Op2Unset = false, 3627 Op2Not = false; 3628 3629 unsigned Opcode = MachineNode->getMachineOpcode(); 3630 switch (Opcode) { 3631 default: break; 3632 case PPC::CRAND: 3633 case PPC::CRNAND: 3634 case PPC::CROR: 3635 case PPC::CRXOR: 3636 case PPC::CRNOR: 3637 case PPC::CREQV: 3638 case PPC::CRANDC: 3639 case PPC::CRORC: { 3640 SDValue Op = MachineNode->getOperand(1); 3641 if (Op.isMachineOpcode()) { 3642 if (Op.getMachineOpcode() == PPC::CRSET) 3643 Op2Set = true; 3644 else if (Op.getMachineOpcode() == PPC::CRUNSET) 3645 Op2Unset = true; 3646 else if (Op.getMachineOpcode() == PPC::CRNOR && 3647 Op.getOperand(0) == Op.getOperand(1)) 3648 Op2Not = true; 3649 } 3650 LLVM_FALLTHROUGH; 3651 } 3652 case PPC::BC: 3653 case PPC::BCn: 3654 case PPC::SELECT_I4: 3655 case PPC::SELECT_I8: 3656 case PPC::SELECT_F4: 3657 case PPC::SELECT_F8: 3658 case PPC::SELECT_QFRC: 3659 case PPC::SELECT_QSRC: 3660 case PPC::SELECT_QBRC: 3661 case PPC::SELECT_VRRC: 3662 case PPC::SELECT_VSFRC: 3663 case PPC::SELECT_VSSRC: 3664 case PPC::SELECT_VSRC: { 3665 SDValue Op = MachineNode->getOperand(0); 3666 if (Op.isMachineOpcode()) { 3667 if (Op.getMachineOpcode() == PPC::CRSET) 3668 Op1Set = true; 3669 else if (Op.getMachineOpcode() == PPC::CRUNSET) 3670 Op1Unset = true; 3671 else if (Op.getMachineOpcode() == PPC::CRNOR && 3672 Op.getOperand(0) == Op.getOperand(1)) 3673 Op1Not = true; 3674 } 3675 } 3676 break; 3677 } 3678 3679 bool SelectSwap = false; 3680 switch (Opcode) { 3681 default: break; 3682 case PPC::CRAND: 3683 if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) 3684 // x & x = x 3685 ResNode = MachineNode->getOperand(0).getNode(); 3686 else if (Op1Set) 3687 // 1 & y = y 3688 ResNode = MachineNode->getOperand(1).getNode(); 3689 else if (Op2Set) 3690 // x & 1 = x 3691 ResNode = MachineNode->getOperand(0).getNode(); 3692 else if (Op1Unset || Op2Unset) 3693 // x & 0 = 0 & y = 0 3694 ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode), 3695 MVT::i1); 3696 else if (Op1Not) 3697 // ~x & y = andc(y, x) 3698 ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode), 3699 MVT::i1, MachineNode->getOperand(1), 3700 MachineNode->getOperand(0). 3701 getOperand(0)); 3702 else if (Op2Not) 3703 // x & ~y = andc(x, y) 3704 ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode), 3705 MVT::i1, MachineNode->getOperand(0), 3706 MachineNode->getOperand(1). 3707 getOperand(0)); 3708 else if (AllUsersSelectZero(MachineNode)) { 3709 ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode), 3710 MVT::i1, MachineNode->getOperand(0), 3711 MachineNode->getOperand(1)); 3712 SelectSwap = true; 3713 } 3714 break; 3715 case PPC::CRNAND: 3716 if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) 3717 // nand(x, x) -> nor(x, x) 3718 ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), 3719 MVT::i1, MachineNode->getOperand(0), 3720 MachineNode->getOperand(0)); 3721 else if (Op1Set) 3722 // nand(1, y) -> nor(y, y) 3723 ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), 3724 MVT::i1, MachineNode->getOperand(1), 3725 MachineNode->getOperand(1)); 3726 else if (Op2Set) 3727 // nand(x, 1) -> nor(x, x) 3728 ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), 3729 MVT::i1, MachineNode->getOperand(0), 3730 MachineNode->getOperand(0)); 3731 else if (Op1Unset || Op2Unset) 3732 // nand(x, 0) = nand(0, y) = 1 3733 ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode), 3734 MVT::i1); 3735 else if (Op1Not) 3736 // nand(~x, y) = ~(~x & y) = x | ~y = orc(x, y) 3737 ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode), 3738 MVT::i1, MachineNode->getOperand(0). 3739 getOperand(0), 3740 MachineNode->getOperand(1)); 3741 else if (Op2Not) 3742 // nand(x, ~y) = ~x | y = orc(y, x) 3743 ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode), 3744 MVT::i1, MachineNode->getOperand(1). 3745 getOperand(0), 3746 MachineNode->getOperand(0)); 3747 else if (AllUsersSelectZero(MachineNode)) { 3748 ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode), 3749 MVT::i1, MachineNode->getOperand(0), 3750 MachineNode->getOperand(1)); 3751 SelectSwap = true; 3752 } 3753 break; 3754 case PPC::CROR: 3755 if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) 3756 // x | x = x 3757 ResNode = MachineNode->getOperand(0).getNode(); 3758 else if (Op1Set || Op2Set) 3759 // x | 1 = 1 | y = 1 3760 ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode), 3761 MVT::i1); 3762 else if (Op1Unset) 3763 // 0 | y = y 3764 ResNode = MachineNode->getOperand(1).getNode(); 3765 else if (Op2Unset) 3766 // x | 0 = x 3767 ResNode = MachineNode->getOperand(0).getNode(); 3768 else if (Op1Not) 3769 // ~x | y = orc(y, x) 3770 ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode), 3771 MVT::i1, MachineNode->getOperand(1), 3772 MachineNode->getOperand(0). 3773 getOperand(0)); 3774 else if (Op2Not) 3775 // x | ~y = orc(x, y) 3776 ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode), 3777 MVT::i1, MachineNode->getOperand(0), 3778 MachineNode->getOperand(1). 3779 getOperand(0)); 3780 else if (AllUsersSelectZero(MachineNode)) { 3781 ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), 3782 MVT::i1, MachineNode->getOperand(0), 3783 MachineNode->getOperand(1)); 3784 SelectSwap = true; 3785 } 3786 break; 3787 case PPC::CRXOR: 3788 if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) 3789 // xor(x, x) = 0 3790 ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode), 3791 MVT::i1); 3792 else if (Op1Set) 3793 // xor(1, y) -> nor(y, y) 3794 ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), 3795 MVT::i1, MachineNode->getOperand(1), 3796 MachineNode->getOperand(1)); 3797 else if (Op2Set) 3798 // xor(x, 1) -> nor(x, x) 3799 ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), 3800 MVT::i1, MachineNode->getOperand(0), 3801 MachineNode->getOperand(0)); 3802 else if (Op1Unset) 3803 // xor(0, y) = y 3804 ResNode = MachineNode->getOperand(1).getNode(); 3805 else if (Op2Unset) 3806 // xor(x, 0) = x 3807 ResNode = MachineNode->getOperand(0).getNode(); 3808 else if (Op1Not) 3809 // xor(~x, y) = eqv(x, y) 3810 ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode), 3811 MVT::i1, MachineNode->getOperand(0). 3812 getOperand(0), 3813 MachineNode->getOperand(1)); 3814 else if (Op2Not) 3815 // xor(x, ~y) = eqv(x, y) 3816 ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode), 3817 MVT::i1, MachineNode->getOperand(0), 3818 MachineNode->getOperand(1). 3819 getOperand(0)); 3820 else if (AllUsersSelectZero(MachineNode)) { 3821 ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode), 3822 MVT::i1, MachineNode->getOperand(0), 3823 MachineNode->getOperand(1)); 3824 SelectSwap = true; 3825 } 3826 break; 3827 case PPC::CRNOR: 3828 if (Op1Set || Op2Set) 3829 // nor(1, y) -> 0 3830 ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode), 3831 MVT::i1); 3832 else if (Op1Unset) 3833 // nor(0, y) = ~y -> nor(y, y) 3834 ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), 3835 MVT::i1, MachineNode->getOperand(1), 3836 MachineNode->getOperand(1)); 3837 else if (Op2Unset) 3838 // nor(x, 0) = ~x 3839 ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), 3840 MVT::i1, MachineNode->getOperand(0), 3841 MachineNode->getOperand(0)); 3842 else if (Op1Not) 3843 // nor(~x, y) = andc(x, y) 3844 ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode), 3845 MVT::i1, MachineNode->getOperand(0). 3846 getOperand(0), 3847 MachineNode->getOperand(1)); 3848 else if (Op2Not) 3849 // nor(x, ~y) = andc(y, x) 3850 ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode), 3851 MVT::i1, MachineNode->getOperand(1). 3852 getOperand(0), 3853 MachineNode->getOperand(0)); 3854 else if (AllUsersSelectZero(MachineNode)) { 3855 ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode), 3856 MVT::i1, MachineNode->getOperand(0), 3857 MachineNode->getOperand(1)); 3858 SelectSwap = true; 3859 } 3860 break; 3861 case PPC::CREQV: 3862 if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) 3863 // eqv(x, x) = 1 3864 ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode), 3865 MVT::i1); 3866 else if (Op1Set) 3867 // eqv(1, y) = y 3868 ResNode = MachineNode->getOperand(1).getNode(); 3869 else if (Op2Set) 3870 // eqv(x, 1) = x 3871 ResNode = MachineNode->getOperand(0).getNode(); 3872 else if (Op1Unset) 3873 // eqv(0, y) = ~y -> nor(y, y) 3874 ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), 3875 MVT::i1, MachineNode->getOperand(1), 3876 MachineNode->getOperand(1)); 3877 else if (Op2Unset) 3878 // eqv(x, 0) = ~x 3879 ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), 3880 MVT::i1, MachineNode->getOperand(0), 3881 MachineNode->getOperand(0)); 3882 else if (Op1Not) 3883 // eqv(~x, y) = xor(x, y) 3884 ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode), 3885 MVT::i1, MachineNode->getOperand(0). 3886 getOperand(0), 3887 MachineNode->getOperand(1)); 3888 else if (Op2Not) 3889 // eqv(x, ~y) = xor(x, y) 3890 ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode), 3891 MVT::i1, MachineNode->getOperand(0), 3892 MachineNode->getOperand(1). 3893 getOperand(0)); 3894 else if (AllUsersSelectZero(MachineNode)) { 3895 ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode), 3896 MVT::i1, MachineNode->getOperand(0), 3897 MachineNode->getOperand(1)); 3898 SelectSwap = true; 3899 } 3900 break; 3901 case PPC::CRANDC: 3902 if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) 3903 // andc(x, x) = 0 3904 ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode), 3905 MVT::i1); 3906 else if (Op1Set) 3907 // andc(1, y) = ~y 3908 ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), 3909 MVT::i1, MachineNode->getOperand(1), 3910 MachineNode->getOperand(1)); 3911 else if (Op1Unset || Op2Set) 3912 // andc(0, y) = andc(x, 1) = 0 3913 ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode), 3914 MVT::i1); 3915 else if (Op2Unset) 3916 // andc(x, 0) = x 3917 ResNode = MachineNode->getOperand(0).getNode(); 3918 else if (Op1Not) 3919 // andc(~x, y) = ~(x | y) = nor(x, y) 3920 ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), 3921 MVT::i1, MachineNode->getOperand(0). 3922 getOperand(0), 3923 MachineNode->getOperand(1)); 3924 else if (Op2Not) 3925 // andc(x, ~y) = x & y 3926 ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode), 3927 MVT::i1, MachineNode->getOperand(0), 3928 MachineNode->getOperand(1). 3929 getOperand(0)); 3930 else if (AllUsersSelectZero(MachineNode)) { 3931 ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode), 3932 MVT::i1, MachineNode->getOperand(1), 3933 MachineNode->getOperand(0)); 3934 SelectSwap = true; 3935 } 3936 break; 3937 case PPC::CRORC: 3938 if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) 3939 // orc(x, x) = 1 3940 ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode), 3941 MVT::i1); 3942 else if (Op1Set || Op2Unset) 3943 // orc(1, y) = orc(x, 0) = 1 3944 ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode), 3945 MVT::i1); 3946 else if (Op2Set) 3947 // orc(x, 1) = x 3948 ResNode = MachineNode->getOperand(0).getNode(); 3949 else if (Op1Unset) 3950 // orc(0, y) = ~y 3951 ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), 3952 MVT::i1, MachineNode->getOperand(1), 3953 MachineNode->getOperand(1)); 3954 else if (Op1Not) 3955 // orc(~x, y) = ~(x & y) = nand(x, y) 3956 ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode), 3957 MVT::i1, MachineNode->getOperand(0). 3958 getOperand(0), 3959 MachineNode->getOperand(1)); 3960 else if (Op2Not) 3961 // orc(x, ~y) = x | y 3962 ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode), 3963 MVT::i1, MachineNode->getOperand(0), 3964 MachineNode->getOperand(1). 3965 getOperand(0)); 3966 else if (AllUsersSelectZero(MachineNode)) { 3967 ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode), 3968 MVT::i1, MachineNode->getOperand(1), 3969 MachineNode->getOperand(0)); 3970 SelectSwap = true; 3971 } 3972 break; 3973 case PPC::SELECT_I4: 3974 case PPC::SELECT_I8: 3975 case PPC::SELECT_F4: 3976 case PPC::SELECT_F8: 3977 case PPC::SELECT_QFRC: 3978 case PPC::SELECT_QSRC: 3979 case PPC::SELECT_QBRC: 3980 case PPC::SELECT_VRRC: 3981 case PPC::SELECT_VSFRC: 3982 case PPC::SELECT_VSSRC: 3983 case PPC::SELECT_VSRC: 3984 if (Op1Set) 3985 ResNode = MachineNode->getOperand(1).getNode(); 3986 else if (Op1Unset) 3987 ResNode = MachineNode->getOperand(2).getNode(); 3988 else if (Op1Not) 3989 ResNode = CurDAG->getMachineNode(MachineNode->getMachineOpcode(), 3990 SDLoc(MachineNode), 3991 MachineNode->getValueType(0), 3992 MachineNode->getOperand(0). 3993 getOperand(0), 3994 MachineNode->getOperand(2), 3995 MachineNode->getOperand(1)); 3996 break; 3997 case PPC::BC: 3998 case PPC::BCn: 3999 if (Op1Not) 4000 ResNode = CurDAG->getMachineNode(Opcode == PPC::BC ? PPC::BCn : 4001 PPC::BC, 4002 SDLoc(MachineNode), 4003 MVT::Other, 4004 MachineNode->getOperand(0). 4005 getOperand(0), 4006 MachineNode->getOperand(1), 4007 MachineNode->getOperand(2)); 4008 // FIXME: Handle Op1Set, Op1Unset here too. 4009 break; 4010 } 4011 4012 // If we're inverting this node because it is used only by selects that 4013 // we'd like to swap, then swap the selects before the node replacement. 4014 if (SelectSwap) 4015 SwapAllSelectUsers(MachineNode); 4016 4017 if (ResNode != MachineNode) { 4018 DEBUG(dbgs() << "CR Peephole replacing:\nOld: "); 4019 DEBUG(MachineNode->dump(CurDAG)); 4020 DEBUG(dbgs() << "\nNew: "); 4021 DEBUG(ResNode->dump(CurDAG)); 4022 DEBUG(dbgs() << "\n"); 4023 4024 ReplaceUses(MachineNode, ResNode); 4025 IsModified = true; 4026 } 4027 } 4028 if (IsModified) 4029 CurDAG->RemoveDeadNodes(); 4030 } while (IsModified); 4031 } 4032 4033 // Gather the set of 32-bit operations that are known to have their 4034 // higher-order 32 bits zero, where ToPromote contains all such operations. 4035 static bool PeepholePPC64ZExtGather(SDValue Op32, 4036 SmallPtrSetImpl<SDNode *> &ToPromote) { 4037 if (!Op32.isMachineOpcode()) 4038 return false; 4039 4040 // First, check for the "frontier" instructions (those that will clear the 4041 // higher-order 32 bits. 4042 4043 // For RLWINM and RLWNM, we need to make sure that the mask does not wrap 4044 // around. If it does not, then these instructions will clear the 4045 // higher-order bits. 4046 if ((Op32.getMachineOpcode() == PPC::RLWINM || 4047 Op32.getMachineOpcode() == PPC::RLWNM) && 4048 Op32.getConstantOperandVal(2) <= Op32.getConstantOperandVal(3)) { 4049 ToPromote.insert(Op32.getNode()); 4050 return true; 4051 } 4052 4053 // SLW and SRW always clear the higher-order bits. 4054 if (Op32.getMachineOpcode() == PPC::SLW || 4055 Op32.getMachineOpcode() == PPC::SRW) { 4056 ToPromote.insert(Op32.getNode()); 4057 return true; 4058 } 4059 4060 // For LI and LIS, we need the immediate to be positive (so that it is not 4061 // sign extended). 4062 if (Op32.getMachineOpcode() == PPC::LI || 4063 Op32.getMachineOpcode() == PPC::LIS) { 4064 if (!isUInt<15>(Op32.getConstantOperandVal(0))) 4065 return false; 4066 4067 ToPromote.insert(Op32.getNode()); 4068 return true; 4069 } 4070 4071 // LHBRX and LWBRX always clear the higher-order bits. 4072 if (Op32.getMachineOpcode() == PPC::LHBRX || 4073 Op32.getMachineOpcode() == PPC::LWBRX) { 4074 ToPromote.insert(Op32.getNode()); 4075 return true; 4076 } 4077 4078 // CNT[LT]ZW always produce a 64-bit value in [0,32], and so is zero extended. 4079 if (Op32.getMachineOpcode() == PPC::CNTLZW || 4080 Op32.getMachineOpcode() == PPC::CNTTZW) { 4081 ToPromote.insert(Op32.getNode()); 4082 return true; 4083 } 4084 4085 // Next, check for those instructions we can look through. 4086 4087 // Assuming the mask does not wrap around, then the higher-order bits are 4088 // taken directly from the first operand. 4089 if (Op32.getMachineOpcode() == PPC::RLWIMI && 4090 Op32.getConstantOperandVal(3) <= Op32.getConstantOperandVal(4)) { 4091 SmallPtrSet<SDNode *, 16> ToPromote1; 4092 if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1)) 4093 return false; 4094 4095 ToPromote.insert(Op32.getNode()); 4096 ToPromote.insert(ToPromote1.begin(), ToPromote1.end()); 4097 return true; 4098 } 4099 4100 // For OR, the higher-order bits are zero if that is true for both operands. 4101 // For SELECT_I4, the same is true (but the relevant operand numbers are 4102 // shifted by 1). 4103 if (Op32.getMachineOpcode() == PPC::OR || 4104 Op32.getMachineOpcode() == PPC::SELECT_I4) { 4105 unsigned B = Op32.getMachineOpcode() == PPC::SELECT_I4 ? 1 : 0; 4106 SmallPtrSet<SDNode *, 16> ToPromote1; 4107 if (!PeepholePPC64ZExtGather(Op32.getOperand(B+0), ToPromote1)) 4108 return false; 4109 if (!PeepholePPC64ZExtGather(Op32.getOperand(B+1), ToPromote1)) 4110 return false; 4111 4112 ToPromote.insert(Op32.getNode()); 4113 ToPromote.insert(ToPromote1.begin(), ToPromote1.end()); 4114 return true; 4115 } 4116 4117 // For ORI and ORIS, we need the higher-order bits of the first operand to be 4118 // zero, and also for the constant to be positive (so that it is not sign 4119 // extended). 4120 if (Op32.getMachineOpcode() == PPC::ORI || 4121 Op32.getMachineOpcode() == PPC::ORIS) { 4122 SmallPtrSet<SDNode *, 16> ToPromote1; 4123 if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1)) 4124 return false; 4125 if (!isUInt<15>(Op32.getConstantOperandVal(1))) 4126 return false; 4127 4128 ToPromote.insert(Op32.getNode()); 4129 ToPromote.insert(ToPromote1.begin(), ToPromote1.end()); 4130 return true; 4131 } 4132 4133 // The higher-order bits of AND are zero if that is true for at least one of 4134 // the operands. 4135 if (Op32.getMachineOpcode() == PPC::AND) { 4136 SmallPtrSet<SDNode *, 16> ToPromote1, ToPromote2; 4137 bool Op0OK = 4138 PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1); 4139 bool Op1OK = 4140 PeepholePPC64ZExtGather(Op32.getOperand(1), ToPromote2); 4141 if (!Op0OK && !Op1OK) 4142 return false; 4143 4144 ToPromote.insert(Op32.getNode()); 4145 4146 if (Op0OK) 4147 ToPromote.insert(ToPromote1.begin(), ToPromote1.end()); 4148 4149 if (Op1OK) 4150 ToPromote.insert(ToPromote2.begin(), ToPromote2.end()); 4151 4152 return true; 4153 } 4154 4155 // For ANDI and ANDIS, the higher-order bits are zero if either that is true 4156 // of the first operand, or if the second operand is positive (so that it is 4157 // not sign extended). 4158 if (Op32.getMachineOpcode() == PPC::ANDIo || 4159 Op32.getMachineOpcode() == PPC::ANDISo) { 4160 SmallPtrSet<SDNode *, 16> ToPromote1; 4161 bool Op0OK = 4162 PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1); 4163 bool Op1OK = isUInt<15>(Op32.getConstantOperandVal(1)); 4164 if (!Op0OK && !Op1OK) 4165 return false; 4166 4167 ToPromote.insert(Op32.getNode()); 4168 4169 if (Op0OK) 4170 ToPromote.insert(ToPromote1.begin(), ToPromote1.end()); 4171 4172 return true; 4173 } 4174 4175 return false; 4176 } 4177 4178 void PPCDAGToDAGISel::PeepholePPC64ZExt() { 4179 if (!PPCSubTarget->isPPC64()) 4180 return; 4181 4182 // When we zero-extend from i32 to i64, we use a pattern like this: 4183 // def : Pat<(i64 (zext i32:$in)), 4184 // (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32), 4185 // 0, 32)>; 4186 // There are several 32-bit shift/rotate instructions, however, that will 4187 // clear the higher-order bits of their output, rendering the RLDICL 4188 // unnecessary. When that happens, we remove it here, and redefine the 4189 // relevant 32-bit operation to be a 64-bit operation. 4190 4191 SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode()); 4192 ++Position; 4193 4194 bool MadeChange = false; 4195 while (Position != CurDAG->allnodes_begin()) { 4196 SDNode *N = &*--Position; 4197 // Skip dead nodes and any non-machine opcodes. 4198 if (N->use_empty() || !N->isMachineOpcode()) 4199 continue; 4200 4201 if (N->getMachineOpcode() != PPC::RLDICL) 4202 continue; 4203 4204 if (N->getConstantOperandVal(1) != 0 || 4205 N->getConstantOperandVal(2) != 32) 4206 continue; 4207 4208 SDValue ISR = N->getOperand(0); 4209 if (!ISR.isMachineOpcode() || 4210 ISR.getMachineOpcode() != TargetOpcode::INSERT_SUBREG) 4211 continue; 4212 4213 if (!ISR.hasOneUse()) 4214 continue; 4215 4216 if (ISR.getConstantOperandVal(2) != PPC::sub_32) 4217 continue; 4218 4219 SDValue IDef = ISR.getOperand(0); 4220 if (!IDef.isMachineOpcode() || 4221 IDef.getMachineOpcode() != TargetOpcode::IMPLICIT_DEF) 4222 continue; 4223 4224 // We now know that we're looking at a canonical i32 -> i64 zext. See if we 4225 // can get rid of it. 4226 4227 SDValue Op32 = ISR->getOperand(1); 4228 if (!Op32.isMachineOpcode()) 4229 continue; 4230 4231 // There are some 32-bit instructions that always clear the high-order 32 4232 // bits, there are also some instructions (like AND) that we can look 4233 // through. 4234 SmallPtrSet<SDNode *, 16> ToPromote; 4235 if (!PeepholePPC64ZExtGather(Op32, ToPromote)) 4236 continue; 4237 4238 // If the ToPromote set contains nodes that have uses outside of the set 4239 // (except for the original INSERT_SUBREG), then abort the transformation. 4240 bool OutsideUse = false; 4241 for (SDNode *PN : ToPromote) { 4242 for (SDNode *UN : PN->uses()) { 4243 if (!ToPromote.count(UN) && UN != ISR.getNode()) { 4244 OutsideUse = true; 4245 break; 4246 } 4247 } 4248 4249 if (OutsideUse) 4250 break; 4251 } 4252 if (OutsideUse) 4253 continue; 4254 4255 MadeChange = true; 4256 4257 // We now know that this zero extension can be removed by promoting to 4258 // nodes in ToPromote to 64-bit operations, where for operations in the 4259 // frontier of the set, we need to insert INSERT_SUBREGs for their 4260 // operands. 4261 for (SDNode *PN : ToPromote) { 4262 unsigned NewOpcode; 4263 switch (PN->getMachineOpcode()) { 4264 default: 4265 llvm_unreachable("Don't know the 64-bit variant of this instruction"); 4266 case PPC::RLWINM: NewOpcode = PPC::RLWINM8; break; 4267 case PPC::RLWNM: NewOpcode = PPC::RLWNM8; break; 4268 case PPC::SLW: NewOpcode = PPC::SLW8; break; 4269 case PPC::SRW: NewOpcode = PPC::SRW8; break; 4270 case PPC::LI: NewOpcode = PPC::LI8; break; 4271 case PPC::LIS: NewOpcode = PPC::LIS8; break; 4272 case PPC::LHBRX: NewOpcode = PPC::LHBRX8; break; 4273 case PPC::LWBRX: NewOpcode = PPC::LWBRX8; break; 4274 case PPC::CNTLZW: NewOpcode = PPC::CNTLZW8; break; 4275 case PPC::CNTTZW: NewOpcode = PPC::CNTTZW8; break; 4276 case PPC::RLWIMI: NewOpcode = PPC::RLWIMI8; break; 4277 case PPC::OR: NewOpcode = PPC::OR8; break; 4278 case PPC::SELECT_I4: NewOpcode = PPC::SELECT_I8; break; 4279 case PPC::ORI: NewOpcode = PPC::ORI8; break; 4280 case PPC::ORIS: NewOpcode = PPC::ORIS8; break; 4281 case PPC::AND: NewOpcode = PPC::AND8; break; 4282 case PPC::ANDIo: NewOpcode = PPC::ANDIo8; break; 4283 case PPC::ANDISo: NewOpcode = PPC::ANDISo8; break; 4284 } 4285 4286 // Note: During the replacement process, the nodes will be in an 4287 // inconsistent state (some instructions will have operands with values 4288 // of the wrong type). Once done, however, everything should be right 4289 // again. 4290 4291 SmallVector<SDValue, 4> Ops; 4292 for (const SDValue &V : PN->ops()) { 4293 if (!ToPromote.count(V.getNode()) && V.getValueType() == MVT::i32 && 4294 !isa<ConstantSDNode>(V)) { 4295 SDValue ReplOpOps[] = { ISR.getOperand(0), V, ISR.getOperand(2) }; 4296 SDNode *ReplOp = 4297 CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, SDLoc(V), 4298 ISR.getNode()->getVTList(), ReplOpOps); 4299 Ops.push_back(SDValue(ReplOp, 0)); 4300 } else { 4301 Ops.push_back(V); 4302 } 4303 } 4304 4305 // Because all to-be-promoted nodes only have users that are other 4306 // promoted nodes (or the original INSERT_SUBREG), we can safely replace 4307 // the i32 result value type with i64. 4308 4309 SmallVector<EVT, 2> NewVTs; 4310 SDVTList VTs = PN->getVTList(); 4311 for (unsigned i = 0, ie = VTs.NumVTs; i != ie; ++i) 4312 if (VTs.VTs[i] == MVT::i32) 4313 NewVTs.push_back(MVT::i64); 4314 else 4315 NewVTs.push_back(VTs.VTs[i]); 4316 4317 DEBUG(dbgs() << "PPC64 ZExt Peephole morphing:\nOld: "); 4318 DEBUG(PN->dump(CurDAG)); 4319 4320 CurDAG->SelectNodeTo(PN, NewOpcode, CurDAG->getVTList(NewVTs), Ops); 4321 4322 DEBUG(dbgs() << "\nNew: "); 4323 DEBUG(PN->dump(CurDAG)); 4324 DEBUG(dbgs() << "\n"); 4325 } 4326 4327 // Now we replace the original zero extend and its associated INSERT_SUBREG 4328 // with the value feeding the INSERT_SUBREG (which has now been promoted to 4329 // return an i64). 4330 4331 DEBUG(dbgs() << "PPC64 ZExt Peephole replacing:\nOld: "); 4332 DEBUG(N->dump(CurDAG)); 4333 DEBUG(dbgs() << "\nNew: "); 4334 DEBUG(Op32.getNode()->dump(CurDAG)); 4335 DEBUG(dbgs() << "\n"); 4336 4337 ReplaceUses(N, Op32.getNode()); 4338 } 4339 4340 if (MadeChange) 4341 CurDAG->RemoveDeadNodes(); 4342 } 4343 4344 void PPCDAGToDAGISel::PeepholePPC64() { 4345 // These optimizations are currently supported only for 64-bit SVR4. 4346 if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64()) 4347 return; 4348 4349 SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode()); 4350 ++Position; 4351 4352 while (Position != CurDAG->allnodes_begin()) { 4353 SDNode *N = &*--Position; 4354 // Skip dead nodes and any non-machine opcodes. 4355 if (N->use_empty() || !N->isMachineOpcode()) 4356 continue; 4357 4358 unsigned FirstOp; 4359 unsigned StorageOpcode = N->getMachineOpcode(); 4360 4361 switch (StorageOpcode) { 4362 default: continue; 4363 4364 case PPC::LBZ: 4365 case PPC::LBZ8: 4366 case PPC::LD: 4367 case PPC::LFD: 4368 case PPC::LFS: 4369 case PPC::LHA: 4370 case PPC::LHA8: 4371 case PPC::LHZ: 4372 case PPC::LHZ8: 4373 case PPC::LWA: 4374 case PPC::LWZ: 4375 case PPC::LWZ8: 4376 FirstOp = 0; 4377 break; 4378 4379 case PPC::STB: 4380 case PPC::STB8: 4381 case PPC::STD: 4382 case PPC::STFD: 4383 case PPC::STFS: 4384 case PPC::STH: 4385 case PPC::STH8: 4386 case PPC::STW: 4387 case PPC::STW8: 4388 FirstOp = 1; 4389 break; 4390 } 4391 4392 // If this is a load or store with a zero offset, or within the alignment, 4393 // we may be able to fold an add-immediate into the memory operation. 4394 // The check against alignment is below, as it can't occur until we check 4395 // the arguments to N 4396 if (!isa<ConstantSDNode>(N->getOperand(FirstOp))) 4397 continue; 4398 4399 SDValue Base = N->getOperand(FirstOp + 1); 4400 if (!Base.isMachineOpcode()) 4401 continue; 4402 4403 unsigned Flags = 0; 4404 bool ReplaceFlags = true; 4405 4406 // When the feeding operation is an add-immediate of some sort, 4407 // determine whether we need to add relocation information to the 4408 // target flags on the immediate operand when we fold it into the 4409 // load instruction. 4410 // 4411 // For something like ADDItocL, the relocation information is 4412 // inferred from the opcode; when we process it in the AsmPrinter, 4413 // we add the necessary relocation there. A load, though, can receive 4414 // relocation from various flavors of ADDIxxx, so we need to carry 4415 // the relocation information in the target flags. 4416 switch (Base.getMachineOpcode()) { 4417 default: continue; 4418 4419 case PPC::ADDI8: 4420 case PPC::ADDI: 4421 // In some cases (such as TLS) the relocation information 4422 // is already in place on the operand, so copying the operand 4423 // is sufficient. 4424 ReplaceFlags = false; 4425 // For these cases, the immediate may not be divisible by 4, in 4426 // which case the fold is illegal for DS-form instructions. (The 4427 // other cases provide aligned addresses and are always safe.) 4428 if ((StorageOpcode == PPC::LWA || 4429 StorageOpcode == PPC::LD || 4430 StorageOpcode == PPC::STD) && 4431 (!isa<ConstantSDNode>(Base.getOperand(1)) || 4432 Base.getConstantOperandVal(1) % 4 != 0)) 4433 continue; 4434 break; 4435 case PPC::ADDIdtprelL: 4436 Flags = PPCII::MO_DTPREL_LO; 4437 break; 4438 case PPC::ADDItlsldL: 4439 Flags = PPCII::MO_TLSLD_LO; 4440 break; 4441 case PPC::ADDItocL: 4442 Flags = PPCII::MO_TOC_LO; 4443 break; 4444 } 4445 4446 SDValue ImmOpnd = Base.getOperand(1); 4447 4448 // On PPC64, the TOC base pointer is guaranteed by the ABI only to have 4449 // 8-byte alignment, and so we can only use offsets less than 8 (otherwise, 4450 // we might have needed different @ha relocation values for the offset 4451 // pointers). 4452 int MaxDisplacement = 7; 4453 if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) { 4454 const GlobalValue *GV = GA->getGlobal(); 4455 MaxDisplacement = std::min((int) GV->getAlignment() - 1, MaxDisplacement); 4456 } 4457 4458 bool UpdateHBase = false; 4459 SDValue HBase = Base.getOperand(0); 4460 4461 int Offset = N->getConstantOperandVal(FirstOp); 4462 if (ReplaceFlags) { 4463 if (Offset < 0 || Offset > MaxDisplacement) { 4464 // If we have a addi(toc@l)/addis(toc@ha) pair, and the addis has only 4465 // one use, then we can do this for any offset, we just need to also 4466 // update the offset (i.e. the symbol addend) on the addis also. 4467 if (Base.getMachineOpcode() != PPC::ADDItocL) 4468 continue; 4469 4470 if (!HBase.isMachineOpcode() || 4471 HBase.getMachineOpcode() != PPC::ADDIStocHA) 4472 continue; 4473 4474 if (!Base.hasOneUse() || !HBase.hasOneUse()) 4475 continue; 4476 4477 SDValue HImmOpnd = HBase.getOperand(1); 4478 if (HImmOpnd != ImmOpnd) 4479 continue; 4480 4481 UpdateHBase = true; 4482 } 4483 } else { 4484 // If we're directly folding the addend from an addi instruction, then: 4485 // 1. In general, the offset on the memory access must be zero. 4486 // 2. If the addend is a constant, then it can be combined with a 4487 // non-zero offset, but only if the result meets the encoding 4488 // requirements. 4489 if (auto *C = dyn_cast<ConstantSDNode>(ImmOpnd)) { 4490 Offset += C->getSExtValue(); 4491 4492 if ((StorageOpcode == PPC::LWA || StorageOpcode == PPC::LD || 4493 StorageOpcode == PPC::STD) && (Offset % 4) != 0) 4494 continue; 4495 4496 if (!isInt<16>(Offset)) 4497 continue; 4498 4499 ImmOpnd = CurDAG->getTargetConstant(Offset, SDLoc(ImmOpnd), 4500 ImmOpnd.getValueType()); 4501 } else if (Offset != 0) { 4502 continue; 4503 } 4504 } 4505 4506 // We found an opportunity. Reverse the operands from the add 4507 // immediate and substitute them into the load or store. If 4508 // needed, update the target flags for the immediate operand to 4509 // reflect the necessary relocation information. 4510 DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: "); 4511 DEBUG(Base->dump(CurDAG)); 4512 DEBUG(dbgs() << "\nN: "); 4513 DEBUG(N->dump(CurDAG)); 4514 DEBUG(dbgs() << "\n"); 4515 4516 // If the relocation information isn't already present on the 4517 // immediate operand, add it now. 4518 if (ReplaceFlags) { 4519 if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) { 4520 SDLoc dl(GA); 4521 const GlobalValue *GV = GA->getGlobal(); 4522 // We can't perform this optimization for data whose alignment 4523 // is insufficient for the instruction encoding. 4524 if (GV->getAlignment() < 4 && 4525 (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD || 4526 StorageOpcode == PPC::LWA || (Offset % 4) != 0)) { 4527 DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n"); 4528 continue; 4529 } 4530 ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, Offset, Flags); 4531 } else if (ConstantPoolSDNode *CP = 4532 dyn_cast<ConstantPoolSDNode>(ImmOpnd)) { 4533 const Constant *C = CP->getConstVal(); 4534 ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64, 4535 CP->getAlignment(), 4536 Offset, Flags); 4537 } 4538 } 4539 4540 if (FirstOp == 1) // Store 4541 (void)CurDAG->UpdateNodeOperands(N, N->getOperand(0), ImmOpnd, 4542 Base.getOperand(0), N->getOperand(3)); 4543 else // Load 4544 (void)CurDAG->UpdateNodeOperands(N, ImmOpnd, Base.getOperand(0), 4545 N->getOperand(2)); 4546 4547 if (UpdateHBase) 4548 (void)CurDAG->UpdateNodeOperands(HBase.getNode(), HBase.getOperand(0), 4549 ImmOpnd); 4550 4551 // The add-immediate may now be dead, in which case remove it. 4552 if (Base.getNode()->use_empty()) 4553 CurDAG->RemoveDeadNode(Base.getNode()); 4554 } 4555 } 4556 4557 /// createPPCISelDag - This pass converts a legalized DAG into a 4558 /// PowerPC-specific DAG, ready for instruction scheduling. 4559 /// 4560 FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) { 4561 return new PPCDAGToDAGISel(TM); 4562 } 4563