//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the PPCISelLowering class.
//
//===----------------------------------------------------------------------===//

#include "PPCISelLowering.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
#include "PPCCCState.h"
#include "PPCCallingConv.h"
#include "PPCFrameLowering.h"
#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
#include "PPCRegisterInfo.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <list>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "ppc-lowering"

static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
    cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);

static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
    cl::desc("disable setting the node scheduling preference to ILP on PPC"),
    cl::Hidden);

static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
    cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);

static cl::opt<bool> DisableSCO("disable-ppc-sco",
    cl::desc("disable sibling call optimization on ppc"), cl::Hidden);

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");

// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;

PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
                                     const PPCSubtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  // Use _setjmp/_longjmp instead of setjmp/longjmp.
  setUseUnderscoreSetJmp(true);
  setUseUnderscoreLongJmp(true);

  // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
  // arguments are at least 4/8 bytes aligned.
  bool isPPC64 = Subtarget.isPPC64();
  setMinStackArgumentAlignment(isPPC64 ? 8 : 4);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
  if (!useSoftFloat()) {
    addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
    addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
  }

  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
  }

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PowerPC has pre-inc loads and stores.
  setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);

  if (Subtarget.useCRBits()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

    if (isPPC64 || Subtarget.hasFPCVT()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType(ISD::SINT_TO_FP, MVT::i1,
                        isPPC64 ? MVT::i64 : MVT::i32);
      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType(ISD::UINT_TO_FP, MVT::i1,
                        isPPC64 ? MVT::i64 : MVT::i32);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
    }

    // PowerPC does not support direct load/store of condition registers.
    setOperationAction(ISD::LOAD, MVT::i1, Custom);
    setOperationAction(ISD::STORE, MVT::i1, Custom);

    // FIXME: Remove this once the ANDI glue bug is fixed:
    if (ANDIGlueBug)
      setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);

    for (MVT VT : MVT::integer_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
      setTruncStoreAction(VT, MVT::i1, Expand);
    }

    addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
  }

  // This is used in the ppcf128->int sequence. Note it has different semantics
  // from FP_ROUND: that rounds to nearest, this rounds to zero.
  setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);

  // We do not currently implement these libm ops for PowerPC.
  setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
  setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand);
  setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
  setOperationAction(ISD::FRINT, MVT::ppcf128, Expand);
  setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
  setOperationAction(ISD::FREM, MVT::ppcf128, Expand);

  // PowerPC has no SREM/UREM instructions.
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);

  // We don't support sin/cos/sqrt/fmod/pow.
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f64, Legal);
  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Legal);

  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // If we're enabling GP optimizations, use hardware square root.
  if (!Subtarget.hasFSQRT() &&
      !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&
        Subtarget.hasFRE()))
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);

  if (!Subtarget.hasFSQRT() &&
      !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&
        Subtarget.hasFRES()))
    setOperationAction(ISD::FSQRT, MVT::f32, Expand);

  if (Subtarget.hasFCPSGN()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
  } else {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  }

  if (Subtarget.hasFPRND()) {
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FROUND, MVT::f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FROUND, MVT::f32, Legal);
  }

  // PowerPC does not have BSWAP.
  // CTPOP and CTTZ were introduced in P8 and P9, respectively.
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
  if (Subtarget.isISA3_0()) {
    setOperationAction(ISD::CTTZ, MVT::i32, Legal);
    setOperationAction(ISD::CTTZ, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i32, Expand);
    setOperationAction(ISD::CTTZ, MVT::i64, Expand);
  }

  if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  // PowerPC does not have ROTR.
  setOperationAction(ISD::ROTR, MVT::i32, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  if (!Subtarget.useCRBits()) {
    // PowerPC does not have Select.
    setOperationAction(ISD::SELECT, MVT::i32, Expand);
    setOperationAction(ISD::SELECT, MVT::i64, Expand);
    setOperationAction(ISD::SELECT, MVT::f32, Expand);
    setOperationAction(ISD::SELECT, MVT::f64, Expand);
  }

  // PowerPC wants to turn select_cc of FP into fsel when possible.
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);

  // PowerPC wants to optimize integer setcc a bit.
  if (!Subtarget.useCRBits())
    setOperationAction(ISD::SETCC, MVT::i32, Custom);

  // PowerPC does not have BRCOND, which requires SetCC.
  if (!Subtarget.useCRBits())
    setOperationAction(ISD::BRCOND, MVT::Other, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);

  // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);

  // PowerPC does not have [U|S]INT_TO_FP.
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);

  if (Subtarget.hasDirectMove() && isPPC64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Legal);
    setOperationAction(ISD::BITCAST, MVT::i32, Legal);
    setOperationAction(ISD::BITCAST, MVT::i64, Legal);
    setOperationAction(ISD::BITCAST, MVT::f64, Legal);
  } else {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    setOperationAction(ISD::BITCAST, MVT::f64, Expand);
  }

  // We cannot sextinreg(i1). Expand to shifts.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuation, user-level threading, etc. As a result, no other
  // SjLj exception interfaces are implemented; please don't build your own
  // exception handling based on them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

  // We want to legalize GlobalAddress and ConstantPool nodes into the
  // appropriate instructions to materialize the address.
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  // TRAP is legal.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // TRAMPOLINE is custom lowered.
  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);

  if (Subtarget.isSVR4ABI()) {
    if (isPPC64) {
      // VAARG always uses double-word chunks, so promote anything smaller.
      setOperationAction(ISD::VAARG, MVT::i1, Promote);
      AddPromotedToType(ISD::VAARG, MVT::i1, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::i8, Promote);
      AddPromotedToType(ISD::VAARG, MVT::i8, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::i16, Promote);
      AddPromotedToType(ISD::VAARG, MVT::i16, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::i32, Promote);
      AddPromotedToType(ISD::VAARG, MVT::i32, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::Other, Expand);
    } else {
      // VAARG is custom lowered with the 32-bit SVR4 ABI.
      setOperationAction(ISD::VAARG, MVT::Other, Custom);
      setOperationAction(ISD::VAARG, MVT::i64, Custom);
    }
  } else
    setOperationAction(ISD::VAARG, MVT::Other, Expand);

  if (Subtarget.isSVR4ABI() && !isPPC64)
    // VACOPY is custom lowered with the 32-bit SVR4 ABI.
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  else
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);

  // Use the default implementation.
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);
  setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
  setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // To handle counter-based loop conditions.
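  // (These come from the CTR-decrementing loop intrinsics.)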
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);

  // Comparisons that require checking two conditions.
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);

  if (Subtarget.has64BitSupport()) {
    // They also have instructions for converting between i64 and fp.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
    // This is just the low 32 bits of a (signed) fp->i64 conversion.
    // We cannot do this with Promote because i64 is not a legal type.
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    if (Subtarget.hasLFIWAX() || Subtarget.isPPC64())
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  } else {
    // PowerPC does not have FP_TO_UINT on 32-bit implementations.
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
  }

  // With the instructions enabled under FPCVT, we can do everything.
  if (Subtarget.hasFPCVT()) {
    if (Subtarget.has64BitSupport()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  if (Subtarget.use64BitRegs()) {
    // 64-bit PowerPC implementations can support i64 types directly.
    addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
    // BUILD_PAIR can't be handled natively, and should be expanded to shl/or.
    setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
    // 64-bit PowerPC wants to expand i128 shifts itself.
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  } else {
    // 32-bit PowerPC wants to expand i64 shifts itself.
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  }

  if (Subtarget.hasAltivec()) {
    // First set operation action for all vector types to expand. Then we
    // will selectively turn on ones that can be effectively codegen'd.
    for (MVT VT : MVT::vector_valuetypes()) {
      // add/sub are legal for all supported vector VT's.
      setOperationAction(ISD::ADD, VT, Legal);
      setOperationAction(ISD::SUB, VT, Legal);

      // Vector instructions introduced in P8.
      if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
        setOperationAction(ISD::CTPOP, VT, Legal);
        setOperationAction(ISD::CTLZ, VT, Legal);
      } else {
        setOperationAction(ISD::CTPOP, VT, Expand);
        setOperationAction(ISD::CTLZ, VT, Expand);
      }

      // Vector instructions introduced in P9.
      if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))
        setOperationAction(ISD::CTTZ, VT, Legal);
      else
        setOperationAction(ISD::CTTZ, VT, Expand);

      // We promote all shuffles to v16i8.
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
      AddPromotedToType(ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);

      // We promote all non-typed operations to v4i32.
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v4i32);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v4i32);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v4i32);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v4i32);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v4i32);
      setOperationAction(ISD::SELECT_CC, VT, Promote);
      AddPromotedToType(ISD::SELECT_CC, VT, MVT::v4i32);
      setOperationAction(ISD::STORE, VT, Promote);
      AddPromotedToType(ISD::STORE, VT, MVT::v4i32);

      // No other operations are legal.
      setOperationAction(ISD::MUL, VT, Expand);
      setOperationAction(ISD::SDIV, VT, Expand);
      setOperationAction(ISD::SREM, VT, Expand);
      setOperationAction(ISD::UDIV, VT, Expand);
      setOperationAction(ISD::UREM, VT, Expand);
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FNEG, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FABS, VT, Expand);
      setOperationAction(ISD::FFLOOR, VT, Expand);
      setOperationAction(ISD::FCEIL, VT, Expand);
      setOperationAction(ISD::FTRUNC, VT, Expand);
      setOperationAction(ISD::FRINT, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
      setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::UDIVREM, VT, Expand);
      setOperationAction(ISD::SDIVREM, VT, Expand);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::BSWAP, VT, Expand);
      setOperationAction(ISD::VSELECT, VT, Expand);
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
      setOperationAction(ISD::ROTL, VT, Expand);
      setOperationAction(ISD::ROTR, VT, Expand);

      for (MVT InnerVT : MVT::vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
    // with merges, splats, etc.
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);

    setOperationAction(ISD::AND, MVT::v4i32, Legal);
    setOperationAction(ISD::OR, MVT::v4i32, Legal);
    setOperationAction(ISD::XOR, MVT::v4i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i32, Legal);
    setOperationAction(ISD::SELECT, MVT::v4i32,
                       Subtarget.useCRBits() ? Legal : Expand);
    setOperationAction(ISD::STORE, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);

    addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);

    setOperationAction(ISD::MUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FMA, MVT::v4f32, Legal);

    if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) {
      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    }

    if (Subtarget.hasP8Altivec())
      setOperationAction(ISD::MUL, MVT::v4i32, Legal);
    else
      setOperationAction(ISD::MUL, MVT::v4i32, Custom);

    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v16i8, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);

    // Altivec does not contain unordered floating-point compare instructions.
    setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);

    if (Subtarget.hasVSX()) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
      if (Subtarget.hasP8Vector()) {
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
      }
      if (Subtarget.hasDirectMove() && isPPC64) {
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
      }
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);

      setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
      setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
      setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
      setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
      setOperationAction(ISD::FROUND, MVT::v2f64, Legal);

      setOperationAction(ISD::FROUND, MVT::v4f32, Legal);

      setOperationAction(ISD::MUL, MVT::v2f64, Legal);
      setOperationAction(ISD::FMA, MVT::v2f64, Legal);

      setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
      setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);

      setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
      setOperationAction(ISD::VSELECT, MVT::v8i16, Legal);
      setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
      setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
      setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);

      // Share the Altivec comparison restrictions.
      setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
      setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
      setCondCodeAction(ISD::SETO, MVT::v2f64, Expand);
      setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);

      setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
      setOperationAction(ISD::STORE, MVT::v2f64, Legal);

      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal);

      if (Subtarget.hasP8Vector())
        addRegisterClass(MVT::f32, &PPC::VSSRCRegClass);

      addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);

      addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass);
      addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
      addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);

      if (Subtarget.hasP8Altivec()) {
        setOperationAction(ISD::SHL, MVT::v2i64, Legal);
        setOperationAction(ISD::SRA, MVT::v2i64, Legal);
        setOperationAction(ISD::SRL, MVT::v2i64, Legal);

        // 128 bit shifts can be accomplished via 3 instructions for SHL and
        // SRL, but not for SRA because of the instructions available:
        // VS{RL} and VS{RL}O. However, due to direct move costs, it's not
        // worth doing.
        setOperationAction(ISD::SHL, MVT::v1i128, Expand);
        setOperationAction(ISD::SRL, MVT::v1i128, Expand);
        setOperationAction(ISD::SRA, MVT::v1i128, Expand);

        setOperationAction(ISD::SETCC, MVT::v2i64, Legal);
      } else {
        setOperationAction(ISD::SHL, MVT::v2i64, Expand);
        setOperationAction(ISD::SRA, MVT::v2i64, Expand);
        setOperationAction(ISD::SRL, MVT::v2i64, Expand);

        setOperationAction(ISD::SETCC, MVT::v2i64, Custom);

        // VSX v2i64 only supports non-arithmetic operations.
        setOperationAction(ISD::ADD, MVT::v2i64, Expand);
        setOperationAction(ISD::SUB, MVT::v2i64, Expand);
      }

      setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
      AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v2f64);
      setOperationAction(ISD::STORE, MVT::v2i64, Promote);
      AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v2f64);

      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);

      setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);

      // Vector operation legalization checks the result type of
      // SIGN_EXTEND_INREG, overall legalization checks the inner type.
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);

      setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
      setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
      setOperationAction(ISD::FABS, MVT::v4f32, Legal);
      setOperationAction(ISD::FABS, MVT::v2f64, Legal);

      if (Subtarget.hasDirectMove())
        setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);

      addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
    }

    if (Subtarget.hasP8Altivec()) {
      addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);
      addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass);
    }

    if (Subtarget.hasP9Vector()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

      // 128 bit shifts can be accomplished via 3 instructions for SHL and
      // SRL, but not for SRA because of the instructions available:
      // VS{RL} and VS{RL}O.
      setOperationAction(ISD::SHL, MVT::v1i128, Legal);
      setOperationAction(ISD::SRL, MVT::v1i128, Legal);
      setOperationAction(ISD::SRA, MVT::v1i128, Expand);
    }
  }

  if (Subtarget.hasQPX()) {
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FREM, MVT::v4f64, Expand);

    setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal);
    setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand);

    setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
    setOperationAction(ISD::STORE, MVT::v4f64, Custom);

    setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
    setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom);

    if (!Subtarget.useCRBits())
      setOperationAction(ISD::SELECT, MVT::v4f64, Expand);
    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Legal);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Expand);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4f64, Expand);

    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
    setOperationAction(ISD::FP_ROUND_INREG, MVT::v4f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);

    setOperationAction(ISD::FNEG, MVT::v4f64, Legal);
    setOperationAction(ISD::FABS, MVT::v4f64, Legal);
    setOperationAction(ISD::FSIN, MVT::v4f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f64, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f64, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f64, Expand);

    setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal);

    setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal);

    addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FREM, MVT::v4f32, Expand);

    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
    setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand);

    setOperationAction(ISD::LOAD, MVT::v4f32, Custom);
    setOperationAction(ISD::STORE, MVT::v4f32, Custom);

    if (!Subtarget.useCRBits())
      setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Expand);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4f32, Expand);

    setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
    setOperationAction(ISD::FABS, MVT::v4f32, Legal);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);

    setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);

    setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal);

    addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass);

    setOperationAction(ISD::AND, MVT::v4i1, Legal);
    setOperationAction(ISD::OR, MVT::v4i1, Legal);
    setOperationAction(ISD::XOR, MVT::v4i1, Legal);

    if (!Subtarget.useCRBits())
      setOperationAction(ISD::SELECT, MVT::v4i1, Expand);
    setOperationAction(ISD::VSELECT, MVT::v4i1, Legal);

    setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
    setOperationAction(ISD::STORE, MVT::v4i1, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i1, Expand);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i1, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);

    addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass);

    setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
    setOperationAction(ISD::FROUND, MVT::v4f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);

    // These need to set FE_INEXACT, and so cannot be vectorized here.
    setOperationAction(ISD::FRINT, MVT::v4f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);

    if (TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
      setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);

      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    } else {
      setOperationAction(ISD::FDIV, MVT::v4f64, Expand);
      setOperationAction(ISD::FSQRT, MVT::v4f64, Expand);

      setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
      setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    }
  }

  if (Subtarget.has64BitSupport())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);

  if (!isPPC64) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
    setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
  }

  setBooleanContents(ZeroOrOneBooleanContent);

  if (Subtarget.hasAltivec()) {
    // Altivec instructions set fields to all zeros or all ones.
    setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  }

  if (!isPPC64) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, nullptr);
    setLibcallName(RTLIB::SRL_I128, nullptr);
    setLibcallName(RTLIB::SRA_I128, nullptr);
  }

  setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  if (Subtarget.hasFPCVT())
    setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::BR_CC);
  if (Subtarget.useCRBits())
    setTargetDAGCombine(ISD::BRCOND);
  setTargetDAGCombine(ISD::BSWAP);
  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
  setTargetDAGCombine(ISD::INTRINSIC_VOID);

  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);

  if (Subtarget.useCRBits()) {
    setTargetDAGCombine(ISD::TRUNCATE);
    setTargetDAGCombine(ISD::SETCC);
    setTargetDAGCombine(ISD::SELECT_CC);
  }

  // Use reciprocal estimates.
  if (TM.Options.UnsafeFPMath) {
    setTargetDAGCombine(ISD::FDIV);
    setTargetDAGCombine(ISD::FSQRT);
  }

  // Darwin long double math library functions have $LDBL128 appended.
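  // (For example, the ppcf128 pow libcall becomes "powl$LDBL128" below.)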
  if (Subtarget.isDarwin()) {
    setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
    setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
    setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
    setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
    setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
    setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
    setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
    setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
    setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
    setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
  }

  // With 32 condition bits, we don't need to sink (and duplicate) compares
  // aggressively in CodeGenPrep.
  if (Subtarget.useCRBits()) {
    setHasMultipleConditionRegisters();
    setJumpIsExpensive();
  }

  setMinFunctionAlignment(2);
  if (Subtarget.isDarwin())
    setPrefFunctionAlignment(4);

  switch (Subtarget.getDarwinDirective()) {
  default: break;
  case PPC::DIR_970:
  case PPC::DIR_A2:
  case PPC::DIR_E500mc:
  case PPC::DIR_E5500:
  case PPC::DIR_PWR4:
  case PPC::DIR_PWR5:
  case PPC::DIR_PWR5X:
  case PPC::DIR_PWR6:
  case PPC::DIR_PWR6X:
  case PPC::DIR_PWR7:
  case PPC::DIR_PWR8:
  case PPC::DIR_PWR9:
    setPrefFunctionAlignment(4);
    setPrefLoopAlignment(4);
    break;
  }

  if (Subtarget.enableMachineScheduler())
    setSchedulingPreference(Sched::Source);
  else
    setSchedulingPreference(Sched::Hybrid);

  computeRegisterProperties(STI.getRegisterInfo());

  // The Freescale cores do better with aggressive inlining of memcpy and
  // friends. GCC uses the same threshold of 128 bytes (= 32 word stores).
  if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc ||
      Subtarget.getDarwinDirective() == PPC::DIR_E5500) {
    MaxStoresPerMemset = 32;
    MaxStoresPerMemsetOptSize = 16;
    MaxStoresPerMemcpy = 32;
    MaxStoresPerMemcpyOptSize = 8;
    MaxStoresPerMemmove = 32;
    MaxStoresPerMemmoveOptSize = 8;
  } else if (Subtarget.getDarwinDirective() == PPC::DIR_A2) {
    // The A2 also benefits from (very) aggressive inlining of memcpy and
    // friends. The overhead of the function call, even when warm, can be
    // over one hundred cycles.
    MaxStoresPerMemset = 128;
    MaxStoresPerMemcpy = 128;
    MaxStoresPerMemmove = 128;
    MaxLoadsPerMemcmp = 128;
  } else {
    MaxLoadsPerMemcmp = 8;
    MaxLoadsPerMemcmpOptSize = 4;
  }
}

/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
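/// The result is capped at MaxMaxAlign (16 when only Altivec is available,
/// 32 with QPX; see getByValTypeAlignment below).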
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
                             unsigned MaxMaxAlign) {
  if (MaxAlign == MaxMaxAlign)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
      MaxAlign = 32;
    else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      unsigned EltAlign = 0;
      getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == MaxMaxAlign)
        break;
    }
  }
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.
unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  // Darwin passes everything on a 4-byte boundary.
  if (Subtarget.isDarwin())
    return 4;

  // 16-byte and wider vectors are passed on a 16-byte boundary.
  // The rest are aligned to 8 bytes on PPC64 and 4 bytes on PPC32.
  unsigned Align = Subtarget.isPPC64() ? 8 : 4;
  if (Subtarget.hasAltivec() || Subtarget.hasQPX())
    getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16);
  return Align;
}

bool PPCTargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}

const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((PPCISD::NodeType)Opcode) {
  case PPCISD::FIRST_NUMBER: break;
  case PPCISD::FSEL: return "PPCISD::FSEL";
  case PPCISD::FCFID: return "PPCISD::FCFID";
  case PPCISD::FCFIDU: return "PPCISD::FCFIDU";
  case PPCISD::FCFIDS: return "PPCISD::FCFIDS";
  case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS";
  case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ";
  case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
  case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ";
  case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ";
  case PPCISD::FRE: return "PPCISD::FRE";
  case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
  case PPCISD::STFIWX: return "PPCISD::STFIWX";
  case PPCISD::VMADDFP: return "PPCISD::VMADDFP";
  case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
  case PPCISD::VPERM: return "PPCISD::VPERM";
  case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
  case PPCISD::XXINSERT: return "PPCISD::XXINSERT";
  case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
  case PPCISD::VECSHL: return "PPCISD::VECSHL";
  case PPCISD::CMPB: return "PPCISD::CMPB";
  case PPCISD::Hi: return "PPCISD::Hi";
  case PPCISD::Lo: return "PPCISD::Lo";
  case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
  case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
  case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
  case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
  case PPCISD::SRL: return "PPCISD::SRL";
  case PPCISD::SRA: return "PPCISD::SRA";
  case PPCISD::SHL: return "PPCISD::SHL";
  case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE";
  case PPCISD::CALL: return "PPCISD::CALL";
  case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
  case PPCISD::MTCTR: return "PPCISD::MTCTR";
  case PPCISD::BCTRL: return "PPCISD::BCTRL";
  case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC";
  case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG";
  case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE";
  case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";
  case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
  case PPCISD::MFOCRF: return "PPCISD::MFOCRF";
  case PPCISD::MFVSR: return "PPCISD::MFVSR";
  case PPCISD::MTVSRA: return "PPCISD::MTVSRA";
  case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ";
  case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP";
  case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP";
  case PPCISD::ANDIo_1_EQ_BIT: return "PPCISD::ANDIo_1_EQ_BIT";
  case PPCISD::ANDIo_1_GT_BIT: return "PPCISD::ANDIo_1_GT_BIT";
  case PPCISD::VCMP: return "PPCISD::VCMP";
  case PPCISD::VCMPo: return "PPCISD::VCMPo";
  case PPCISD::LBRX: return "PPCISD::LBRX";
  case PPCISD::STBRX: return "PPCISD::STBRX";
  case PPCISD::LFIWAX: return "PPCISD::LFIWAX";
  case PPCISD::LFIWZX: return "PPCISD::LFIWZX";
  case PPCISD::LXSIZX: return "PPCISD::LXSIZX";
  case PPCISD::STXSIX: return "PPCISD::STXSIX";
  case PPCISD::VEXTS: return "PPCISD::VEXTS";
  case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
  case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
  case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
  case PPCISD::BDNZ: return "PPCISD::BDNZ";
  case PPCISD::BDZ: return "PPCISD::BDZ";
  case PPCISD::MFFS: return "PPCISD::MFFS";
  case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
  case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
  case PPCISD::CR6SET: return "PPCISD::CR6SET";
  case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET";
  case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT";
  case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT";
  case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
  case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L";
  case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS";
  case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA";
  case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";
  case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR";
  case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR";
  case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA";
  case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L";
  case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR";
  case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR";
  case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
  case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";
  case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";
  case PPCISD::SC: return "PPCISD::SC";
  case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB";
  case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE";
  case PPCISD::RFEBB: return "PPCISD::RFEBB";
  case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";
  case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";
  case PPCISD::QVFPERM: return "PPCISD::QVFPERM";
  case PPCISD::QVGPCI: return "PPCISD::QVGPCI";
  case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI";
  case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI";
  case PPCISD::QBFLT: return "PPCISD::QBFLT";
  case PPCISD::QVLFSb: return "PPCISD::QVLFSb";
  }
  return nullptr;
}

EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
                                          EVT VT) const {
  if (!VT.isVector())
    return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;

  if (Subtarget.hasQPX())
    return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());

  return VT.changeVectorElementTypeToInteger();
}

bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
  return true;
}

//===----------------------------------------------------------------------===//
// Node matching predicates, for use by the tblgen matching code.
//===----------------------------------------------------------------------===//

/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
static bool isFloatingPointZero(SDValue Op) {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
    return CFP->getValueAPF().isZero();
  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
    // Maybe this has already been legalized into the constant pool?
    if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
      if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
        return CFP->getValueAPF().isZero();
  }
  return false;
}

/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode. Return
/// true if Op is undef or if it matches the specified value.
static bool isConstantOrUndef(int Op, int Val) {
  return Op < 0 || Op == Val;
}

/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUHUM instruction.
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                               SelectionDAG &DAG) {
  bool IsLE = DAG.getDataLayout().isLittleEndian();
  if (ShuffleKind == 0) {
    if (IsLE)
      return false;
    for (unsigned i = 0; i != 16; ++i)
      if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
        return false;
  } else if (ShuffleKind == 2) {
    if (!IsLE)
      return false;
    for (unsigned i = 0; i != 16; ++i)
      if (!isConstantOrUndef(N->getMaskElt(i), i*2))
        return false;
  } else if (ShuffleKind == 1) {
    unsigned j = IsLE ? 0 : 1;
    for (unsigned i = 0; i != 8; ++i)
      if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) ||
          !isConstantOrUndef(N->getMaskElt(i+8), i*2+j))
        return false;
  }
  return true;
}

/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUWUM instruction.
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
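/// For example, with two different inputs on a big-endian target (ShuffleKind
/// 0), the expected mask is <2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31>.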
1276 bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, 1277 SelectionDAG &DAG) { 1278 bool IsLE = DAG.getDataLayout().isLittleEndian(); 1279 if (ShuffleKind == 0) { 1280 if (IsLE) 1281 return false; 1282 for (unsigned i = 0; i != 16; i += 2) 1283 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) || 1284 !isConstantOrUndef(N->getMaskElt(i+1), i*2+3)) 1285 return false; 1286 } else if (ShuffleKind == 2) { 1287 if (!IsLE) 1288 return false; 1289 for (unsigned i = 0; i != 16; i += 2) 1290 if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || 1291 !isConstantOrUndef(N->getMaskElt(i+1), i*2+1)) 1292 return false; 1293 } else if (ShuffleKind == 1) { 1294 unsigned j = IsLE ? 0 : 2; 1295 for (unsigned i = 0; i != 8; i += 2) 1296 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || 1297 !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || 1298 !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || 1299 !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1)) 1300 return false; 1301 } 1302 return true; 1303 } 1304 1305 /// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a 1306 /// VPKUDUM instruction, AND the VPKUDUM instruction exists for the 1307 /// current subtarget. 1308 /// 1309 /// The ShuffleKind distinguishes between big-endian operations with 1310 /// two different inputs (0), either-endian operations with two identical 1311 /// inputs (1), and little-endian operations with two different inputs (2). 1312 /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). 1313 bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, 1314 SelectionDAG &DAG) { 1315 const PPCSubtarget& Subtarget = 1316 static_cast<const PPCSubtarget&>(DAG.getSubtarget()); 1317 if (!Subtarget.hasP8Vector()) 1318 return false; 1319 1320 bool IsLE = DAG.getDataLayout().isLittleEndian(); 1321 if (ShuffleKind == 0) { 1322 if (IsLE) 1323 return false; 1324 for (unsigned i = 0; i != 16; i += 4) 1325 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+4) || 1326 !isConstantOrUndef(N->getMaskElt(i+1), i*2+5) || 1327 !isConstantOrUndef(N->getMaskElt(i+2), i*2+6) || 1328 !isConstantOrUndef(N->getMaskElt(i+3), i*2+7)) 1329 return false; 1330 } else if (ShuffleKind == 2) { 1331 if (!IsLE) 1332 return false; 1333 for (unsigned i = 0; i != 16; i += 4) 1334 if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || 1335 !isConstantOrUndef(N->getMaskElt(i+1), i*2+1) || 1336 !isConstantOrUndef(N->getMaskElt(i+2), i*2+2) || 1337 !isConstantOrUndef(N->getMaskElt(i+3), i*2+3)) 1338 return false; 1339 } else if (ShuffleKind == 1) { 1340 unsigned j = IsLE ? 0 : 4; 1341 for (unsigned i = 0; i != 8; i += 4) 1342 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || 1343 !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || 1344 !isConstantOrUndef(N->getMaskElt(i+2), i*2+j+2) || 1345 !isConstantOrUndef(N->getMaskElt(i+3), i*2+j+3) || 1346 !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || 1347 !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1) || 1348 !isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) || 1349 !isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3)) 1350 return false; 1351 } 1352 return true; 1353 } 1354 1355 /// isVMerge - Common function, used to match vmrg* shuffles. 
///
static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
                     unsigned LHSStart, unsigned RHSStart) {
  if (N->getValueType(0) != MVT::v16i8)
    return false;
  assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
         "Unsupported merge size!");

  for (unsigned i = 0; i != 8/UnitSize; ++i)     // Step over units
    for (unsigned j = 0; j != UnitSize; ++j) {   // Step over bytes within unit
      if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
                             LHSStart+j+i*UnitSize) ||
          !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
                             RHSStart+j+i*UnitSize))
        return false;
    }
  return true;
}

/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
/// The ShuffleKind distinguishes between big-endian merges with two
/// different inputs (0), either-endian merges with two identical inputs (1),
/// and little-endian merges with two different inputs (2). For the latter,
/// the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                             unsigned ShuffleKind, SelectionDAG &DAG) {
  if (DAG.getDataLayout().isLittleEndian()) {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 0, 0);
    else if (ShuffleKind == 2) // swapped
      return isVMerge(N, UnitSize, 0, 16);
    else
      return false;
  } else {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 8, 8);
    else if (ShuffleKind == 0) // normal
      return isVMerge(N, UnitSize, 8, 24);
    else
      return false;
  }
}

/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
/// The ShuffleKind distinguishes between big-endian merges with two
/// different inputs (0), either-endian merges with two identical inputs (1),
/// and little-endian merges with two different inputs (2). For the latter,
/// the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                             unsigned ShuffleKind, SelectionDAG &DAG) {
  if (DAG.getDataLayout().isLittleEndian()) {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 8, 8);
    else if (ShuffleKind == 2) // swapped
      return isVMerge(N, UnitSize, 8, 24);
    else
      return false;
  } else {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 0, 0);
    else if (ShuffleKind == 0) // normal
      return isVMerge(N, UnitSize, 0, 16);
    else
      return false;
  }
}

/**
 * \brief Common function used to match vmrgew and vmrgow shuffles
 *
 * The indexOffset determines whether to look for even or odd words in
 * the shuffle mask. This is based on the endianness of the target
 * machine.
 *   - Little Endian:
 *     - Use offset of 0 to check for odd elements
 *     - Use offset of 4 to check for even elements
 *   - Big Endian:
 *     - Use offset of 0 to check for even elements
 *     - Use offset of 4 to check for odd elements
 * A detailed description of the vector element ordering for little endian and
 * big endian can be found at
 * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html
 * Targeting your applications - what little endian and big endian IBM XL C/C++
 * compiler differences mean to you
 *
 * The mask to the shuffle vector instruction specifies the indices of the
 * elements from the two input vectors to place in the result. The elements are
 * numbered in array-access order, starting with the first vector. These
 * vectors are always of type v16i8, thus each vector contains 16 elements of
 * 8 bits each. More information on the shufflevector instruction can be found
 * in the Language Reference:
 * http://llvm.org/docs/LangRef.html#shufflevector-instruction
 *
 * The RHSStartValue indicates whether the same input vectors are used (unary)
 * or two different input vectors are used, based on the following:
 *   - If the instruction uses the same vector for both inputs, the range of
 *     the indices will be 0 to 15. In this case, the RHSStart value passed
 *     should be 0.
 *   - If the instruction has two different vectors then the range of the
 *     indices will be 0 to 31. In this case, the RHSStart value passed should
 *     be 16 (indices 0-15 specify elements in the first vector while indices
 *     16 to 31 specify elements in the second vector).
 *
 * \param[in] N The shuffle vector SD Node to analyze
 * \param[in] IndexOffset Specifies whether to look for even or odd elements
 * \param[in] RHSStartValue Specifies the starting index for the right-hand
 *            input vector to the shuffle_vector instruction
 * \return true iff this shuffle vector represents an even or odd word merge
 */
static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
                     unsigned RHSStartValue) {
  if (N->getValueType(0) != MVT::v16i8)
    return false;

  for (unsigned i = 0; i < 2; ++i)
    for (unsigned j = 0; j < 4; ++j)
      if (!isConstantOrUndef(N->getMaskElt(i*4+j),
                             i*RHSStartValue+j+IndexOffset) ||
          !isConstantOrUndef(N->getMaskElt(i*4+j+8),
                             i*RHSStartValue+j+IndexOffset+8))
        return false;
  return true;
}

/**
 * \brief Determine if the specified shuffle mask is suitable for the vmrgew or
 * vmrgow instructions.
 *
 * \param[in] N The shuffle vector SD Node to analyze
 * \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
 * \param[in] ShuffleKind Identify the type of merge:
 *   - 0 = big-endian merge with two different inputs;
 *   - 1 = either-endian merge with two identical inputs;
 *   - 2 = little-endian merge with two different inputs (inputs are swapped
 *     for little-endian merges).
 * \param[in] DAG The current SelectionDAG
 * \return true iff this shuffle mask represents the requested even (vmrgew)
 *         or odd (vmrgow) word merge
 */
bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
                              unsigned ShuffleKind, SelectionDAG &DAG) {
  if (DAG.getDataLayout().isLittleEndian()) {
    unsigned indexOffset = CheckEven ?
4 : 0; 1500 if (ShuffleKind == 1) // Unary 1501 return isVMerge(N, indexOffset, 0); 1502 else if (ShuffleKind == 2) // swapped 1503 return isVMerge(N, indexOffset, 16); 1504 else 1505 return false; 1506 } 1507 else { 1508 unsigned indexOffset = CheckEven ? 0 : 4; 1509 if (ShuffleKind == 1) // Unary 1510 return isVMerge(N, indexOffset, 0); 1511 else if (ShuffleKind == 0) // Normal 1512 return isVMerge(N, indexOffset, 16); 1513 else 1514 return false; 1515 } 1516 return false; 1517 } 1518 1519 /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift 1520 /// amount, otherwise return -1. 1521 /// The ShuffleKind distinguishes between big-endian operations with two 1522 /// different inputs (0), either-endian operations with two identical inputs 1523 /// (1), and little-endian operations with two different inputs (2). For the 1524 /// latter, the input operands are swapped (see PPCInstrAltivec.td). 1525 int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, 1526 SelectionDAG &DAG) { 1527 if (N->getValueType(0) != MVT::v16i8) 1528 return -1; 1529 1530 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 1531 1532 // Find the first non-undef value in the shuffle mask. 1533 unsigned i; 1534 for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i) 1535 /*search*/; 1536 1537 if (i == 16) return -1; // all undef. 1538 1539 // Otherwise, check to see if the rest of the elements are consecutively 1540 // numbered from this value. 1541 unsigned ShiftAmt = SVOp->getMaskElt(i); 1542 if (ShiftAmt < i) return -1; 1543 1544 ShiftAmt -= i; 1545 bool isLE = DAG.getDataLayout().isLittleEndian(); 1546 1547 if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) { 1548 // Check the rest of the elements to see if they are consecutive. 1549 for (++i; i != 16; ++i) 1550 if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) 1551 return -1; 1552 } else if (ShuffleKind == 1) { 1553 // Check the rest of the elements to see if they are consecutive. 1554 for (++i; i != 16; ++i) 1555 if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) 1556 return -1; 1557 } else 1558 return -1; 1559 1560 if (isLE) 1561 ShiftAmt = 16 - ShiftAmt; 1562 1563 return ShiftAmt; 1564 } 1565 1566 /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand 1567 /// specifies a splat of a single element that is suitable for input to 1568 /// VSPLTB/VSPLTH/VSPLTW. 1569 bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { 1570 assert(N->getValueType(0) == MVT::v16i8 && 1571 (EltSize == 1 || EltSize == 2 || EltSize == 4)); 1572 1573 // The consecutive indices need to specify an element, not part of two 1574 // different elements. So abandon ship early if this isn't the case. 1575 if (N->getMaskElt(0) % EltSize != 0) 1576 return false; 1577 1578 // This is a splat operation if each element of the permute is the same, and 1579 // if the value doesn't reference the second vector. 1580 unsigned ElementBase = N->getMaskElt(0); 1581 1582 // FIXME: Handle UNDEF elements too! 1583 if (ElementBase >= 16) 1584 return false; 1585 1586 // Check that the indices are consecutive, in the case of a multi-byte element 1587 // splatted with a v16i8 mask. 
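// For example, a splat of word element 1 (EltSize == 4) corresponds to the
// mask <4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7> (undef elements are also allowed).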
1588 for (unsigned i = 1; i != EltSize; ++i) 1589 if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase)) 1590 return false; 1591 1592 for (unsigned i = EltSize, e = 16; i != e; i += EltSize) { 1593 if (N->getMaskElt(i) < 0) continue; 1594 for (unsigned j = 0; j != EltSize; ++j) 1595 if (N->getMaskElt(i+j) != N->getMaskElt(j)) 1596 return false; 1597 } 1598 return true; 1599 } 1600 1601 // Check that the mask is shuffling N byte elements. 1602 static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width) { 1603 assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) && 1604 "Unexpected element width."); 1605 1606 unsigned NumOfElem = 16 / Width; 1607 unsigned MaskVal[16]; // Width is never greater than 16 1608 for (unsigned i = 0; i < NumOfElem; ++i) { 1609 MaskVal[0] = N->getMaskElt(i * Width); 1610 if (MaskVal[0] % Width) { 1611 return false; 1612 } 1613 1614 for (unsigned int j = 1; j < Width; ++j) { 1615 MaskVal[j] = N->getMaskElt(i * Width + j); 1616 if (MaskVal[j] != MaskVal[j-1] + 1) { 1617 return false; 1618 } 1619 } 1620 } 1621 1622 return true; 1623 } 1624 1625 bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, 1626 unsigned &InsertAtByte, bool &Swap, bool IsLE) { 1627 if (!isNByteElemShuffleMask(N, 4)) 1628 return false; 1629 1630 // Now we look at mask elements 0,4,8,12 1631 unsigned M0 = N->getMaskElt(0) / 4; 1632 unsigned M1 = N->getMaskElt(4) / 4; 1633 unsigned M2 = N->getMaskElt(8) / 4; 1634 unsigned M3 = N->getMaskElt(12) / 4; 1635 unsigned LittleEndianShifts[] = { 2, 1, 0, 3 }; 1636 unsigned BigEndianShifts[] = { 3, 0, 1, 2 }; 1637 1638 // Below, let H and L be arbitrary elements of the shuffle mask 1639 // where H is in the range [4,7] and L is in the range [0,3]. 1640 // H, 1, 2, 3 or L, 5, 6, 7 1641 if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) || 1642 (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) { 1643 ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3]; 1644 InsertAtByte = IsLE ? 12 : 0; 1645 Swap = M0 < 4; 1646 return true; 1647 } 1648 // 0, H, 2, 3 or 4, L, 6, 7 1649 if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) || 1650 (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) { 1651 ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3]; 1652 InsertAtByte = IsLE ? 8 : 4; 1653 Swap = M1 < 4; 1654 return true; 1655 } 1656 // 0, 1, H, 3 or 4, 5, L, 7 1657 if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) || 1658 (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) { 1659 ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3]; 1660 InsertAtByte = IsLE ? 4 : 8; 1661 Swap = M2 < 4; 1662 return true; 1663 } 1664 // 0, 1, 2, H or 4, 5, 6, L 1665 if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) || 1666 (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) { 1667 ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3]; 1668 InsertAtByte = IsLE ? 0 : 12; 1669 Swap = M3 < 4; 1670 return true; 1671 } 1672 1673 // If both vector operands for the shuffle are the same vector, the mask will 1674 // contain only elements from the first one and the second one will be undef. 1675 if (N->getOperand(1).isUndef()) { 1676 ShiftElts = 0; 1677 Swap = true; 1678 unsigned XXINSERTWSrcElem = IsLE ? 2 : 1; 1679 if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) { 1680 InsertAtByte = IsLE ? 12 : 0; 1681 return true; 1682 } 1683 if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) { 1684 InsertAtByte = IsLE ? 
8 : 4; 1685 return true; 1686 } 1687 if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) { 1688 InsertAtByte = IsLE ? 4 : 8; 1689 return true; 1690 } 1691 if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) { 1692 InsertAtByte = IsLE ? 0 : 12; 1693 return true; 1694 } 1695 } 1696 1697 return false; 1698 } 1699 1700 bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, 1701 bool &Swap, bool IsLE) { 1702 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); 1703 // Ensure each byte index of the word is consecutive. 1704 if (!isNByteElemShuffleMask(N, 4)) 1705 return false; 1706 1707 // Now we look at mask elements 0,4,8,12, which are the beginning of words. 1708 unsigned M0 = N->getMaskElt(0) / 4; 1709 unsigned M1 = N->getMaskElt(4) / 4; 1710 unsigned M2 = N->getMaskElt(8) / 4; 1711 unsigned M3 = N->getMaskElt(12) / 4; 1712 1713 // If both vector operands for the shuffle are the same vector, the mask will 1714 // contain only elements from the first one and the second one will be undef. 1715 if (N->getOperand(1).isUndef()) { 1716 assert(M0 < 4 && "Indexing into an undef vector?"); 1717 if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4) 1718 return false; 1719 1720 ShiftElts = IsLE ? (4 - M0) % 4 : M0; 1721 Swap = false; 1722 return true; 1723 } 1724 1725 // Ensure each word index of the ShuffleVector Mask is consecutive. 1726 if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8) 1727 return false; 1728 1729 if (IsLE) { 1730 if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) { 1731 // Input vectors don't need to be swapped if the leading element 1732 // of the result is one of the 3 left elements of the second vector 1733 // (or if there is no shift to be done at all). 1734 Swap = false; 1735 ShiftElts = (8 - M0) % 8; 1736 } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) { 1737 // Input vectors need to be swapped if the leading element 1738 // of the result is one of the 3 left elements of the first vector 1739 // (or if we're shifting by 4 - thereby simply swapping the vectors). 1740 Swap = true; 1741 ShiftElts = (4 - M0) % 4; 1742 } 1743 1744 return true; 1745 } else { // BE 1746 if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) { 1747 // Input vectors don't need to be swapped if the leading element 1748 // of the result is one of the 4 elements of the first vector. 1749 Swap = false; 1750 ShiftElts = M0; 1751 } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) { 1752 // Input vectors need to be swapped if the leading element 1753 // of the result is one of the 4 elements of the right vector. 1754 Swap = true; 1755 ShiftElts = M0 - 4; 1756 } 1757 1758 return true; 1759 } 1760 } 1761 1762 /// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap 1763 /// if the inputs to the instruction should be swapped and set \p DM to the 1764 /// value for the immediate. 1765 /// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI 1766 /// AND element 0 of the result comes from the first input (LE) or second input 1767 /// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered. 1768 /// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle 1769 /// mask. 1770 bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM, 1771 bool &Swap, bool IsLE) { 1772 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); 1773 1774 // Ensure each byte index of the double word is consecutive. 
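// For example, <8,9,10,11,12,13,14,15, 0,1,2,3,4,5,6,7> swaps the two double
// words of a single input; it passes this check and yields M0 == 1, M1 == 0.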
1775 if (!isNByteElemShuffleMask(N, 8)) 1776 return false; 1777 1778 unsigned M0 = N->getMaskElt(0) / 8; 1779 unsigned M1 = N->getMaskElt(8) / 8; 1780 assert(((M0 | M1) < 4) && "A mask element out of bounds?"); 1781 1782 // If both vector operands for the shuffle are the same vector, the mask will 1783 // contain only elements from the first one and the second one will be undef. 1784 if (N->getOperand(1).isUndef()) { 1785 if ((M0 | M1) < 2) { 1786 DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1); 1787 Swap = false; 1788 return true; 1789 } else 1790 return false; 1791 } 1792 1793 if (IsLE) { 1794 if (M0 > 1 && M1 < 2) { 1795 Swap = false; 1796 } else if (M0 < 2 && M1 > 1) { 1797 M0 = (M0 + 2) % 4; 1798 M1 = (M1 + 2) % 4; 1799 Swap = true; 1800 } else 1801 return false; 1802 1803 // Note: if control flow comes here that means Swap is already set above 1804 DM = (((~M1) & 1) << 1) + ((~M0) & 1); 1805 return true; 1806 } else { // BE 1807 if (M0 < 2 && M1 > 1) { 1808 Swap = false; 1809 } else if (M0 > 1 && M1 < 2) { 1810 M0 = (M0 + 2) % 4; 1811 M1 = (M1 + 2) % 4; 1812 Swap = true; 1813 } else 1814 return false; 1815 1816 // Note: if control flow comes here that means Swap is already set above 1817 DM = (M0 << 1) + (M1 & 1); 1818 return true; 1819 } 1820 } 1821 1822 1823 /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the 1824 /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. 1825 unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, 1826 SelectionDAG &DAG) { 1827 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 1828 assert(isSplatShuffleMask(SVOp, EltSize)); 1829 if (DAG.getDataLayout().isLittleEndian()) 1830 return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); 1831 else 1832 return SVOp->getMaskElt(0) / EltSize; 1833 } 1834 1835 /// get_VSPLTI_elt - If this is a build_vector of constants which can be formed 1836 /// by using a vspltis[bhw] instruction of the specified element size, return 1837 /// the constant being splatted. The ByteSize field indicates the number of 1838 /// bytes of each element [124] -> [bhw]. 1839 SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { 1840 SDValue OpVal(nullptr, 0); 1841 1842 // If ByteSize of the splat is bigger than the element size of the 1843 // build_vector, then we have a case where we are checking for a splat where 1844 // multiple elements of the buildvector are folded together into a single 1845 // logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8). 1846 unsigned EltSize = 16/N->getNumOperands(); 1847 if (EltSize < ByteSize) { 1848 unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval. 1849 SDValue UniquedVals[4]; 1850 assert(Multiple > 1 && Multiple <= 4 && "How can this happen?"); 1851 1852 // See if all of the elements in the buildvector agree across. 1853 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 1854 if (N->getOperand(i).isUndef()) continue; 1855 // If the element isn't a constant, bail fully out. 1856 if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue(); 1857 1858 if (!UniquedVals[i&(Multiple-1)].getNode()) 1859 UniquedVals[i&(Multiple-1)] = N->getOperand(i); 1860 else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i)) 1861 return SDValue(); // no match. 1862 } 1863 1864 // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains 1865 // either constant or undef values that are identical for each chunk. See 1866 // if these chunks can form into a larger vspltis*. 
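// For example, a v8i16 build_vector of <0,5, 0,5, 0,5, 0,5> examined with
// ByteSize == 4 gives Multiple == 2 and UniquedVals == {0, 5}; the leading
// entry is zero and 5 fits the 5-bit immediate, so the splat constant 5 is
// returned (i.e. vspltisw(5)).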
1867 1868 // Check to see if all of the leading entries are either 0 or -1. If 1869 // neither, then this won't fit into the immediate field. 1870 bool LeadingZero = true; 1871 bool LeadingOnes = true; 1872 for (unsigned i = 0; i != Multiple-1; ++i) { 1873 if (!UniquedVals[i].getNode()) continue; // Must have been undefs. 1874 1875 LeadingZero &= isNullConstant(UniquedVals[i]); 1876 LeadingOnes &= isAllOnesConstant(UniquedVals[i]); 1877 } 1878 // Finally, check the least significant entry. 1879 if (LeadingZero) { 1880 if (!UniquedVals[Multiple-1].getNode()) 1881 return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef 1882 int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue(); 1883 if (Val < 16) // 0,0,0,4 -> vspltisw(4) 1884 return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32); 1885 } 1886 if (LeadingOnes) { 1887 if (!UniquedVals[Multiple-1].getNode()) 1888 return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef 1889 int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue(); 1890 if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2) 1891 return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32); 1892 } 1893 1894 return SDValue(); 1895 } 1896 1897 // Check to see if this buildvec has a single non-undef value in its elements. 1898 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 1899 if (N->getOperand(i).isUndef()) continue; 1900 if (!OpVal.getNode()) 1901 OpVal = N->getOperand(i); 1902 else if (OpVal != N->getOperand(i)) 1903 return SDValue(); 1904 } 1905 1906 if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def. 1907 1908 unsigned ValSizeInBytes = EltSize; 1909 uint64_t Value = 0; 1910 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) { 1911 Value = CN->getZExtValue(); 1912 } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) { 1913 assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!"); 1914 Value = FloatToBits(CN->getValueAPF().convertToFloat()); 1915 } 1916 1917 // If the splat value is larger than the element value, then we can never do 1918 // this splat. The only case that we could fit the replicated bits into our 1919 // immediate field for would be zero, and we prefer to use vxor for it. 1920 if (ValSizeInBytes < ByteSize) return SDValue(); 1921 1922 // If the element value is larger than the splat value, check if it consists 1923 // of a repeated bit pattern of size ByteSize. 1924 if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8)) 1925 return SDValue(); 1926 1927 // Properly sign extend the value. 1928 int MaskVal = SignExtend32(Value, ByteSize * 8); 1929 1930 // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros. 1931 if (MaskVal == 0) return SDValue(); 1932 1933 // Finally, if this value fits in a 5 bit sext field, return it 1934 if (SignExtend32<5>(MaskVal) == MaskVal) 1935 return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32); 1936 return SDValue(); 1937 } 1938 1939 /// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift 1940 /// amount, otherwise return -1. 1941 int PPC::isQVALIGNIShuffleMask(SDNode *N) { 1942 EVT VT = N->getValueType(0); 1943 if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1) 1944 return -1; 1945 1946 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 1947 1948 // Find the first non-undef value in the shuffle mask. 1949 unsigned i; 1950 for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i) 1951 /*search*/; 1952 1953 if (i == 4) return -1; // all undef. 
1954 1955 // Otherwise, check to see if the rest of the elements are consecutively 1956 // numbered from this value. 1957 unsigned ShiftAmt = SVOp->getMaskElt(i); 1958 if (ShiftAmt < i) return -1; 1959 ShiftAmt -= i; 1960 1961 // Check the rest of the elements to see if they are consecutive. 1962 for (++i; i != 4; ++i) 1963 if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) 1964 return -1; 1965 1966 return ShiftAmt; 1967 } 1968 1969 //===----------------------------------------------------------------------===// 1970 // Addressing Mode Selection 1971 //===----------------------------------------------------------------------===// 1972 1973 /// isIntS16Immediate - This method tests to see if the node is either a 32-bit 1974 /// or 64-bit immediate, and if the value can be accurately represented as a 1975 /// sign extension from a 16-bit value. If so, this returns true and the 1976 /// immediate. 1977 static bool isIntS16Immediate(SDNode *N, short &Imm) { 1978 if (!isa<ConstantSDNode>(N)) 1979 return false; 1980 1981 Imm = (short)cast<ConstantSDNode>(N)->getZExtValue(); 1982 if (N->getValueType(0) == MVT::i32) 1983 return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue(); 1984 else 1985 return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); 1986 } 1987 static bool isIntS16Immediate(SDValue Op, short &Imm) { 1988 return isIntS16Immediate(Op.getNode(), Imm); 1989 } 1990 1991 /// SelectAddressRegReg - Given the specified addressed, check to see if it 1992 /// can be represented as an indexed [r+r] operation. Returns false if it 1993 /// can be more efficiently represented with [r+imm]. 1994 bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, 1995 SDValue &Index, 1996 SelectionDAG &DAG) const { 1997 short imm = 0; 1998 if (N.getOpcode() == ISD::ADD) { 1999 if (isIntS16Immediate(N.getOperand(1), imm)) 2000 return false; // r+i 2001 if (N.getOperand(1).getOpcode() == PPCISD::Lo) 2002 return false; // r+i 2003 2004 Base = N.getOperand(0); 2005 Index = N.getOperand(1); 2006 return true; 2007 } else if (N.getOpcode() == ISD::OR) { 2008 if (isIntS16Immediate(N.getOperand(1), imm)) 2009 return false; // r+i can fold it if we can. 2010 2011 // If this is an or of disjoint bitfields, we can codegen this as an add 2012 // (for better address arithmetic) if the LHS and RHS of the OR are provably 2013 // disjoint. 2014 KnownBits LHSKnown, RHSKnown; 2015 DAG.computeKnownBits(N.getOperand(0), LHSKnown); 2016 2017 if (LHSKnown.Zero.getBoolValue()) { 2018 DAG.computeKnownBits(N.getOperand(1), RHSKnown); 2019 // If all of the bits are known zero on the LHS or RHS, the add won't 2020 // carry. 2021 if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) { 2022 Base = N.getOperand(0); 2023 Index = N.getOperand(1); 2024 return true; 2025 } 2026 } 2027 } 2028 2029 return false; 2030 } 2031 2032 // If we happen to be doing an i64 load or store into a stack slot that has 2033 // less than a 4-byte alignment, then the frame-index elimination may need to 2034 // use an indexed load or store instruction (because the offset may not be a 2035 // multiple of 4). The extra register needed to hold the offset comes from the 2036 // register scavenger, and it is possible that the scavenger will need to use 2037 // an emergency spill slot. As a result, we need to make sure that a spill slot 2038 // is allocated when doing an i64 load/store into a less-than-4-byte-aligned 2039 // stack slot. 
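// fixupFuncForFI records this by setting HasNonRISpills on the function info
// when an i64 access targets a (non-argument) frame index whose alignment is
// below 4.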
2040 static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { 2041 // FIXME: This does not handle the LWA case. 2042 if (VT != MVT::i64) 2043 return; 2044 2045 // NOTE: We'll exclude negative FIs here, which come from argument 2046 // lowering, because there are no known test cases triggering this problem 2047 // using packed structures (or similar). We can remove this exclusion if 2048 // we find such a test case. The reason why this is so test-case driven is 2049 // because this entire 'fixup' is only to prevent crashes (from the 2050 // register scavenger) on not-really-valid inputs. For example, if we have: 2051 // %a = alloca i1 2052 // %b = bitcast i1* %a to i64* 2053 // store i64* a, i64 b 2054 // then the store should really be marked as 'align 1', but is not. If it 2055 // were marked as 'align 1' then the indexed form would have been 2056 // instruction-selected initially, and the problem this 'fixup' is preventing 2057 // won't happen regardless. 2058 if (FrameIdx < 0) 2059 return; 2060 2061 MachineFunction &MF = DAG.getMachineFunction(); 2062 MachineFrameInfo &MFI = MF.getFrameInfo(); 2063 2064 unsigned Align = MFI.getObjectAlignment(FrameIdx); 2065 if (Align >= 4) 2066 return; 2067 2068 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 2069 FuncInfo->setHasNonRISpills(); 2070 } 2071 2072 /// Returns true if the address N can be represented by a base register plus 2073 /// a signed 16-bit displacement [r+imm], and if it is not better 2074 /// represented as reg+reg. If Aligned is true, only accept displacements 2075 /// suitable for STD and friends, i.e. multiples of 4. 2076 bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, 2077 SDValue &Base, 2078 SelectionDAG &DAG, 2079 bool Aligned) const { 2080 // FIXME dl should come from parent load or store, not from address 2081 SDLoc dl(N); 2082 // If this can be more profitably realized as r+r, fail. 2083 if (SelectAddressRegReg(N, Disp, Base, DAG)) 2084 return false; 2085 2086 if (N.getOpcode() == ISD::ADD) { 2087 short imm = 0; 2088 if (isIntS16Immediate(N.getOperand(1), imm) && 2089 (!Aligned || (imm & 3) == 0)) { 2090 Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); 2091 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { 2092 Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); 2093 fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); 2094 } else { 2095 Base = N.getOperand(0); 2096 } 2097 return true; // [r+i] 2098 } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) { 2099 // Match LOAD (ADD (X, Lo(G))). 2100 assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue() 2101 && "Cannot handle constant offsets yet!"); 2102 Disp = N.getOperand(1).getOperand(0); // The global address. 2103 assert(Disp.getOpcode() == ISD::TargetGlobalAddress || 2104 Disp.getOpcode() == ISD::TargetGlobalTLSAddress || 2105 Disp.getOpcode() == ISD::TargetConstantPool || 2106 Disp.getOpcode() == ISD::TargetJumpTable); 2107 Base = N.getOperand(0); 2108 return true; // [&g+r] 2109 } 2110 } else if (N.getOpcode() == ISD::OR) { 2111 short imm = 0; 2112 if (isIntS16Immediate(N.getOperand(1), imm) && 2113 (!Aligned || (imm & 3) == 0)) { 2114 // If this is an or of disjoint bitfields, we can codegen this as an add 2115 // (for better address arithmetic) if the LHS and RHS of the OR are 2116 // provably disjoint. 
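// For example, (or %p, 12) where %p is known to be 16-byte aligned behaves
// exactly like (add %p, 12), so the 12 can be folded into the displacement.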
2117 KnownBits LHSKnown; 2118 DAG.computeKnownBits(N.getOperand(0), LHSKnown); 2119 2120 if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { 2121 // If all of the bits are known zero on the LHS or RHS, the add won't 2122 // carry. 2123 if (FrameIndexSDNode *FI = 2124 dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { 2125 Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); 2126 fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); 2127 } else { 2128 Base = N.getOperand(0); 2129 } 2130 Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); 2131 return true; 2132 } 2133 } 2134 } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { 2135 // Loading from a constant address. 2136 2137 // If this address fits entirely in a 16-bit sext immediate field, codegen 2138 // this as "d, 0" 2139 short Imm; 2140 if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { 2141 Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); 2142 Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, 2143 CN->getValueType(0)); 2144 return true; 2145 } 2146 2147 // Handle 32-bit sext immediates with LIS + addr mode. 2148 if ((CN->getValueType(0) == MVT::i32 || 2149 (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && 2150 (!Aligned || (CN->getZExtValue() & 3) == 0)) { 2151 int Addr = (int)CN->getZExtValue(); 2152 2153 // Otherwise, break this down into an LIS + disp. 2154 Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32); 2155 2156 Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl, 2157 MVT::i32); 2158 unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8; 2159 Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0); 2160 return true; 2161 } 2162 } 2163 2164 Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout())); 2165 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) { 2166 Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); 2167 fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); 2168 } else 2169 Base = N; 2170 return true; // [r+0] 2171 } 2172 2173 /// SelectAddressRegRegOnly - Given the specified addressed, force it to be 2174 /// represented as an indexed [r+r] operation. 2175 bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, 2176 SDValue &Index, 2177 SelectionDAG &DAG) const { 2178 // Check to see if we can easily represent this as an [r+r] address. This 2179 // will fail if it thinks that the address is more profitably represented as 2180 // reg+imm, e.g. where imm = 0. 2181 if (SelectAddressRegReg(N, Base, Index, DAG)) 2182 return true; 2183 2184 // If the operand is an addition, always emit this as [r+r], since this is 2185 // better (for code size, and execution, as the memop does the add for free) 2186 // than emitting an explicit add. 2187 if (N.getOpcode() == ISD::ADD) { 2188 Base = N.getOperand(0); 2189 Index = N.getOperand(1); 2190 return true; 2191 } 2192 2193 // Otherwise, do it the hard way, using R0 as the base register. 2194 Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, 2195 N.getValueType()); 2196 Index = N; 2197 return true; 2198 } 2199 2200 /// getPreIndexedAddressParts - returns true by value, base pointer and 2201 /// offset pointer and addressing mode by reference if the node's address 2202 /// can be legally represented as pre-indexed load / store address. 
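/// For example, a load whose address is (add %ptr, 16) can be selected as a
/// pre-increment load (e.g. lwzu), which produces both the loaded value and
/// the updated pointer %ptr + 16.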
2203 bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 2204 SDValue &Offset, 2205 ISD::MemIndexedMode &AM, 2206 SelectionDAG &DAG) const { 2207 if (DisablePPCPreinc) return false; 2208 2209 bool isLoad = true; 2210 SDValue Ptr; 2211 EVT VT; 2212 unsigned Alignment; 2213 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 2214 Ptr = LD->getBasePtr(); 2215 VT = LD->getMemoryVT(); 2216 Alignment = LD->getAlignment(); 2217 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 2218 Ptr = ST->getBasePtr(); 2219 VT = ST->getMemoryVT(); 2220 Alignment = ST->getAlignment(); 2221 isLoad = false; 2222 } else 2223 return false; 2224 2225 // PowerPC doesn't have preinc load/store instructions for vectors (except 2226 // for QPX, which does have preinc r+r forms). 2227 if (VT.isVector()) { 2228 if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) { 2229 return false; 2230 } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) { 2231 AM = ISD::PRE_INC; 2232 return true; 2233 } 2234 } 2235 2236 if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) { 2237 // Common code will reject creating a pre-inc form if the base pointer 2238 // is a frame index, or if N is a store and the base pointer is either 2239 // the same as or a predecessor of the value being stored. Check for 2240 // those situations here, and try with swapped Base/Offset instead. 2241 bool Swap = false; 2242 2243 if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base)) 2244 Swap = true; 2245 else if (!isLoad) { 2246 SDValue Val = cast<StoreSDNode>(N)->getValue(); 2247 if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode())) 2248 Swap = true; 2249 } 2250 2251 if (Swap) 2252 std::swap(Base, Offset); 2253 2254 AM = ISD::PRE_INC; 2255 return true; 2256 } 2257 2258 // LDU/STU can only handle immediates that are a multiple of 4. 2259 if (VT != MVT::i64) { 2260 if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false)) 2261 return false; 2262 } else { 2263 // LDU/STU need an address with at least 4-byte alignment. 2264 if (Alignment < 4) 2265 return false; 2266 2267 if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true)) 2268 return false; 2269 } 2270 2271 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 2272 // PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of 2273 // sext i32 to i64 when addr mode is r+i. 2274 if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 && 2275 LD->getExtensionType() == ISD::SEXTLOAD && 2276 isa<ConstantSDNode>(Offset)) 2277 return false; 2278 } 2279 2280 AM = ISD::PRE_INC; 2281 return true; 2282 } 2283 2284 //===----------------------------------------------------------------------===// 2285 // LowerOperation implementation 2286 //===----------------------------------------------------------------------===// 2287 2288 /// Return true if we should reference labels using a PICBase, set the HiOpFlags 2289 /// and LoOpFlags to the target MO flags. 2290 static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget, 2291 unsigned &HiOpFlags, unsigned &LoOpFlags, 2292 const GlobalValue *GV = nullptr) { 2293 HiOpFlags = PPCII::MO_HA; 2294 LoOpFlags = PPCII::MO_LO; 2295 2296 // Don't use the pic base if not in PIC relocation model. 2297 if (IsPIC) { 2298 HiOpFlags |= PPCII::MO_PIC_FLAG; 2299 LoOpFlags |= PPCII::MO_PIC_FLAG; 2300 } 2301 2302 // If this is a reference to a global value that requires a non-lazy-ptr, make 2303 // sure that instruction lowering adds it. 
2304 if (GV && Subtarget.hasLazyResolverStub(GV)) { 2305 HiOpFlags |= PPCII::MO_NLP_FLAG; 2306 LoOpFlags |= PPCII::MO_NLP_FLAG; 2307 2308 if (GV->hasHiddenVisibility()) { 2309 HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG; 2310 LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG; 2311 } 2312 } 2313 } 2314 2315 static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC, 2316 SelectionDAG &DAG) { 2317 SDLoc DL(HiPart); 2318 EVT PtrVT = HiPart.getValueType(); 2319 SDValue Zero = DAG.getConstant(0, DL, PtrVT); 2320 2321 SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero); 2322 SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero); 2323 2324 // With PIC, the first instruction is actually "GR+hi(&G)". 2325 if (isPIC) 2326 Hi = DAG.getNode(ISD::ADD, DL, PtrVT, 2327 DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi); 2328 2329 // Generate non-pic code that has direct accesses to the constant pool. 2330 // The address of the global is just (hi(&g)+lo(&g)). 2331 return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo); 2332 } 2333 2334 static void setUsesTOCBasePtr(MachineFunction &MF) { 2335 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 2336 FuncInfo->setUsesTOCBasePtr(); 2337 } 2338 2339 static void setUsesTOCBasePtr(SelectionDAG &DAG) { 2340 setUsesTOCBasePtr(DAG.getMachineFunction()); 2341 } 2342 2343 static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit, 2344 SDValue GA) { 2345 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 2346 SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) : 2347 DAG.getNode(PPCISD::GlobalBaseReg, dl, VT); 2348 2349 SDValue Ops[] = { GA, Reg }; 2350 return DAG.getMemIntrinsicNode( 2351 PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, 2352 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, false, true, 2353 false, 0); 2354 } 2355 2356 SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, 2357 SelectionDAG &DAG) const { 2358 EVT PtrVT = Op.getValueType(); 2359 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 2360 const Constant *C = CP->getConstVal(); 2361 2362 // 64-bit SVR4 ABI code is always position-independent. 2363 // The actual address of the GlobalValue is stored in the TOC. 2364 if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { 2365 setUsesTOCBasePtr(DAG); 2366 SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); 2367 return getTOCEntry(DAG, SDLoc(CP), true, GA); 2368 } 2369 2370 unsigned MOHiFlag, MOLoFlag; 2371 bool IsPIC = isPositionIndependent(); 2372 getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); 2373 2374 if (IsPIC && Subtarget.isSVR4ABI()) { 2375 SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 2376 PPCII::MO_PIC_FLAG); 2377 return getTOCEntry(DAG, SDLoc(CP), false, GA); 2378 } 2379 2380 SDValue CPIHi = 2381 DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag); 2382 SDValue CPILo = 2383 DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag); 2384 return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG); 2385 } 2386 2387 // For 64-bit PowerPC, prefer the more compact relative encodings. 2388 // This trades 32 bits per jump table entry for one or two instructions 2389 // on the jump site. 
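// With EK_LabelDifference32 each entry is a 32-bit label difference (target
// block minus the jump-table base) rather than a full 64-bit pointer.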
2390 unsigned PPCTargetLowering::getJumpTableEncoding() const { 2391 if (isJumpTableRelative()) 2392 return MachineJumpTableInfo::EK_LabelDifference32; 2393 2394 return TargetLowering::getJumpTableEncoding(); 2395 } 2396 2397 bool PPCTargetLowering::isJumpTableRelative() const { 2398 if (Subtarget.isPPC64()) 2399 return true; 2400 return TargetLowering::isJumpTableRelative(); 2401 } 2402 2403 SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table, 2404 SelectionDAG &DAG) const { 2405 if (!Subtarget.isPPC64()) 2406 return TargetLowering::getPICJumpTableRelocBase(Table, DAG); 2407 2408 switch (getTargetMachine().getCodeModel()) { 2409 case CodeModel::Default: 2410 case CodeModel::Small: 2411 case CodeModel::Medium: 2412 return TargetLowering::getPICJumpTableRelocBase(Table, DAG); 2413 default: 2414 return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(), 2415 getPointerTy(DAG.getDataLayout())); 2416 } 2417 } 2418 2419 const MCExpr * 2420 PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF, 2421 unsigned JTI, 2422 MCContext &Ctx) const { 2423 if (!Subtarget.isPPC64()) 2424 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 2425 2426 switch (getTargetMachine().getCodeModel()) { 2427 case CodeModel::Default: 2428 case CodeModel::Small: 2429 case CodeModel::Medium: 2430 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 2431 default: 2432 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); 2433 } 2434 } 2435 2436 SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 2437 EVT PtrVT = Op.getValueType(); 2438 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 2439 2440 // 64-bit SVR4 ABI code is always position-independent. 2441 // The actual address of the GlobalValue is stored in the TOC. 2442 if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { 2443 setUsesTOCBasePtr(DAG); 2444 SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); 2445 return getTOCEntry(DAG, SDLoc(JT), true, GA); 2446 } 2447 2448 unsigned MOHiFlag, MOLoFlag; 2449 bool IsPIC = isPositionIndependent(); 2450 getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); 2451 2452 if (IsPIC && Subtarget.isSVR4ABI()) { 2453 SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 2454 PPCII::MO_PIC_FLAG); 2455 return getTOCEntry(DAG, SDLoc(GA), false, GA); 2456 } 2457 2458 SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag); 2459 SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag); 2460 return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG); 2461 } 2462 2463 SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, 2464 SelectionDAG &DAG) const { 2465 EVT PtrVT = Op.getValueType(); 2466 BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op); 2467 const BlockAddress *BA = BASDN->getBlockAddress(); 2468 2469 // 64-bit SVR4 ABI code is always position-independent. 2470 // The actual BlockAddress is stored in the TOC. 
2471 if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { 2472 setUsesTOCBasePtr(DAG); 2473 SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()); 2474 return getTOCEntry(DAG, SDLoc(BASDN), true, GA); 2475 } 2476 2477 unsigned MOHiFlag, MOLoFlag; 2478 bool IsPIC = isPositionIndependent(); 2479 getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); 2480 SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag); 2481 SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag); 2482 return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG); 2483 } 2484 2485 SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, 2486 SelectionDAG &DAG) const { 2487 // FIXME: TLS addresses currently use medium model code sequences, 2488 // which is the most useful form. Eventually support for small and 2489 // large models could be added if users need it, at the cost of 2490 // additional complexity. 2491 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 2492 if (DAG.getTarget().Options.EmulatedTLS) 2493 return LowerToTLSEmulatedModel(GA, DAG); 2494 2495 SDLoc dl(GA); 2496 const GlobalValue *GV = GA->getGlobal(); 2497 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2498 bool is64bit = Subtarget.isPPC64(); 2499 const Module *M = DAG.getMachineFunction().getFunction()->getParent(); 2500 PICLevel::Level picLevel = M->getPICLevel(); 2501 2502 TLSModel::Model Model = getTargetMachine().getTLSModel(GV); 2503 2504 if (Model == TLSModel::LocalExec) { 2505 SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 2506 PPCII::MO_TPREL_HA); 2507 SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 2508 PPCII::MO_TPREL_LO); 2509 SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, 2510 is64bit ? MVT::i64 : MVT::i32); 2511 SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); 2512 return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); 2513 } 2514 2515 if (Model == TLSModel::InitialExec) { 2516 SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); 2517 SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 2518 PPCII::MO_TLS); 2519 SDValue GOTPtr; 2520 if (is64bit) { 2521 setUsesTOCBasePtr(DAG); 2522 SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); 2523 GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, 2524 PtrVT, GOTReg, TGA); 2525 } else 2526 GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT); 2527 SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, 2528 PtrVT, TGA, GOTPtr); 2529 return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS); 2530 } 2531 2532 if (Model == TLSModel::GeneralDynamic) { 2533 SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); 2534 SDValue GOTPtr; 2535 if (is64bit) { 2536 setUsesTOCBasePtr(DAG); 2537 SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); 2538 GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, 2539 GOTReg, TGA); 2540 } else { 2541 if (picLevel == PICLevel::SmallPIC) 2542 GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); 2543 else 2544 GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); 2545 } 2546 return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT, 2547 GOTPtr, TGA, TGA); 2548 } 2549 2550 if (Model == TLSModel::LocalDynamic) { 2551 SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); 2552 SDValue GOTPtr; 2553 if (is64bit) { 2554 setUsesTOCBasePtr(DAG); 2555 SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); 2556 GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, 2557 GOTReg, TGA); 2558 } else { 2559 if (picLevel == 
PICLevel::SmallPIC) 2560 GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); 2561 else 2562 GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); 2563 } 2564 SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl, 2565 PtrVT, GOTPtr, TGA, TGA); 2566 SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, 2567 PtrVT, TLSAddr, TGA); 2568 return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA); 2569 } 2570 2571 llvm_unreachable("Unknown TLS model!"); 2572 } 2573 2574 SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, 2575 SelectionDAG &DAG) const { 2576 EVT PtrVT = Op.getValueType(); 2577 GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op); 2578 SDLoc DL(GSDN); 2579 const GlobalValue *GV = GSDN->getGlobal(); 2580 2581 // 64-bit SVR4 ABI code is always position-independent. 2582 // The actual address of the GlobalValue is stored in the TOC. 2583 if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { 2584 setUsesTOCBasePtr(DAG); 2585 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset()); 2586 return getTOCEntry(DAG, DL, true, GA); 2587 } 2588 2589 unsigned MOHiFlag, MOLoFlag; 2590 bool IsPIC = isPositionIndependent(); 2591 getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV); 2592 2593 if (IsPIC && Subtarget.isSVR4ABI()) { 2594 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 2595 GSDN->getOffset(), 2596 PPCII::MO_PIC_FLAG); 2597 return getTOCEntry(DAG, DL, false, GA); 2598 } 2599 2600 SDValue GAHi = 2601 DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag); 2602 SDValue GALo = 2603 DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag); 2604 2605 SDValue Ptr = LowerLabelRef(GAHi, GALo, IsPIC, DAG); 2606 2607 // If the global reference is actually to a non-lazy-pointer, we have to do an 2608 // extra load to get the address of the global. 2609 if (MOHiFlag & PPCII::MO_NLP_FLAG) 2610 Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo()); 2611 return Ptr; 2612 } 2613 2614 SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 2615 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 2616 SDLoc dl(Op); 2617 2618 if (Op.getValueType() == MVT::v2i64) { 2619 // When the operands themselves are v2i64 values, we need to do something 2620 // special because VSX has no underlying comparison operations for these. 2621 if (Op.getOperand(0).getValueType() == MVT::v2i64) { 2622 // Equality can be handled by casting to the legal type for Altivec 2623 // comparisons, everything else needs to be expanded. 2624 if (CC == ISD::SETEQ || CC == ISD::SETNE) { 2625 return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 2626 DAG.getSetCC(dl, MVT::v4i32, 2627 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)), 2628 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)), 2629 CC)); 2630 } 2631 2632 return SDValue(); 2633 } 2634 2635 // We handle most of these in the usual way. 2636 return Op; 2637 } 2638 2639 // If we're comparing for equality to zero, expose the fact that this is 2640 // implemented as a ctlz/srl pair on ppc, so that the dag combiner can 2641 // fold the new nodes. 2642 if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG)) 2643 return V; 2644 2645 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 2646 // Leave comparisons against 0 and -1 alone for now, since they're usually 2647 // optimized. FIXME: revisit this when we can custom lower all setcc 2648 // optimizations. 
2649 if (C->isAllOnesValue() || C->isNullValue()) 2650 return SDValue(); 2651 } 2652 2653 // If we have an integer seteq/setne, turn it into a compare against zero 2654 // by xor'ing the rhs with the lhs, which is faster than setting a 2655 // condition register, reading it back out, and masking the correct bit. The 2656 // normal approach here uses sub to do this instead of xor. Using xor exposes 2657 // the result to other bit-twiddling opportunities. 2658 EVT LHSVT = Op.getOperand(0).getValueType(); 2659 if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { 2660 EVT VT = Op.getValueType(); 2661 SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0), 2662 Op.getOperand(1)); 2663 return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC); 2664 } 2665 return SDValue(); 2666 } 2667 2668 SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 2669 SDNode *Node = Op.getNode(); 2670 EVT VT = Node->getValueType(0); 2671 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2672 SDValue InChain = Node->getOperand(0); 2673 SDValue VAListPtr = Node->getOperand(1); 2674 const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); 2675 SDLoc dl(Node); 2676 2677 assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only"); 2678 2679 // gpr_index 2680 SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, 2681 VAListPtr, MachinePointerInfo(SV), MVT::i8); 2682 InChain = GprIndex.getValue(1); 2683 2684 if (VT == MVT::i64) { 2685 // Check if GprIndex is even 2686 SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex, 2687 DAG.getConstant(1, dl, MVT::i32)); 2688 SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd, 2689 DAG.getConstant(0, dl, MVT::i32), ISD::SETNE); 2690 SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex, 2691 DAG.getConstant(1, dl, MVT::i32)); 2692 // Align GprIndex to be even if it isn't 2693 GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne, 2694 GprIndex); 2695 } 2696 2697 // fpr index is 1 byte after gpr 2698 SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, 2699 DAG.getConstant(1, dl, MVT::i32)); 2700 2701 // fpr 2702 SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, 2703 FprPtr, MachinePointerInfo(SV), MVT::i8); 2704 InChain = FprIndex.getValue(1); 2705 2706 SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, 2707 DAG.getConstant(8, dl, MVT::i32)); 2708 2709 SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, 2710 DAG.getConstant(4, dl, MVT::i32)); 2711 2712 // areas 2713 SDValue OverflowArea = 2714 DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo()); 2715 InChain = OverflowArea.getValue(1); 2716 2717 SDValue RegSaveArea = 2718 DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo()); 2719 InChain = RegSaveArea.getValue(1); 2720 2721 // select overflow_area if index > 8 2722 SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex, 2723 DAG.getConstant(8, dl, MVT::i32), ISD::SETLT); 2724 2725 // adjustment constant gpr_index * 4/8 2726 SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32, 2727 VT.isInteger() ? GprIndex : FprIndex, 2728 DAG.getConstant(VT.isInteger() ? 
4 : 8, dl, 2729 MVT::i32)); 2730 2731 // OurReg = RegSaveArea + RegConstant 2732 SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea, 2733 RegConstant); 2734 2735 // Floating types are 32 bytes into RegSaveArea 2736 if (VT.isFloatingPoint()) 2737 OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg, 2738 DAG.getConstant(32, dl, MVT::i32)); 2739 2740 // increase {f,g}pr_index by 1 (or 2 if VT is i64) 2741 SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32, 2742 VT.isInteger() ? GprIndex : FprIndex, 2743 DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl, 2744 MVT::i32)); 2745 2746 InChain = DAG.getTruncStore(InChain, dl, IndexPlus1, 2747 VT.isInteger() ? VAListPtr : FprPtr, 2748 MachinePointerInfo(SV), MVT::i8); 2749 2750 // determine if we should load from reg_save_area or overflow_area 2751 SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea); 2752 2753 // increase overflow_area by 4/8 if gpr/fpr > 8 2754 SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea, 2755 DAG.getConstant(VT.isInteger() ? 4 : 8, 2756 dl, MVT::i32)); 2757 2758 OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea, 2759 OverflowAreaPlusN); 2760 2761 InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr, 2762 MachinePointerInfo(), MVT::i32); 2763 2764 return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo()); 2765 } 2766 2767 SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 2768 assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only"); 2769 2770 // We have to copy the entire va_list struct: 2771 // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte 2772 return DAG.getMemcpy(Op.getOperand(0), Op, 2773 Op.getOperand(1), Op.getOperand(2), 2774 DAG.getConstant(12, SDLoc(Op), MVT::i32), 8, false, true, 2775 false, MachinePointerInfo(), MachinePointerInfo()); 2776 } 2777 2778 SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, 2779 SelectionDAG &DAG) const { 2780 return Op.getOperand(0); 2781 } 2782 2783 SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, 2784 SelectionDAG &DAG) const { 2785 SDValue Chain = Op.getOperand(0); 2786 SDValue Trmp = Op.getOperand(1); // trampoline 2787 SDValue FPtr = Op.getOperand(2); // nested function 2788 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 2789 SDLoc dl(Op); 2790 2791 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2792 bool isPPC64 = (PtrVT == MVT::i64); 2793 Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); 2794 2795 TargetLowering::ArgListTy Args; 2796 TargetLowering::ArgListEntry Entry; 2797 2798 Entry.Ty = IntPtrTy; 2799 Entry.Node = Trmp; Args.push_back(Entry); 2800 2801 // TrampSize == (isPPC64 ? 48 : 40); 2802 Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl, 2803 isPPC64 ? 
MVT::i64 : MVT::i32); 2804 Args.push_back(Entry); 2805 2806 Entry.Node = FPtr; Args.push_back(Entry); 2807 Entry.Node = Nest; Args.push_back(Entry); 2808 2809 // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) 2810 TargetLowering::CallLoweringInfo CLI(DAG); 2811 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( 2812 CallingConv::C, Type::getVoidTy(*DAG.getContext()), 2813 DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args)); 2814 2815 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 2816 return CallResult.second; 2817 } 2818 2819 SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 2820 MachineFunction &MF = DAG.getMachineFunction(); 2821 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 2822 EVT PtrVT = getPointerTy(MF.getDataLayout()); 2823 2824 SDLoc dl(Op); 2825 2826 if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) { 2827 // vastart just stores the address of the VarArgsFrameIndex slot into the 2828 // memory location argument. 2829 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 2830 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2831 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), 2832 MachinePointerInfo(SV)); 2833 } 2834 2835 // For the 32-bit SVR4 ABI we follow the layout of the va_list struct. 2836 // We suppose the given va_list is already allocated. 2837 // 2838 // typedef struct { 2839 // char gpr; /* index into the array of 8 GPRs 2840 // * stored in the register save area 2841 // * gpr=0 corresponds to r3, 2842 // * gpr=1 to r4, etc. 2843 // */ 2844 // char fpr; /* index into the array of 8 FPRs 2845 // * stored in the register save area 2846 // * fpr=0 corresponds to f1, 2847 // * fpr=1 to f2, etc. 
2848 // */ 2849 // char *overflow_arg_area; 2850 // /* location on stack that holds 2851 // * the next overflow argument 2852 // */ 2853 // char *reg_save_area; 2854 // /* where r3:r10 and f1:f8 (if saved) 2855 // * are stored 2856 // */ 2857 // } va_list[1]; 2858 2859 SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32); 2860 SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32); 2861 SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(), 2862 PtrVT); 2863 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 2864 PtrVT); 2865 2866 uint64_t FrameOffset = PtrVT.getSizeInBits()/8; 2867 SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT); 2868 2869 uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1; 2870 SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT); 2871 2872 uint64_t FPROffset = 1; 2873 SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT); 2874 2875 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2876 2877 // Store first byte : number of int regs 2878 SDValue firstStore = 2879 DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1), 2880 MachinePointerInfo(SV), MVT::i8); 2881 uint64_t nextOffset = FPROffset; 2882 SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1), 2883 ConstFPROffset); 2884 2885 // Store second byte : number of float regs 2886 SDValue secondStore = 2887 DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr, 2888 MachinePointerInfo(SV, nextOffset), MVT::i8); 2889 nextOffset += StackOffset; 2890 nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset); 2891 2892 // Store second word : arguments given on stack 2893 SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr, 2894 MachinePointerInfo(SV, nextOffset)); 2895 nextOffset += FrameOffset; 2896 nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset); 2897 2898 // Store third word : arguments given in registers 2899 return DAG.getStore(thirdStore, dl, FR, nextPtr, 2900 MachinePointerInfo(SV, nextOffset)); 2901 } 2902 2903 #include "PPCGenCallingConv.inc" 2904 2905 // Function whose sole purpose is to kill compiler warnings 2906 // stemming from unused functions included from PPCGenCallingConv.inc. 2907 CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const { 2908 return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS; 2909 } 2910 2911 bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, 2912 CCValAssign::LocInfo &LocInfo, 2913 ISD::ArgFlagsTy &ArgFlags, 2914 CCState &State) { 2915 return true; 2916 } 2917 2918 bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, 2919 MVT &LocVT, 2920 CCValAssign::LocInfo &LocInfo, 2921 ISD::ArgFlagsTy &ArgFlags, 2922 CCState &State) { 2923 static const MCPhysReg ArgRegs[] = { 2924 PPC::R3, PPC::R4, PPC::R5, PPC::R6, 2925 PPC::R7, PPC::R8, PPC::R9, PPC::R10, 2926 }; 2927 const unsigned NumArgRegs = array_lengthof(ArgRegs); 2928 2929 unsigned RegNum = State.getFirstUnallocated(ArgRegs); 2930 2931 // Skip one register if the first unallocated register has an even register 2932 // number and there are still argument registers available which have not been 2933 // allocated yet. RegNum is actually an index into ArgRegs, which means we 2934 // need to skip a register if RegNum is odd. 
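  // Worked example (illustrative only, not quoted from the ABI document): if
  // r3 already holds an i32 argument, getFirstUnallocated() returns
  // RegNum == 1 (i.e. r4), so we burn r4 below and the split i64 lands in the
  // aligned pair r5:r6; if RegNum is already even (say 2, i.e. r5), nothing
  // is skipped.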
2935 if (RegNum != NumArgRegs && RegNum % 2 == 1) { 2936 State.AllocateReg(ArgRegs[RegNum]); 2937 } 2938 2939 // Always return false here, as this function only makes sure that the first 2940 // unallocated register has an odd register number and does not actually 2941 // allocate a register for the current argument. 2942 return false; 2943 } 2944 2945 bool 2946 llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT, 2947 MVT &LocVT, 2948 CCValAssign::LocInfo &LocInfo, 2949 ISD::ArgFlagsTy &ArgFlags, 2950 CCState &State) { 2951 static const MCPhysReg ArgRegs[] = { 2952 PPC::R3, PPC::R4, PPC::R5, PPC::R6, 2953 PPC::R7, PPC::R8, PPC::R9, PPC::R10, 2954 }; 2955 const unsigned NumArgRegs = array_lengthof(ArgRegs); 2956 2957 unsigned RegNum = State.getFirstUnallocated(ArgRegs); 2958 int RegsLeft = NumArgRegs - RegNum; 2959 2960 // Skip if there is not enough registers left for long double type (4 gpr regs 2961 // in soft float mode) and put long double argument on the stack. 2962 if (RegNum != NumArgRegs && RegsLeft < 4) { 2963 for (int i = 0; i < RegsLeft; i++) { 2964 State.AllocateReg(ArgRegs[RegNum + i]); 2965 } 2966 } 2967 2968 return false; 2969 } 2970 2971 bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, 2972 MVT &LocVT, 2973 CCValAssign::LocInfo &LocInfo, 2974 ISD::ArgFlagsTy &ArgFlags, 2975 CCState &State) { 2976 static const MCPhysReg ArgRegs[] = { 2977 PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, 2978 PPC::F8 2979 }; 2980 2981 const unsigned NumArgRegs = array_lengthof(ArgRegs); 2982 2983 unsigned RegNum = State.getFirstUnallocated(ArgRegs); 2984 2985 // If there is only one Floating-point register left we need to put both f64 2986 // values of a split ppc_fp128 value on the stack. 2987 if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) { 2988 State.AllocateReg(ArgRegs[RegNum]); 2989 } 2990 2991 // Always return false here, as this function only makes sure that the two f64 2992 // values a ppc_fp128 value is split into are both passed in registers or both 2993 // passed on the stack and does not actually allocate a register for the 2994 // current argument. 2995 return false; 2996 } 2997 2998 /// FPR - The set of FP registers that should be allocated for arguments, 2999 /// on Darwin. 3000 static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, 3001 PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, 3002 PPC::F11, PPC::F12, PPC::F13}; 3003 3004 /// QFPR - The set of QPX registers that should be allocated for arguments. 3005 static const MCPhysReg QFPR[] = { 3006 PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, 3007 PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13}; 3008 3009 /// CalculateStackSlotSize - Calculates the size reserved for this argument on 3010 /// the stack. 3011 static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, 3012 unsigned PtrByteSize) { 3013 unsigned ArgSize = ArgVT.getStoreSize(); 3014 if (Flags.isByVal()) 3015 ArgSize = Flags.getByValSize(); 3016 3017 // Round up to multiples of the pointer size, except for array members, 3018 // which are always packed. 3019 if (!Flags.isInConsecutiveRegs()) 3020 ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 3021 3022 return ArgSize; 3023 } 3024 3025 /// CalculateStackSlotAlignment - Calculates the alignment of this argument 3026 /// on the stack. 
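/// A few sample results of the rules implemented below, shown only for
/// orientation (the code is authoritative): plain i64/f64 and small byval
/// arguments align to PtrByteSize; Altivec/VSX vectors such as v4f32 or
/// v2f64 align to 16 bytes; QPX v4f64/v4i1 align to 32 bytes; and a byval
/// with a requested alignment larger than the pointer size keeps its
/// requested alignment.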
3027 static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, 3028 ISD::ArgFlagsTy Flags, 3029 unsigned PtrByteSize) { 3030 unsigned Align = PtrByteSize; 3031 3032 // Altivec parameters are padded to a 16 byte boundary. 3033 if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || 3034 ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || 3035 ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || 3036 ArgVT == MVT::v1i128) 3037 Align = 16; 3038 // QPX vector types stored in double-precision are padded to a 32 byte 3039 // boundary. 3040 else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1) 3041 Align = 32; 3042 3043 // ByVal parameters are aligned as requested. 3044 if (Flags.isByVal()) { 3045 unsigned BVAlign = Flags.getByValAlign(); 3046 if (BVAlign > PtrByteSize) { 3047 if (BVAlign % PtrByteSize != 0) 3048 llvm_unreachable( 3049 "ByVal alignment is not a multiple of the pointer size"); 3050 3051 Align = BVAlign; 3052 } 3053 } 3054 3055 // Array members are always packed to their original alignment. 3056 if (Flags.isInConsecutiveRegs()) { 3057 // If the array member was split into multiple registers, the first 3058 // needs to be aligned to the size of the full type. (Except for 3059 // ppcf128, which is only aligned as its f64 components.) 3060 if (Flags.isSplit() && OrigVT != MVT::ppcf128) 3061 Align = OrigVT.getStoreSize(); 3062 else 3063 Align = ArgVT.getStoreSize(); 3064 } 3065 3066 return Align; 3067 } 3068 3069 /// CalculateStackSlotUsed - Return whether this argument will use its 3070 /// stack slot (instead of being passed in registers). ArgOffset, 3071 /// AvailableFPRs, and AvailableVRs must hold the current argument 3072 /// position, and will be updated to account for this argument. 3073 static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, 3074 ISD::ArgFlagsTy Flags, 3075 unsigned PtrByteSize, 3076 unsigned LinkageSize, 3077 unsigned ParamAreaSize, 3078 unsigned &ArgOffset, 3079 unsigned &AvailableFPRs, 3080 unsigned &AvailableVRs, bool HasQPX) { 3081 bool UseMemory = false; 3082 3083 // Respect alignment of argument on the stack. 3084 unsigned Align = 3085 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); 3086 ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; 3087 // If there's no space left in the argument save area, we must 3088 // use memory (this check also catches zero-sized arguments). 3089 if (ArgOffset >= LinkageSize + ParamAreaSize) 3090 UseMemory = true; 3091 3092 // Allocate argument on the stack. 3093 ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); 3094 if (Flags.isInConsecutiveRegsLast()) 3095 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 3096 // If we overran the argument save area, we must use memory 3097 // (this check catches arguments passed partially in memory) 3098 if (ArgOffset > LinkageSize + ParamAreaSize) 3099 UseMemory = true; 3100 3101 // However, if the argument is actually passed in an FPR or a VR, 3102 // we don't use memory after all. 3103 if (!Flags.isByVal()) { 3104 if (ArgVT == MVT::f32 || ArgVT == MVT::f64 || 3105 // QPX registers overlap with the scalar FP registers. 
3106 (HasQPX && (ArgVT == MVT::v4f32 || 3107 ArgVT == MVT::v4f64 || 3108 ArgVT == MVT::v4i1))) 3109 if (AvailableFPRs > 0) { 3110 --AvailableFPRs; 3111 return false; 3112 } 3113 if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || 3114 ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || 3115 ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || 3116 ArgVT == MVT::v1i128) 3117 if (AvailableVRs > 0) { 3118 --AvailableVRs; 3119 return false; 3120 } 3121 } 3122 3123 return UseMemory; 3124 } 3125 3126 /// EnsureStackAlignment - Round stack frame size up from NumBytes to 3127 /// ensure minimum alignment required for target. 3128 static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering, 3129 unsigned NumBytes) { 3130 unsigned TargetAlign = Lowering->getStackAlignment(); 3131 unsigned AlignMask = TargetAlign - 1; 3132 NumBytes = (NumBytes + AlignMask) & ~AlignMask; 3133 return NumBytes; 3134 } 3135 3136 SDValue PPCTargetLowering::LowerFormalArguments( 3137 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3138 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3139 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3140 if (Subtarget.isSVR4ABI()) { 3141 if (Subtarget.isPPC64()) 3142 return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, 3143 dl, DAG, InVals); 3144 else 3145 return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, 3146 dl, DAG, InVals); 3147 } else { 3148 return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, 3149 dl, DAG, InVals); 3150 } 3151 } 3152 3153 SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( 3154 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3155 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3156 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3157 3158 // 32-bit SVR4 ABI Stack Frame Layout: 3159 // +-----------------------------------+ 3160 // +--> | Back chain | 3161 // | +-----------------------------------+ 3162 // | | Floating-point register save area | 3163 // | +-----------------------------------+ 3164 // | | General register save area | 3165 // | +-----------------------------------+ 3166 // | | CR save word | 3167 // | +-----------------------------------+ 3168 // | | VRSAVE save word | 3169 // | +-----------------------------------+ 3170 // | | Alignment padding | 3171 // | +-----------------------------------+ 3172 // | | Vector register save area | 3173 // | +-----------------------------------+ 3174 // | | Local variable space | 3175 // | +-----------------------------------+ 3176 // | | Parameter list area | 3177 // | +-----------------------------------+ 3178 // | | LR save word | 3179 // | +-----------------------------------+ 3180 // SP--> +--- | Back chain | 3181 // +-----------------------------------+ 3182 // 3183 // Specifications: 3184 // System V Application Binary Interface PowerPC Processor Supplement 3185 // AltiVec Technology Programming Interface Manual 3186 3187 MachineFunction &MF = DAG.getMachineFunction(); 3188 MachineFrameInfo &MFI = MF.getFrameInfo(); 3189 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 3190 3191 EVT PtrVT = getPointerTy(MF.getDataLayout()); 3192 // Potential tail calls could cause overwriting of argument stack slots. 3193 bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && 3194 (CallConv == CallingConv::Fast)); 3195 unsigned PtrByteSize = 4; 3196 3197 // Assign locations to all of the incoming arguments. 
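  // For intuition only (the tables in PPCCallingConv.td are authoritative):
  // with CC_PPC32_SVR4 a prototype such as
  //   void f(int a, long long b, double c, int d);
  // typically assigns a -> r3, b -> r5:r6 (r4 skipped for pair alignment),
  // c -> f1 and d -> r7; anything beyond 8 GPRs / 8 FPRs goes to the
  // parameter list area shown in the diagram above.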
3198 SmallVector<CCValAssign, 16> ArgLocs; 3199 PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 3200 *DAG.getContext()); 3201 3202 // Reserve space for the linkage area on the stack. 3203 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); 3204 CCInfo.AllocateStack(LinkageSize, PtrByteSize); 3205 if (useSoftFloat()) 3206 CCInfo.PreAnalyzeFormalArguments(Ins); 3207 3208 CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); 3209 CCInfo.clearWasPPCF128(); 3210 3211 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3212 CCValAssign &VA = ArgLocs[i]; 3213 3214 // Arguments stored in registers. 3215 if (VA.isRegLoc()) { 3216 const TargetRegisterClass *RC; 3217 EVT ValVT = VA.getValVT(); 3218 3219 switch (ValVT.getSimpleVT().SimpleTy) { 3220 default: 3221 llvm_unreachable("ValVT not supported by formal arguments Lowering"); 3222 case MVT::i1: 3223 case MVT::i32: 3224 RC = &PPC::GPRCRegClass; 3225 break; 3226 case MVT::f32: 3227 if (Subtarget.hasP8Vector()) 3228 RC = &PPC::VSSRCRegClass; 3229 else 3230 RC = &PPC::F4RCRegClass; 3231 break; 3232 case MVT::f64: 3233 if (Subtarget.hasVSX()) 3234 RC = &PPC::VSFRCRegClass; 3235 else 3236 RC = &PPC::F8RCRegClass; 3237 break; 3238 case MVT::v16i8: 3239 case MVT::v8i16: 3240 case MVT::v4i32: 3241 RC = &PPC::VRRCRegClass; 3242 break; 3243 case MVT::v4f32: 3244 RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass; 3245 break; 3246 case MVT::v2f64: 3247 case MVT::v2i64: 3248 RC = &PPC::VRRCRegClass; 3249 break; 3250 case MVT::v4f64: 3251 RC = &PPC::QFRCRegClass; 3252 break; 3253 case MVT::v4i1: 3254 RC = &PPC::QBRCRegClass; 3255 break; 3256 } 3257 3258 // Transform the arguments stored in physical registers into virtual ones. 3259 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3260 SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, 3261 ValVT == MVT::i1 ? MVT::i32 : ValVT); 3262 3263 if (ValVT == MVT::i1) 3264 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue); 3265 3266 InVals.push_back(ArgValue); 3267 } else { 3268 // Argument stored in memory. 3269 assert(VA.isMemLoc()); 3270 3271 unsigned ArgSize = VA.getLocVT().getStoreSize(); 3272 int FI = MFI.CreateFixedObject(ArgSize, VA.getLocMemOffset(), 3273 isImmutable); 3274 3275 // Create load nodes to retrieve arguments from the stack. 3276 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3277 InVals.push_back( 3278 DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo())); 3279 } 3280 } 3281 3282 // Assign locations to all of the incoming aggregate by value arguments. 3283 // Aggregates passed by value are stored in the local variable space of the 3284 // caller's stack frame, right above the parameter list area. 3285 SmallVector<CCValAssign, 16> ByValArgLocs; 3286 CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), 3287 ByValArgLocs, *DAG.getContext()); 3288 3289 // Reserve stack space for the allocations in CCInfo. 3290 CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); 3291 3292 CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal); 3293 3294 // Area that is at least reserved in the caller of this function. 3295 unsigned MinReservedArea = CCByValInfo.getNextStackOffset(); 3296 MinReservedArea = std::max(MinReservedArea, LinkageSize); 3297 3298 // Set the size that is at least reserved in caller of this function. Tail 3299 // call optimized function's reserved stack space needs to be aligned so that 3300 // taking the difference between two stack areas will result in an aligned 3301 // stack. 
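  // Numeric sketch (assuming the usual 16-byte frame alignment for this
  // target): a MinReservedArea of 20 bytes becomes (20 + 15) & ~15 == 32
  // after the EnsureStackAlignment call below.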
3302 MinReservedArea = 3303 EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); 3304 FuncInfo->setMinReservedArea(MinReservedArea); 3305 3306 SmallVector<SDValue, 8> MemOps; 3307 3308 // If the function takes variable number of arguments, make a frame index for 3309 // the start of the first vararg value... for expansion of llvm.va_start. 3310 if (isVarArg) { 3311 static const MCPhysReg GPArgRegs[] = { 3312 PPC::R3, PPC::R4, PPC::R5, PPC::R6, 3313 PPC::R7, PPC::R8, PPC::R9, PPC::R10, 3314 }; 3315 const unsigned NumGPArgRegs = array_lengthof(GPArgRegs); 3316 3317 static const MCPhysReg FPArgRegs[] = { 3318 PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, 3319 PPC::F8 3320 }; 3321 unsigned NumFPArgRegs = array_lengthof(FPArgRegs); 3322 3323 if (useSoftFloat()) 3324 NumFPArgRegs = 0; 3325 3326 FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs)); 3327 FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs)); 3328 3329 // Make room for NumGPArgRegs and NumFPArgRegs. 3330 int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 + 3331 NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8; 3332 3333 FuncInfo->setVarArgsStackOffset( 3334 MFI.CreateFixedObject(PtrVT.getSizeInBits()/8, 3335 CCInfo.getNextStackOffset(), true)); 3336 3337 FuncInfo->setVarArgsFrameIndex(MFI.CreateStackObject(Depth, 8, false)); 3338 SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 3339 3340 // The fixed integer arguments of a variadic function are stored to the 3341 // VarArgsFrameIndex on the stack so that they may be loaded by 3342 // dereferencing the result of va_next. 3343 for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) { 3344 // Get an existing live-in vreg, or add a new one. 3345 unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]); 3346 if (!VReg) 3347 VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass); 3348 3349 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 3350 SDValue Store = 3351 DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); 3352 MemOps.push_back(Store); 3353 // Increment the address by four for the next argument to store 3354 SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT); 3355 FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); 3356 } 3357 3358 // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6 3359 // is set. 3360 // The double arguments are stored to the VarArgsFrameIndex 3361 // on the stack. 3362 for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) { 3363 // Get an existing live-in vreg, or add a new one. 3364 unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]); 3365 if (!VReg) 3366 VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass); 3367 3368 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64); 3369 SDValue Store = 3370 DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); 3371 MemOps.push_back(Store); 3372 // Increment the address by eight for the next argument to store 3373 SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl, 3374 PtrVT); 3375 FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); 3376 } 3377 } 3378 3379 if (!MemOps.empty()) 3380 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 3381 3382 return Chain; 3383 } 3384 3385 // PPC64 passes i8, i16, and i32 values in i64 registers. Promote 3386 // value to MVT::i64 and then truncate to the correct register size. 
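// Rough shape of the nodes built below for a sign-extended i32 argument
// (sketch only):
//   t1: i64 = CopyFromReg ...        (materialized by the caller of this helper)
//   t2: i64 = AssertSext t1, ValueType:i32
//   t3: i32 = truncate t2
// The AssertSext/AssertZext node lets later combines assume the upper bits
// already carry the advertised extension.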
3387 SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, 3388 EVT ObjectVT, SelectionDAG &DAG, 3389 SDValue ArgVal, 3390 const SDLoc &dl) const { 3391 if (Flags.isSExt()) 3392 ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal, 3393 DAG.getValueType(ObjectVT)); 3394 else if (Flags.isZExt()) 3395 ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal, 3396 DAG.getValueType(ObjectVT)); 3397 3398 return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal); 3399 } 3400 3401 SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( 3402 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3403 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3404 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3405 // TODO: add description of PPC stack frame format, or at least some docs. 3406 // 3407 bool isELFv2ABI = Subtarget.isELFv2ABI(); 3408 bool isLittleEndian = Subtarget.isLittleEndian(); 3409 MachineFunction &MF = DAG.getMachineFunction(); 3410 MachineFrameInfo &MFI = MF.getFrameInfo(); 3411 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 3412 3413 assert(!(CallConv == CallingConv::Fast && isVarArg) && 3414 "fastcc not supported on varargs functions"); 3415 3416 EVT PtrVT = getPointerTy(MF.getDataLayout()); 3417 // Potential tail calls could cause overwriting of argument stack slots. 3418 bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && 3419 (CallConv == CallingConv::Fast)); 3420 unsigned PtrByteSize = 8; 3421 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); 3422 3423 static const MCPhysReg GPR[] = { 3424 PPC::X3, PPC::X4, PPC::X5, PPC::X6, 3425 PPC::X7, PPC::X8, PPC::X9, PPC::X10, 3426 }; 3427 static const MCPhysReg VR[] = { 3428 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, 3429 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 3430 }; 3431 3432 const unsigned Num_GPR_Regs = array_lengthof(GPR); 3433 const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; 3434 const unsigned Num_VR_Regs = array_lengthof(VR); 3435 const unsigned Num_QFPR_Regs = Num_FPR_Regs; 3436 3437 // Do a first pass over the arguments to determine whether the ABI 3438 // guarantees that our caller has allocated the parameter save area 3439 // on its stack frame. In the ELFv1 ABI, this is always the case; 3440 // in the ELFv2 ABI, it is true if this is a vararg function or if 3441 // any parameter is located in a stack slot. 3442 3443 bool HasParameterArea = !isELFv2ABI || isVarArg; 3444 unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize; 3445 unsigned NumBytes = LinkageSize; 3446 unsigned AvailableFPRs = Num_FPR_Regs; 3447 unsigned AvailableVRs = Num_VR_Regs; 3448 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 3449 if (Ins[i].Flags.isNest()) 3450 continue; 3451 3452 if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, 3453 PtrByteSize, LinkageSize, ParamAreaSize, 3454 NumBytes, AvailableFPRs, AvailableVRs, 3455 Subtarget.hasQPX())) 3456 HasParameterArea = true; 3457 } 3458 3459 // Add DAG nodes to load the arguments or copy them out of registers. On 3460 // entry to a function on PPC, the arguments start after the linkage area, 3461 // although the first ones are often in registers. 
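  // Illustration of the HasParameterArea pre-pass above (ELFv2, non-vararg):
  // nine register-sized integer arguments place the ninth at offset
  // LinkageSize + 64, beyond the 8 * 8 byte parameter area, so the caller
  // must have materialized the save area; with eight or fewer such arguments
  // ELFv2 allows the caller to omit it, which is what HasParameterArea
  // tracks.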
3462 3463 unsigned ArgOffset = LinkageSize; 3464 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; 3465 unsigned &QFPR_idx = FPR_idx; 3466 SmallVector<SDValue, 8> MemOps; 3467 Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); 3468 unsigned CurArgIdx = 0; 3469 for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { 3470 SDValue ArgVal; 3471 bool needsLoad = false; 3472 EVT ObjectVT = Ins[ArgNo].VT; 3473 EVT OrigVT = Ins[ArgNo].ArgVT; 3474 unsigned ObjSize = ObjectVT.getStoreSize(); 3475 unsigned ArgSize = ObjSize; 3476 ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; 3477 if (Ins[ArgNo].isOrigArg()) { 3478 std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); 3479 CurArgIdx = Ins[ArgNo].getOrigArgIndex(); 3480 } 3481 // We re-align the argument offset for each argument, except when using the 3482 // fast calling convention, when we need to make sure we do that only when 3483 // we'll actually use a stack slot. 3484 unsigned CurArgOffset, Align; 3485 auto ComputeArgOffset = [&]() { 3486 /* Respect alignment of argument on the stack. */ 3487 Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize); 3488 ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; 3489 CurArgOffset = ArgOffset; 3490 }; 3491 3492 if (CallConv != CallingConv::Fast) { 3493 ComputeArgOffset(); 3494 3495 /* Compute GPR index associated with argument offset. */ 3496 GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; 3497 GPR_idx = std::min(GPR_idx, Num_GPR_Regs); 3498 } 3499 3500 // FIXME the codegen can be much improved in some cases. 3501 // We do not have to keep everything in memory. 3502 if (Flags.isByVal()) { 3503 assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); 3504 3505 if (CallConv == CallingConv::Fast) 3506 ComputeArgOffset(); 3507 3508 // ObjSize is the true size, ArgSize rounded up to multiple of registers. 3509 ObjSize = Flags.getByValSize(); 3510 ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 3511 // Empty aggregate parameters do not take up registers. Examples: 3512 // struct { } a; 3513 // union { } b; 3514 // int c[0]; 3515 // etc. However, we have to provide a place-holder in InVals, so 3516 // pretend we have an 8-byte item at the current address for that 3517 // purpose. 3518 if (!ObjSize) { 3519 int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true); 3520 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3521 InVals.push_back(FIN); 3522 continue; 3523 } 3524 3525 // Create a stack object covering all stack doublewords occupied 3526 // by the argument. If the argument is (fully or partially) on 3527 // the stack, or if the argument is fully in registers but the 3528 // caller has allocated the parameter save anyway, we can refer 3529 // directly to the caller's stack frame. Otherwise, create a 3530 // local copy in our own frame. 3531 int FI; 3532 if (HasParameterArea || 3533 ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize) 3534 FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true); 3535 else 3536 FI = MFI.CreateStackObject(ArgSize, Align, false); 3537 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3538 3539 // Handle aggregates smaller than 8 bytes. 3540 if (ObjSize < PtrByteSize) { 3541 // The value of the object is its address, which differs from the 3542 // address of the enclosing doubleword on big-endian systems. 
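        // Example (illustrative): a 3-byte byval argument on big-endian
        // occupies the high-order bytes of its doubleword, so its address is
        // FIN + (8 - 3) = FIN + 5; on little-endian it simply starts at FIN.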
3543 SDValue Arg = FIN; 3544 if (!isLittleEndian) { 3545 SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT); 3546 Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff); 3547 } 3548 InVals.push_back(Arg); 3549 3550 if (GPR_idx != Num_GPR_Regs) { 3551 unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); 3552 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 3553 SDValue Store; 3554 3555 if (ObjSize==1 || ObjSize==2 || ObjSize==4) { 3556 EVT ObjType = (ObjSize == 1 ? MVT::i8 : 3557 (ObjSize == 2 ? MVT::i16 : MVT::i32)); 3558 Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg, 3559 MachinePointerInfo(&*FuncArg), ObjType); 3560 } else { 3561 // For sizes that don't fit a truncating store (3, 5, 6, 7), 3562 // store the whole register as-is to the parameter save area 3563 // slot. 3564 Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, 3565 MachinePointerInfo(&*FuncArg)); 3566 } 3567 3568 MemOps.push_back(Store); 3569 } 3570 // Whether we copied from a register or not, advance the offset 3571 // into the parameter save area by a full doubleword. 3572 ArgOffset += PtrByteSize; 3573 continue; 3574 } 3575 3576 // The value of the object is its address, which is the address of 3577 // its first stack doubleword. 3578 InVals.push_back(FIN); 3579 3580 // Store whatever pieces of the object are in registers to memory. 3581 for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { 3582 if (GPR_idx == Num_GPR_Regs) 3583 break; 3584 3585 unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 3586 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 3587 SDValue Addr = FIN; 3588 if (j) { 3589 SDValue Off = DAG.getConstant(j, dl, PtrVT); 3590 Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); 3591 } 3592 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr, 3593 MachinePointerInfo(&*FuncArg, j)); 3594 MemOps.push_back(Store); 3595 ++GPR_idx; 3596 } 3597 ArgOffset += ArgSize; 3598 continue; 3599 } 3600 3601 switch (ObjectVT.getSimpleVT().SimpleTy) { 3602 default: llvm_unreachable("Unhandled argument type!"); 3603 case MVT::i1: 3604 case MVT::i32: 3605 case MVT::i64: 3606 if (Flags.isNest()) { 3607 // The 'nest' parameter, if any, is passed in R11. 3608 unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass); 3609 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 3610 3611 if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) 3612 ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); 3613 3614 break; 3615 } 3616 3617 // These can be scalar arguments or elements of an integer array type 3618 // passed directly. Clang may use those instead of "byval" aggregate 3619 // types to avoid forcing arguments to memory unnecessarily. 3620 if (GPR_idx != Num_GPR_Regs) { 3621 unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); 3622 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 3623 3624 if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) 3625 // PPC64 passes i8, i16, and i32 values in i64 registers. Promote 3626 // value to MVT::i64 and then truncate to the correct register size. 
3627 ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); 3628 } else { 3629 if (CallConv == CallingConv::Fast) 3630 ComputeArgOffset(); 3631 3632 needsLoad = true; 3633 ArgSize = PtrByteSize; 3634 } 3635 if (CallConv != CallingConv::Fast || needsLoad) 3636 ArgOffset += 8; 3637 break; 3638 3639 case MVT::f32: 3640 case MVT::f64: 3641 // These can be scalar arguments or elements of a float array type 3642 // passed directly. The latter are used to implement ELFv2 homogenous 3643 // float aggregates. 3644 if (FPR_idx != Num_FPR_Regs) { 3645 unsigned VReg; 3646 3647 if (ObjectVT == MVT::f32) 3648 VReg = MF.addLiveIn(FPR[FPR_idx], 3649 Subtarget.hasP8Vector() 3650 ? &PPC::VSSRCRegClass 3651 : &PPC::F4RCRegClass); 3652 else 3653 VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() 3654 ? &PPC::VSFRCRegClass 3655 : &PPC::F8RCRegClass); 3656 3657 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 3658 ++FPR_idx; 3659 } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) { 3660 // FIXME: We may want to re-enable this for CallingConv::Fast on the P8 3661 // once we support fp <-> gpr moves. 3662 3663 // This can only ever happen in the presence of f32 array types, 3664 // since otherwise we never run out of FPRs before running out 3665 // of GPRs. 3666 unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); 3667 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 3668 3669 if (ObjectVT == MVT::f32) { 3670 if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0)) 3671 ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal, 3672 DAG.getConstant(32, dl, MVT::i32)); 3673 ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); 3674 } 3675 3676 ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal); 3677 } else { 3678 if (CallConv == CallingConv::Fast) 3679 ComputeArgOffset(); 3680 3681 needsLoad = true; 3682 } 3683 3684 // When passing an array of floats, the array occupies consecutive 3685 // space in the argument area; only round up to the next doubleword 3686 // at the end of the array. Otherwise, each float takes 8 bytes. 3687 if (CallConv != CallingConv::Fast || needsLoad) { 3688 ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize; 3689 ArgOffset += ArgSize; 3690 if (Flags.isInConsecutiveRegsLast()) 3691 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 3692 } 3693 break; 3694 case MVT::v4f32: 3695 case MVT::v4i32: 3696 case MVT::v8i16: 3697 case MVT::v16i8: 3698 case MVT::v2f64: 3699 case MVT::v2i64: 3700 case MVT::v1i128: 3701 if (!Subtarget.hasQPX()) { 3702 // These can be scalar arguments or elements of a vector array type 3703 // passed directly. The latter are used to implement ELFv2 homogenous 3704 // vector aggregates. 3705 if (VR_idx != Num_VR_Regs) { 3706 unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); 3707 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 3708 ++VR_idx; 3709 } else { 3710 if (CallConv == CallingConv::Fast) 3711 ComputeArgOffset(); 3712 3713 needsLoad = true; 3714 } 3715 if (CallConv != CallingConv::Fast || needsLoad) 3716 ArgOffset += 16; 3717 break; 3718 } // not QPX 3719 3720 assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 && 3721 "Invalid QPX parameter type"); 3722 /* fall through */ 3723 3724 case MVT::v4f64: 3725 case MVT::v4i1: 3726 // QPX vectors are treated like their scalar floating-point subregisters 3727 // (except that they're larger). 3728 unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 
16 : 32; 3729 if (QFPR_idx != Num_QFPR_Regs) { 3730 const TargetRegisterClass *RC; 3731 switch (ObjectVT.getSimpleVT().SimpleTy) { 3732 case MVT::v4f64: RC = &PPC::QFRCRegClass; break; 3733 case MVT::v4f32: RC = &PPC::QSRCRegClass; break; 3734 default: RC = &PPC::QBRCRegClass; break; 3735 } 3736 3737 unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC); 3738 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 3739 ++QFPR_idx; 3740 } else { 3741 if (CallConv == CallingConv::Fast) 3742 ComputeArgOffset(); 3743 needsLoad = true; 3744 } 3745 if (CallConv != CallingConv::Fast || needsLoad) 3746 ArgOffset += Sz; 3747 break; 3748 } 3749 3750 // We need to load the argument to a virtual register if we determined 3751 // above that we ran out of physical registers of the appropriate type. 3752 if (needsLoad) { 3753 if (ObjSize < ArgSize && !isLittleEndian) 3754 CurArgOffset += ArgSize - ObjSize; 3755 int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable); 3756 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3757 ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo()); 3758 } 3759 3760 InVals.push_back(ArgVal); 3761 } 3762 3763 // Area that is at least reserved in the caller of this function. 3764 unsigned MinReservedArea; 3765 if (HasParameterArea) 3766 MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize); 3767 else 3768 MinReservedArea = LinkageSize; 3769 3770 // Set the size that is at least reserved in caller of this function. Tail 3771 // call optimized functions' reserved stack space needs to be aligned so that 3772 // taking the difference between two stack areas will result in an aligned 3773 // stack. 3774 MinReservedArea = 3775 EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); 3776 FuncInfo->setMinReservedArea(MinReservedArea); 3777 3778 // If the function takes variable number of arguments, make a frame index for 3779 // the start of the first vararg value... for expansion of llvm.va_start. 3780 if (isVarArg) { 3781 int Depth = ArgOffset; 3782 3783 FuncInfo->setVarArgsFrameIndex( 3784 MFI.CreateFixedObject(PtrByteSize, Depth, true)); 3785 SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 3786 3787 // If this function is vararg, store any remaining integer argument regs 3788 // to their spots on the stack so that they may be loaded by dereferencing 3789 // the result of va_next. 3790 for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; 3791 GPR_idx < Num_GPR_Regs; ++GPR_idx) { 3792 unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 3793 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 3794 SDValue Store = 3795 DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); 3796 MemOps.push_back(Store); 3797 // Increment the address by four for the next argument to store 3798 SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT); 3799 FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); 3800 } 3801 } 3802 3803 if (!MemOps.empty()) 3804 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 3805 3806 return Chain; 3807 } 3808 3809 SDValue PPCTargetLowering::LowerFormalArguments_Darwin( 3810 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3811 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3812 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3813 // TODO: add description of PPC stack frame format, or at least some docs. 
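// Very roughly, and subject to the Darwin ABI documents rather than this
// comment: the caller provides a linkage area followed by a parameter area of
// at least 8 pointer-sized words, and the leading portion of that area is
// shadowed by r3-r10, f1-f13 and v2-v13 as used below.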
3814 // 3815 MachineFunction &MF = DAG.getMachineFunction(); 3816 MachineFrameInfo &MFI = MF.getFrameInfo(); 3817 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 3818 3819 EVT PtrVT = getPointerTy(MF.getDataLayout()); 3820 bool isPPC64 = PtrVT == MVT::i64; 3821 // Potential tail calls could cause overwriting of argument stack slots. 3822 bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && 3823 (CallConv == CallingConv::Fast)); 3824 unsigned PtrByteSize = isPPC64 ? 8 : 4; 3825 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); 3826 unsigned ArgOffset = LinkageSize; 3827 // Area that is at least reserved in caller of this function. 3828 unsigned MinReservedArea = ArgOffset; 3829 3830 static const MCPhysReg GPR_32[] = { // 32-bit registers. 3831 PPC::R3, PPC::R4, PPC::R5, PPC::R6, 3832 PPC::R7, PPC::R8, PPC::R9, PPC::R10, 3833 }; 3834 static const MCPhysReg GPR_64[] = { // 64-bit registers. 3835 PPC::X3, PPC::X4, PPC::X5, PPC::X6, 3836 PPC::X7, PPC::X8, PPC::X9, PPC::X10, 3837 }; 3838 static const MCPhysReg VR[] = { 3839 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, 3840 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 3841 }; 3842 3843 const unsigned Num_GPR_Regs = array_lengthof(GPR_32); 3844 const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; 3845 const unsigned Num_VR_Regs = array_lengthof( VR); 3846 3847 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; 3848 3849 const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; 3850 3851 // In 32-bit non-varargs functions, the stack space for vectors is after the 3852 // stack space for non-vectors. We do not use this space unless we have 3853 // too many vectors to fit in registers, something that only occurs in 3854 // constructed examples:), but we have to walk the arglist to figure 3855 // that out...for the pathological case, compute VecArgOffset as the 3856 // start of the vector parameter area. Computing VecArgOffset is the 3857 // entire point of the following loop. 3858 unsigned VecArgOffset = ArgOffset; 3859 if (!isVarArg && !isPPC64) { 3860 for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; 3861 ++ArgNo) { 3862 EVT ObjectVT = Ins[ArgNo].VT; 3863 ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; 3864 3865 if (Flags.isByVal()) { 3866 // ObjSize is the true size, ArgSize rounded up to multiple of regs. 3867 unsigned ObjSize = Flags.getByValSize(); 3868 unsigned ArgSize = 3869 ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 3870 VecArgOffset += ArgSize; 3871 continue; 3872 } 3873 3874 switch(ObjectVT.getSimpleVT().SimpleTy) { 3875 default: llvm_unreachable("Unhandled argument type!"); 3876 case MVT::i1: 3877 case MVT::i32: 3878 case MVT::f32: 3879 VecArgOffset += 4; 3880 break; 3881 case MVT::i64: // PPC64 3882 case MVT::f64: 3883 // FIXME: We are guaranteed to be !isPPC64 at this point. 3884 // Does MVT::i64 apply? 3885 VecArgOffset += 8; 3886 break; 3887 case MVT::v4f32: 3888 case MVT::v4i32: 3889 case MVT::v8i16: 3890 case MVT::v16i8: 3891 // Nothing to do, we're only looking at Nonvector args here. 3892 break; 3893 } 3894 } 3895 } 3896 // We've found where the vector parameter area in memory is. Skip the 3897 // first 12 parameters; these don't use that memory. 3898 VecArgOffset = ((VecArgOffset+15)/16)*16; 3899 VecArgOffset += 12*16; 3900 3901 // Add DAG nodes to load the arguments or copy them out of registers. On 3902 // entry to a function on PPC, the arguments start after the linkage area, 3903 // although the first ones are often in registers. 
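  // Rough example of the VecArgOffset walk above (32-bit, non-vararg): two
  // ints and one double of non-vector arguments give
  // VecArgOffset = LinkageSize + 4 + 4 + 8, which is then rounded up to a
  // 16-byte boundary and bumped past the 12 * 16 bytes notionally covered by
  // v2..v13 before any vector parameter spills to memory.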
3904 3905 SmallVector<SDValue, 8> MemOps; 3906 unsigned nAltivecParamsAtEnd = 0; 3907 Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); 3908 unsigned CurArgIdx = 0; 3909 for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { 3910 SDValue ArgVal; 3911 bool needsLoad = false; 3912 EVT ObjectVT = Ins[ArgNo].VT; 3913 unsigned ObjSize = ObjectVT.getSizeInBits()/8; 3914 unsigned ArgSize = ObjSize; 3915 ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; 3916 if (Ins[ArgNo].isOrigArg()) { 3917 std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); 3918 CurArgIdx = Ins[ArgNo].getOrigArgIndex(); 3919 } 3920 unsigned CurArgOffset = ArgOffset; 3921 3922 // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary. 3923 if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 || 3924 ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) { 3925 if (isVarArg || isPPC64) { 3926 MinReservedArea = ((MinReservedArea+15)/16)*16; 3927 MinReservedArea += CalculateStackSlotSize(ObjectVT, 3928 Flags, 3929 PtrByteSize); 3930 } else nAltivecParamsAtEnd++; 3931 } else 3932 // Calculate min reserved area. 3933 MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT, 3934 Flags, 3935 PtrByteSize); 3936 3937 // FIXME the codegen can be much improved in some cases. 3938 // We do not have to keep everything in memory. 3939 if (Flags.isByVal()) { 3940 assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); 3941 3942 // ObjSize is the true size, ArgSize rounded up to multiple of registers. 3943 ObjSize = Flags.getByValSize(); 3944 ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 3945 // Objects of size 1 and 2 are right justified, everything else is 3946 // left justified. This means the memory address is adjusted forwards. 3947 if (ObjSize==1 || ObjSize==2) { 3948 CurArgOffset = CurArgOffset + (4 - ObjSize); 3949 } 3950 // The value of the object is its address. 3951 int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, false, true); 3952 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3953 InVals.push_back(FIN); 3954 if (ObjSize==1 || ObjSize==2) { 3955 if (GPR_idx != Num_GPR_Regs) { 3956 unsigned VReg; 3957 if (isPPC64) 3958 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 3959 else 3960 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 3961 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 3962 EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16; 3963 SDValue Store = 3964 DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, 3965 MachinePointerInfo(&*FuncArg), ObjType); 3966 MemOps.push_back(Store); 3967 ++GPR_idx; 3968 } 3969 3970 ArgOffset += PtrByteSize; 3971 3972 continue; 3973 } 3974 for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { 3975 // Store whatever pieces of the object are in registers 3976 // to memory. ArgOffset will be the address of the beginning 3977 // of the object. 
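        // e.g. (sketch) a 12-byte byval on 32-bit Darwin iterates three times
        // here, spilling up to three GPRs into its stack image at ArgOffset,
        // ArgOffset + 4 and ArgOffset + 8.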
3978 if (GPR_idx != Num_GPR_Regs) { 3979 unsigned VReg; 3980 if (isPPC64) 3981 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 3982 else 3983 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 3984 int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true); 3985 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3986 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 3987 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, 3988 MachinePointerInfo(&*FuncArg, j)); 3989 MemOps.push_back(Store); 3990 ++GPR_idx; 3991 ArgOffset += PtrByteSize; 3992 } else { 3993 ArgOffset += ArgSize - (ArgOffset-CurArgOffset); 3994 break; 3995 } 3996 } 3997 continue; 3998 } 3999 4000 switch (ObjectVT.getSimpleVT().SimpleTy) { 4001 default: llvm_unreachable("Unhandled argument type!"); 4002 case MVT::i1: 4003 case MVT::i32: 4004 if (!isPPC64) { 4005 if (GPR_idx != Num_GPR_Regs) { 4006 unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 4007 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); 4008 4009 if (ObjectVT == MVT::i1) 4010 ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal); 4011 4012 ++GPR_idx; 4013 } else { 4014 needsLoad = true; 4015 ArgSize = PtrByteSize; 4016 } 4017 // All int arguments reserve stack space in the Darwin ABI. 4018 ArgOffset += PtrByteSize; 4019 break; 4020 } 4021 LLVM_FALLTHROUGH; 4022 case MVT::i64: // PPC64 4023 if (GPR_idx != Num_GPR_Regs) { 4024 unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 4025 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 4026 4027 if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) 4028 // PPC64 passes i8, i16, and i32 values in i64 registers. Promote 4029 // value to MVT::i64 and then truncate to the correct register size. 4030 ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); 4031 4032 ++GPR_idx; 4033 } else { 4034 needsLoad = true; 4035 ArgSize = PtrByteSize; 4036 } 4037 // All int arguments reserve stack space in the Darwin ABI. 4038 ArgOffset += 8; 4039 break; 4040 4041 case MVT::f32: 4042 case MVT::f64: 4043 // Every 4 bytes of argument space consumes one of the GPRs available for 4044 // argument passing. 4045 if (GPR_idx != Num_GPR_Regs) { 4046 ++GPR_idx; 4047 if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64) 4048 ++GPR_idx; 4049 } 4050 if (FPR_idx != Num_FPR_Regs) { 4051 unsigned VReg; 4052 4053 if (ObjectVT == MVT::f32) 4054 VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass); 4055 else 4056 VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass); 4057 4058 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 4059 ++FPR_idx; 4060 } else { 4061 needsLoad = true; 4062 } 4063 4064 // All FP arguments reserve stack space in the Darwin ABI. 4065 ArgOffset += isPPC64 ? 8 : ObjSize; 4066 break; 4067 case MVT::v4f32: 4068 case MVT::v4i32: 4069 case MVT::v8i16: 4070 case MVT::v16i8: 4071 // Note that vector arguments in registers don't reserve stack space, 4072 // except in varargs functions. 4073 if (VR_idx != Num_VR_Regs) { 4074 unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); 4075 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 4076 if (isVarArg) { 4077 while ((ArgOffset % 16) != 0) { 4078 ArgOffset += PtrByteSize; 4079 if (GPR_idx != Num_GPR_Regs) 4080 GPR_idx++; 4081 } 4082 ArgOffset += 16; 4083 GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64? 4084 } 4085 ++VR_idx; 4086 } else { 4087 if (!isVarArg && !isPPC64) { 4088 // Vectors go after all the nonvectors. 
4089 CurArgOffset = VecArgOffset; 4090 VecArgOffset += 16; 4091 } else { 4092 // Vectors are aligned. 4093 ArgOffset = ((ArgOffset+15)/16)*16; 4094 CurArgOffset = ArgOffset; 4095 ArgOffset += 16; 4096 } 4097 needsLoad = true; 4098 } 4099 break; 4100 } 4101 4102 // We need to load the argument to a virtual register if we determined above 4103 // that we ran out of physical registers of the appropriate type. 4104 if (needsLoad) { 4105 int FI = MFI.CreateFixedObject(ObjSize, 4106 CurArgOffset + (ArgSize - ObjSize), 4107 isImmutable); 4108 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4109 ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo()); 4110 } 4111 4112 InVals.push_back(ArgVal); 4113 } 4114 4115 // Allow for Altivec parameters at the end, if needed. 4116 if (nAltivecParamsAtEnd) { 4117 MinReservedArea = ((MinReservedArea+15)/16)*16; 4118 MinReservedArea += 16*nAltivecParamsAtEnd; 4119 } 4120 4121 // Area that is at least reserved in the caller of this function. 4122 MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize); 4123 4124 // Set the size that is at least reserved in caller of this function. Tail 4125 // call optimized functions' reserved stack space needs to be aligned so that 4126 // taking the difference between two stack areas will result in an aligned 4127 // stack. 4128 MinReservedArea = 4129 EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); 4130 FuncInfo->setMinReservedArea(MinReservedArea); 4131 4132 // If the function takes variable number of arguments, make a frame index for 4133 // the start of the first vararg value... for expansion of llvm.va_start. 4134 if (isVarArg) { 4135 int Depth = ArgOffset; 4136 4137 FuncInfo->setVarArgsFrameIndex( 4138 MFI.CreateFixedObject(PtrVT.getSizeInBits()/8, 4139 Depth, true)); 4140 SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 4141 4142 // If this function is vararg, store any remaining integer argument regs 4143 // to their spots on the stack so that they may be loaded by dereferencing 4144 // the result of va_next. 4145 for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) { 4146 unsigned VReg; 4147 4148 if (isPPC64) 4149 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 4150 else 4151 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 4152 4153 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 4154 SDValue Store = 4155 DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); 4156 MemOps.push_back(Store); 4157 // Increment the address by four for the next argument to store 4158 SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT); 4159 FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); 4160 } 4161 } 4162 4163 if (!MemOps.empty()) 4164 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 4165 4166 return Chain; 4167 } 4168 4169 /// CalculateTailCallSPDiff - Get the amount the stack pointer has to be 4170 /// adjusted to accommodate the arguments for the tailcall. 4171 static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall, 4172 unsigned ParamSize) { 4173 4174 if (!isTailCall) return 0; 4175 4176 PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>(); 4177 unsigned CallerMinReservedArea = FI->getMinReservedArea(); 4178 int SPDiff = (int)CallerMinReservedArea - (int)ParamSize; 4179 // Remember only if the new adjustement is bigger. 
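  // Numeric sketch (invented values): if the caller reserved 112 bytes and
  // this tail call needs 144 bytes of argument space, SPDiff is
  // 112 - 144 = -32, i.e. the stack must grow by 32 bytes before the jump;
  // only the most negative SPDiff seen so far is kept below.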
4180 if (SPDiff < FI->getTailCallSPDelta()) 4181 FI->setTailCallSPDelta(SPDiff); 4182 4183 return SPDiff; 4184 } 4185 4186 static bool isFunctionGlobalAddress(SDValue Callee); 4187 4188 static bool 4189 resideInSameSection(const Function *Caller, SDValue Callee, 4190 const TargetMachine &TM) { 4191 // If !G, Callee can be an external symbol. 4192 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 4193 if (!G) 4194 return false; 4195 4196 const GlobalValue *GV = G->getGlobal(); 4197 if (!GV->isStrongDefinitionForLinker()) 4198 return false; 4199 4200 // Any explicitly-specified sections and section prefixes must also match. 4201 // Also, if we're using -ffunction-sections, then each function is always in 4202 // a different section (the same is true for COMDAT functions). 4203 if (TM.getFunctionSections() || GV->hasComdat() || Caller->hasComdat() || 4204 GV->getSection() != Caller->getSection()) 4205 return false; 4206 if (const auto *F = dyn_cast<Function>(GV)) { 4207 if (F->getSectionPrefix() != Caller->getSectionPrefix()) 4208 return false; 4209 } 4210 4211 // If the callee might be interposed, then we can't assume the ultimate call 4212 // target will be in the same section. Even in cases where we can assume that 4213 // interposition won't happen, in any case where the linker might insert a 4214 // stub to allow for interposition, we must generate code as though 4215 // interposition might occur. To understand why this matters, consider a 4216 // situation where: a -> b -> c where the arrows indicate calls. b and c are 4217 // in the same section, but a is in a different module (i.e. has a different 4218 // TOC base pointer). If the linker allows for interposition between b and c, 4219 // then it will generate a stub for the call edge between b and c which will 4220 // save the TOC pointer into the designated stack slot allocated by b. If we 4221 // return true here, and therefore allow a tail call between b and c, that 4222 // stack slot won't exist and the b -> c stub will end up saving b's TOC base 4223 // pointer into the stack slot allocated by a (where the a -> b stub saved 4224 // a's TOC base pointer). If we're not considering a tail call, but rather, 4225 // whether a nop is needed after the call instruction in b, because the linker 4226 // will insert a stub, it might complain about a missing nop if we omit it 4227 // (although many don't complain in this case).
4228 if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV)) 4229 return false; 4230 4231 return true; 4232 } 4233 4234 static bool 4235 needStackSlotPassParameters(const PPCSubtarget &Subtarget, 4236 const SmallVectorImpl<ISD::OutputArg> &Outs) { 4237 assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64()); 4238 4239 const unsigned PtrByteSize = 8; 4240 const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); 4241 4242 static const MCPhysReg GPR[] = { 4243 PPC::X3, PPC::X4, PPC::X5, PPC::X6, 4244 PPC::X7, PPC::X8, PPC::X9, PPC::X10, 4245 }; 4246 static const MCPhysReg VR[] = { 4247 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, 4248 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 4249 }; 4250 4251 const unsigned NumGPRs = array_lengthof(GPR); 4252 const unsigned NumFPRs = 13; 4253 const unsigned NumVRs = array_lengthof(VR); 4254 const unsigned ParamAreaSize = NumGPRs * PtrByteSize; 4255 4256 unsigned NumBytes = LinkageSize; 4257 unsigned AvailableFPRs = NumFPRs; 4258 unsigned AvailableVRs = NumVRs; 4259 4260 for (const ISD::OutputArg& Param : Outs) { 4261 if (Param.Flags.isNest()) continue; 4262 4263 if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, 4264 PtrByteSize, LinkageSize, ParamAreaSize, 4265 NumBytes, AvailableFPRs, AvailableVRs, 4266 Subtarget.hasQPX())) 4267 return true; 4268 } 4269 return false; 4270 } 4271 4272 static bool 4273 hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) { 4274 if (CS->arg_size() != CallerFn->arg_size()) 4275 return false; 4276 4277 ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin(); 4278 ImmutableCallSite::arg_iterator CalleeArgEnd = CS->arg_end(); 4279 Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin(); 4280 4281 for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) { 4282 const Value* CalleeArg = *CalleeArgIter; 4283 const Value* CallerArg = &(*CallerArgIter); 4284 if (CalleeArg == CallerArg) 4285 continue; 4286 4287 // e.g. @caller([4 x i64] %a, [4 x i64] %b) { 4288 // tail call @callee([4 x i64] undef, [4 x i64] %b) 4289 // } 4290 // 1st argument of callee is undef and has the same type as caller. 4291 if (CalleeArg->getType() == CallerArg->getType() && 4292 isa<UndefValue>(CalleeArg)) 4293 continue; 4294 4295 return false; 4296 } 4297 4298 return true; 4299 } 4300 4301 bool 4302 PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( 4303 SDValue Callee, 4304 CallingConv::ID CalleeCC, 4305 ImmutableCallSite *CS, 4306 bool isVarArg, 4307 const SmallVectorImpl<ISD::OutputArg> &Outs, 4308 const SmallVectorImpl<ISD::InputArg> &Ins, 4309 SelectionDAG& DAG) const { 4310 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt; 4311 4312 if (DisableSCO && !TailCallOpt) return false; 4313 4314 // Variadic argument functions are not supported. 4315 if (isVarArg) return false; 4316 4317 MachineFunction &MF = DAG.getMachineFunction(); 4318 CallingConv::ID CallerCC = MF.getFunction()->getCallingConv(); 4319 4320 // Tail or Sibling call optimization (TCO/SCO) needs callee and caller has 4321 // the same calling convention 4322 if (CallerCC != CalleeCC) return false; 4323 4324 // SCO support C calling convention 4325 if (CalleeCC != CallingConv::Fast && CalleeCC != CallingConv::C) 4326 return false; 4327 4328 // Caller contains any byval parameter is not supported. 4329 if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); })) 4330 return false; 4331 4332 // Callee contains any byval parameter is not supported, too. 
4333 // Note: This is a quick workaround, because in some cases, e.g. 4334 // caller's stack size > callee's stack size, we are still able to apply 4335 // sibling call optimization. See: https://reviews.llvm.org/D23441#513574 4336 if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); })) 4337 return false; 4338 4339 // No TCO/SCO on indirect calls because the caller has to restore its TOC 4340 if (!isFunctionGlobalAddress(Callee) && 4341 !isa<ExternalSymbolSDNode>(Callee)) 4342 return false; 4343 4344 // Check if the callee resides in the same section, because for now the PPC64 SVR4 4345 // ABI (ELFv1/ELFv2) doesn't allow tail calls to a symbol that resides in another 4346 // section. 4347 // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977 4348 if (!resideInSameSection(MF.getFunction(), Callee, getTargetMachine())) 4349 return false; 4350 4351 // TCO allows altering callee ABI, so we don't have to check further. 4352 if (CalleeCC == CallingConv::Fast && TailCallOpt) 4353 return true; 4354 4355 if (DisableSCO) return false; 4356 4357 // If the callee uses the same argument list that the caller is using, then we can 4358 // apply SCO in this case. If not, we need to check whether the callee needs 4359 // a stack slot for passing arguments. 4360 if (!hasSameArgumentList(MF.getFunction(), CS) && 4361 needStackSlotPassParameters(Subtarget, Outs)) { 4362 return false; 4363 } 4364 4365 return true; 4366 } 4367 4368 /// IsEligibleForTailCallOptimization - Check whether the call is eligible 4369 /// for tail call optimization. Targets which want to do tail call 4370 /// optimization should implement this function. 4371 bool 4372 PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 4373 CallingConv::ID CalleeCC, 4374 bool isVarArg, 4375 const SmallVectorImpl<ISD::InputArg> &Ins, 4376 SelectionDAG& DAG) const { 4377 if (!getTargetMachine().Options.GuaranteedTailCallOpt) 4378 return false; 4379 4380 // Variable argument functions are not supported. 4381 if (isVarArg) 4382 return false; 4383 4384 MachineFunction &MF = DAG.getMachineFunction(); 4385 CallingConv::ID CallerCC = MF.getFunction()->getCallingConv(); 4386 if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) { 4387 // Functions containing by val parameters are not supported. 4388 for (unsigned i = 0; i != Ins.size(); i++) { 4389 ISD::ArgFlagsTy Flags = Ins[i].Flags; 4390 if (Flags.isByVal()) return false; 4391 } 4392 4393 // Non-PIC/GOT tail calls are supported. 4394 if (getTargetMachine().getRelocationModel() != Reloc::PIC_) 4395 return true; 4396 4397 // At the moment we can only do local tail calls (in same module, hidden 4398 // or protected) if we are generating PIC. 4399 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 4400 return G->getGlobal()->hasHiddenVisibility() 4401 || G->getGlobal()->hasProtectedVisibility(); 4402 } 4403 4404 return false; 4405 } 4406 4407 /// isBLACompatibleAddress - Return the immediate to use if the specified 4408 /// 32-bit value is representable in the immediate field of a BxA instruction. 4409 static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) { 4410 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 4411 if (!C) return nullptr; 4412 4413 int Addr = C->getZExtValue(); 4414 if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero. 4415 SignExtend32<26>(Addr) != Addr) 4416 return nullptr; // Top 6 bits have to be sext of immediate.
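  // For illustration: 0x02000000 has bit 25 set, so SignExtend32<26> changes
  // it and the address is rejected above, while 0x01FFFFFC is 4-byte aligned,
  // survives the sign-extension check, and is encoded below as the word
  // offset 0x01FFFFFC >> 2.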
4417 4418 return DAG 4419 .getConstant( 4420 (int)C->getZExtValue() >> 2, SDLoc(Op), 4421 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout())) 4422 .getNode(); 4423 } 4424 4425 namespace { 4426 4427 struct TailCallArgumentInfo { 4428 SDValue Arg; 4429 SDValue FrameIdxOp; 4430 int FrameIdx = 0; 4431 4432 TailCallArgumentInfo() = default; 4433 }; 4434 4435 } // end anonymous namespace 4436 4437 /// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot. 4438 static void StoreTailCallArgumentsToStackSlot( 4439 SelectionDAG &DAG, SDValue Chain, 4440 const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs, 4441 SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) { 4442 for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) { 4443 SDValue Arg = TailCallArgs[i].Arg; 4444 SDValue FIN = TailCallArgs[i].FrameIdxOp; 4445 int FI = TailCallArgs[i].FrameIdx; 4446 // Store relative to framepointer. 4447 MemOpChains.push_back(DAG.getStore( 4448 Chain, dl, Arg, FIN, 4449 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); 4450 } 4451 } 4452 4453 /// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to 4454 /// the appropriate stack slot for the tail call optimized function call. 4455 static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain, 4456 SDValue OldRetAddr, SDValue OldFP, 4457 int SPDiff, const SDLoc &dl) { 4458 if (SPDiff) { 4459 // Calculate the new stack slot for the return address. 4460 MachineFunction &MF = DAG.getMachineFunction(); 4461 const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); 4462 const PPCFrameLowering *FL = Subtarget.getFrameLowering(); 4463 bool isPPC64 = Subtarget.isPPC64(); 4464 int SlotSize = isPPC64 ? 8 : 4; 4465 int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset(); 4466 int NewRetAddr = MF.getFrameInfo().CreateFixedObject(SlotSize, 4467 NewRetAddrLoc, true); 4468 EVT VT = isPPC64 ? MVT::i64 : MVT::i32; 4469 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT); 4470 Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx, 4471 MachinePointerInfo::getFixedStack(MF, NewRetAddr)); 4472 4473 // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack 4474 // slot as the FP is never overwritten. 4475 if (Subtarget.isDarwinABI()) { 4476 int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset(); 4477 int NewFPIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, NewFPLoc, 4478 true); 4479 SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT); 4480 Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx, 4481 MachinePointerInfo::getFixedStack( 4482 DAG.getMachineFunction(), NewFPIdx)); 4483 } 4484 } 4485 return Chain; 4486 } 4487 4488 /// CalculateTailCallArgDest - Remember Argument for later processing. Calculate 4489 /// the position of the argument. 4490 static void 4491 CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64, 4492 SDValue Arg, int SPDiff, unsigned ArgOffset, 4493 SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) { 4494 int Offset = ArgOffset + SPDiff; 4495 uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8; 4496 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); 4497 EVT VT = isPPC64 ? 
MVT::i64 : MVT::i32; 4498 SDValue FIN = DAG.getFrameIndex(FI, VT); 4499 TailCallArgumentInfo Info; 4500 Info.Arg = Arg; 4501 Info.FrameIdxOp = FIN; 4502 Info.FrameIdx = FI; 4503 TailCallArguments.push_back(Info); 4504 } 4505 4506 /// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address 4507 /// stack slot. Returns the chain as result and the loaded frame pointers in 4508 /// LROpOut/FPOpout. Used when tail calling. 4509 SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr( 4510 SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut, 4511 SDValue &FPOpOut, const SDLoc &dl) const { 4512 if (SPDiff) { 4513 // Load the LR and FP stack slot for later adjusting. 4514 EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; 4515 LROpOut = getReturnAddrFrameIndex(DAG); 4516 LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo()); 4517 Chain = SDValue(LROpOut.getNode(), 1); 4518 4519 // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack 4520 // slot as the FP is never overwritten. 4521 if (Subtarget.isDarwinABI()) { 4522 FPOpOut = getFramePointerFrameIndex(DAG); 4523 FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo()); 4524 Chain = SDValue(FPOpOut.getNode(), 1); 4525 } 4526 } 4527 return Chain; 4528 } 4529 4530 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 4531 /// by "Src" to address "Dst" of size "Size". Alignment information is 4532 /// specified by the specific parameter attribute. The copy will be passed as 4533 /// a byval function parameter. 4534 /// Sometimes what we are copying is the end of a larger object, the part that 4535 /// does not fit in registers. 4536 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, 4537 SDValue Chain, ISD::ArgFlagsTy Flags, 4538 SelectionDAG &DAG, const SDLoc &dl) { 4539 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); 4540 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 4541 false, false, false, MachinePointerInfo(), 4542 MachinePointerInfo()); 4543 } 4544 4545 /// LowerMemOpCallTo - Store the argument to the stack or remember it in case of 4546 /// tail calls. 4547 static void LowerMemOpCallTo( 4548 SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg, 4549 SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64, 4550 bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains, 4551 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) { 4552 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 4553 if (!isTailCall) { 4554 if (isVector) { 4555 SDValue StackPtr; 4556 if (isPPC64) 4557 StackPtr = DAG.getRegister(PPC::X1, MVT::i64); 4558 else 4559 StackPtr = DAG.getRegister(PPC::R1, MVT::i32); 4560 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, 4561 DAG.getConstant(ArgOffset, dl, PtrVT)); 4562 } 4563 MemOpChains.push_back( 4564 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo())); 4565 // Calculate and remember argument location. 
4566 } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset, 4567 TailCallArguments); 4568 } 4569 4570 static void 4571 PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain, 4572 const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp, 4573 SDValue FPOp, 4574 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) { 4575 // Emit a sequence of copyto/copyfrom virtual registers for arguments that 4576 // might overwrite each other in case of tail call optimization. 4577 SmallVector<SDValue, 8> MemOpChains2; 4578 // Do not flag preceding copytoreg stuff together with the following stuff. 4579 InFlag = SDValue(); 4580 StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments, 4581 MemOpChains2, dl); 4582 if (!MemOpChains2.empty()) 4583 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); 4584 4585 // Store the return address to the appropriate stack slot. 4586 Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl); 4587 4588 // Emit callseq_end just before tailcall node. 4589 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), 4590 DAG.getIntPtrConstant(0, dl, true), InFlag, dl); 4591 InFlag = Chain.getValue(1); 4592 } 4593 4594 // Is this global address that of a function that can be called by name? (as 4595 // opposed to something that must hold a descriptor for an indirect call). 4596 static bool isFunctionGlobalAddress(SDValue Callee) { 4597 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 4598 if (Callee.getOpcode() == ISD::GlobalTLSAddress || 4599 Callee.getOpcode() == ISD::TargetGlobalTLSAddress) 4600 return false; 4601 4602 return G->getGlobal()->getValueType()->isFunctionTy(); 4603 } 4604 4605 return false; 4606 } 4607 4608 static unsigned 4609 PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, 4610 SDValue CallSeqStart, const SDLoc &dl, int SPDiff, bool isTailCall, 4611 bool isPatchPoint, bool hasNest, 4612 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, 4613 SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys, 4614 ImmutableCallSite *CS, const PPCSubtarget &Subtarget) { 4615 bool isPPC64 = Subtarget.isPPC64(); 4616 bool isSVR4ABI = Subtarget.isSVR4ABI(); 4617 bool isELFv2ABI = Subtarget.isELFv2ABI(); 4618 4619 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 4620 NodeTys.push_back(MVT::Other); // Returns a chain 4621 NodeTys.push_back(MVT::Glue); // Returns a flag for retval copy to use. 4622 4623 unsigned CallOpc = PPCISD::CALL; 4624 4625 bool needIndirectCall = true; 4626 if (!isSVR4ABI || !isPPC64) 4627 if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) { 4628 // If this is an absolute destination address, use the munged value. 4629 Callee = SDValue(Dest, 0); 4630 needIndirectCall = false; 4631 } 4632 4633 // PC-relative references to external symbols should go through $stub, unless 4634 // we're building with the leopard linker or later, which automatically 4635 // synthesizes these stubs. 
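  // On 32-bit ELF targets the code below sets UsePlt for callees that cannot
  // be assumed DSO-local; the PPCII::MO_PLT flag then routes the reference
  // through the procedure linkage table. As a rough illustration (assumed,
  // not part of the original comments), a PIC call to an external function
  // ends up as something like
  //   bl puts@plt
  // whereas a DSO-local callee keeps a plain "bl" to the symbol.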
4636 const TargetMachine &TM = DAG.getTarget(); 4637 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent(); 4638 const GlobalValue *GV = nullptr; 4639 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) 4640 GV = G->getGlobal(); 4641 bool Local = TM.shouldAssumeDSOLocal(*Mod, GV); 4642 bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64; 4643 4644 if (isFunctionGlobalAddress(Callee)) { 4645 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee); 4646 // A call to a TLS address is actually an indirect call to a 4647 // thread-specific pointer. 4648 unsigned OpFlags = 0; 4649 if (UsePlt) 4650 OpFlags = PPCII::MO_PLT; 4651 4652 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, 4653 // every direct call is) turn it into a TargetGlobalAddress / 4654 // TargetExternalSymbol node so that legalize doesn't hack it. 4655 Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, 4656 Callee.getValueType(), 0, OpFlags); 4657 needIndirectCall = false; 4658 } 4659 4660 if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 4661 unsigned char OpFlags = 0; 4662 4663 if (UsePlt) 4664 OpFlags = PPCII::MO_PLT; 4665 4666 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(), 4667 OpFlags); 4668 needIndirectCall = false; 4669 } 4670 4671 if (isPatchPoint) { 4672 // We'll form an invalid direct call when lowering a patchpoint; the full 4673 // sequence for an indirect call is complicated, and many of the 4674 // instructions introduced might have side effects (and, thus, can't be 4675 // removed later). The call itself will be removed as soon as the 4676 // argument/return lowering is complete, so the fact that it has the wrong 4677 // kind of operands should not really matter. 4678 needIndirectCall = false; 4679 } 4680 4681 if (needIndirectCall) { 4682 // Otherwise, this is an indirect call. We have to use a MTCTR/BCTRL pair 4683 // to do the call, we can't use PPCISD::CALL. 4684 SDValue MTCTROps[] = {Chain, Callee, InFlag}; 4685 4686 if (isSVR4ABI && isPPC64 && !isELFv2ABI) { 4687 // Function pointers in the 64-bit SVR4 ABI do not point to the function 4688 // entry point, but to the function descriptor (the function entry point 4689 // address is part of the function descriptor though). 4690 // The function descriptor is a three doubleword structure with the 4691 // following fields: function entry point, TOC base address and 4692 // environment pointer. 4693 // Thus for a call through a function pointer, the following actions need 4694 // to be performed: 4695 // 1. Save the TOC of the caller in the TOC save area of its stack 4696 // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()). 4697 // 2. Load the address of the function entry point from the function 4698 // descriptor. 4699 // 3. Load the TOC of the callee from the function descriptor into r2. 4700 // 4. Load the environment pointer from the function descriptor into 4701 // r11. 4702 // 5. Branch to the function entry point address. 4703 // 6. On return of the callee, the TOC of the caller needs to be 4704 // restored (this is done in FinishCall()). 4705 // 4706 // The loads are scheduled at the beginning of the call sequence, and the 4707 // register copies are flagged together to ensure that no other 4708 // operations can be scheduled in between. E.g. 
without flagging the 4709 // copies together, a TOC access in the caller could be scheduled between 4710 // the assignment of the callee TOC and the branch to the callee, which 4711 // results in the TOC access going through the TOC of the callee instead 4712 // of going through the TOC of the caller, which leads to incorrect code. 4713 4714 // Load the address of the function entry point from the function 4715 // descriptor. 4716 SDValue LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-1); 4717 if (LDChain.getValueType() == MVT::Glue) 4718 LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-2); 4719 4720 auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors() 4721 ? (MachineMemOperand::MODereferenceable | 4722 MachineMemOperand::MOInvariant) 4723 : MachineMemOperand::MONone; 4724 4725 MachinePointerInfo MPI(CS ? CS->getCalledValue() : nullptr); 4726 SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI, 4727 /* Alignment = */ 8, MMOFlags); 4728 4729 // Load environment pointer into r11. 4730 SDValue PtrOff = DAG.getIntPtrConstant(16, dl); 4731 SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff); 4732 SDValue LoadEnvPtr = 4733 DAG.getLoad(MVT::i64, dl, LDChain, AddPtr, MPI.getWithOffset(16), 4734 /* Alignment = */ 8, MMOFlags); 4735 4736 SDValue TOCOff = DAG.getIntPtrConstant(8, dl); 4737 SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff); 4738 SDValue TOCPtr = 4739 DAG.getLoad(MVT::i64, dl, LDChain, AddTOC, MPI.getWithOffset(8), 4740 /* Alignment = */ 8, MMOFlags); 4741 4742 setUsesTOCBasePtr(DAG); 4743 SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr, 4744 InFlag); 4745 Chain = TOCVal.getValue(0); 4746 InFlag = TOCVal.getValue(1); 4747 4748 // If the function call has an explicit 'nest' parameter, it takes the 4749 // place of the environment pointer. 4750 if (!hasNest) { 4751 SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr, 4752 InFlag); 4753 4754 Chain = EnvVal.getValue(0); 4755 InFlag = EnvVal.getValue(1); 4756 } 4757 4758 MTCTROps[0] = Chain; 4759 MTCTROps[1] = LoadFuncPtr; 4760 MTCTROps[2] = InFlag; 4761 } 4762 4763 Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys, 4764 makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2)); 4765 InFlag = Chain.getValue(1); 4766 4767 NodeTys.clear(); 4768 NodeTys.push_back(MVT::Other); 4769 NodeTys.push_back(MVT::Glue); 4770 Ops.push_back(Chain); 4771 CallOpc = PPCISD::BCTRL; 4772 Callee.setNode(nullptr); 4773 // Add use of X11 (holding environment pointer) 4774 if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest) 4775 Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); 4776 // Add CTR register as callee so a bctr can be emitted later. 4777 if (isTailCall) 4778 Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT)); 4779 } 4780 4781 // If this is a direct call, pass the chain and the callee. 4782 if (Callee.getNode()) { 4783 Ops.push_back(Chain); 4784 Ops.push_back(Callee); 4785 } 4786 // If this is a tail call add stack pointer delta. 4787 if (isTailCall) 4788 Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32)); 4789 4790 // Add argument registers to the end of the list so that they are known live 4791 // into the call. 4792 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 4793 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 4794 RegsToPass[i].second.getValueType())); 4795 4796 // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live 4797 // into the call. 
4798 if (isSVR4ABI && isPPC64 && !isPatchPoint) { 4799 setUsesTOCBasePtr(DAG); 4800 Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); 4801 } 4802 4803 return CallOpc; 4804 } 4805 4806 SDValue PPCTargetLowering::LowerCallResult( 4807 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 4808 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 4809 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 4810 SmallVector<CCValAssign, 16> RVLocs; 4811 CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 4812 *DAG.getContext()); 4813 CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC); 4814 4815 // Copy all of the result registers out of their specified physreg. 4816 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 4817 CCValAssign &VA = RVLocs[i]; 4818 assert(VA.isRegLoc() && "Can only return in registers!"); 4819 4820 SDValue Val = DAG.getCopyFromReg(Chain, dl, 4821 VA.getLocReg(), VA.getLocVT(), InFlag); 4822 Chain = Val.getValue(1); 4823 InFlag = Val.getValue(2); 4824 4825 switch (VA.getLocInfo()) { 4826 default: llvm_unreachable("Unknown loc info!"); 4827 case CCValAssign::Full: break; 4828 case CCValAssign::AExt: 4829 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); 4830 break; 4831 case CCValAssign::ZExt: 4832 Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val, 4833 DAG.getValueType(VA.getValVT())); 4834 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); 4835 break; 4836 case CCValAssign::SExt: 4837 Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val, 4838 DAG.getValueType(VA.getValVT())); 4839 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); 4840 break; 4841 } 4842 4843 InVals.push_back(Val); 4844 } 4845 4846 return Chain; 4847 } 4848 4849 SDValue PPCTargetLowering::FinishCall( 4850 CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg, 4851 bool isPatchPoint, bool hasNest, SelectionDAG &DAG, 4852 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue InFlag, 4853 SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff, 4854 unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins, 4855 SmallVectorImpl<SDValue> &InVals, ImmutableCallSite *CS) const { 4856 std::vector<EVT> NodeTys; 4857 SmallVector<SDValue, 8> Ops; 4858 unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl, 4859 SPDiff, isTailCall, isPatchPoint, hasNest, 4860 RegsToPass, Ops, NodeTys, CS, Subtarget); 4861 4862 // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls 4863 if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) 4864 Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32)); 4865 4866 // When performing tail call optimization the callee pops its arguments off 4867 // the stack. Account for this here so these bytes can be pushed back on in 4868 // PPCFrameLowering::eliminateCallFramePseudoInstr. 4869 int BytesCalleePops = 4870 (CallConv == CallingConv::Fast && 4871 getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0; 4872 4873 // Add a register mask operand representing the call-preserved registers. 4874 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); 4875 const uint32_t *Mask = 4876 TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv); 4877 assert(Mask && "Missing call preserved mask for calling convention"); 4878 Ops.push_back(DAG.getRegisterMask(Mask)); 4879 4880 if (InFlag.getNode()) 4881 Ops.push_back(InFlag); 4882 4883 // Emit tail call. 
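  // By this point Ops carries the chain, the callee (or the CTR register for
  // an indirect call), the stack-adjustment delta for tail calls, the
  // argument registers, the call-preserved register mask and, when present,
  // the glue operand; the call nodes built below consume that list directly.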
  if (isTailCall) {
    assert(((Callee.getOpcode() == ISD::Register &&
             cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
            Callee.getOpcode() == ISD::TargetExternalSymbol ||
            Callee.getOpcode() == ISD::TargetGlobalAddress ||
            isa<ConstantSDNode>(Callee)) &&
    "Expecting a global address, external symbol, absolute value or register");

    DAG.getMachineFunction().getFrameInfo().setHasTailCall();
    return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops);
  }

  // Add a NOP immediately after the branch instruction when using the 64-bit
  // SVR4 ABI. At link time, if caller and callee are in different modules and
  // thus have different TOCs, the call will be replaced with a call to a stub
  // function which saves the current TOC, loads the TOC of the callee, and
  // branches to the callee. The NOP will be replaced with a load instruction
  // which restores the TOC of the caller from the TOC save slot of the current
  // stack frame. If caller and callee belong to the same module (and have the
  // same TOC), the NOP will remain unchanged.

  MachineFunction &MF = DAG.getMachineFunction();
  if (!isTailCall && Subtarget.isSVR4ABI() && Subtarget.isPPC64() &&
      !isPatchPoint) {
    if (CallOpc == PPCISD::BCTRL) {
      // This is a call through a function pointer.
      // Restore the caller's TOC from the save area into R2.
      // See PrepareCall() for more information about calls through function
      // pointers in the 64-bit SVR4 ABI.
      // We are using a target-specific load with r2 hard coded, because the
      // result of a target-independent load would never go directly into r2,
      // since r2 is a reserved register (which prevents the register allocator
      // from allocating it), resulting in an additional register being
      // allocated and an unnecessary move instruction being generated.
      CallOpc = PPCISD::BCTRL_LOAD_TOC;

      EVT PtrVT = getPointerTy(DAG.getDataLayout());
      SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
      unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
      SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
      SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);

      // The address needs to go after the chain input but before the flag (or
      // any other variadic arguments).
      Ops.insert(std::next(Ops.begin()), AddTOC);
    } else if (CallOpc == PPCISD::CALL &&
               !resideInSameSection(MF.getFunction(), Callee, DAG.getTarget())) {
      // Otherwise insert NOP for non-local calls.
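      // Sketch of the resulting code (assumed, not from the original
      // comments): the call is emitted as
      //   bl callee
      //   nop
      // and, when the linker routes the call through a stub, it rewrites the
      // nop into a TOC restore such as "ld 2, 24(1)" (ELFv2) or
      // "ld 2, 40(1)" (ELFv1), matching the TOC save slot in the caller's
      // frame.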
4932 CallOpc = PPCISD::CALL_NOP; 4933 } 4934 } 4935 4936 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); 4937 InFlag = Chain.getValue(1); 4938 4939 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), 4940 DAG.getIntPtrConstant(BytesCalleePops, dl, true), 4941 InFlag, dl); 4942 if (!Ins.empty()) 4943 InFlag = Chain.getValue(1); 4944 4945 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 4946 Ins, dl, DAG, InVals); 4947 } 4948 4949 SDValue 4950 PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 4951 SmallVectorImpl<SDValue> &InVals) const { 4952 SelectionDAG &DAG = CLI.DAG; 4953 SDLoc &dl = CLI.DL; 4954 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 4955 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 4956 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 4957 SDValue Chain = CLI.Chain; 4958 SDValue Callee = CLI.Callee; 4959 bool &isTailCall = CLI.IsTailCall; 4960 CallingConv::ID CallConv = CLI.CallConv; 4961 bool isVarArg = CLI.IsVarArg; 4962 bool isPatchPoint = CLI.IsPatchPoint; 4963 ImmutableCallSite *CS = CLI.CS; 4964 4965 if (isTailCall) { 4966 if (Subtarget.useLongCalls() && !(CS && CS->isMustTailCall())) 4967 isTailCall = false; 4968 else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) 4969 isTailCall = 4970 IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS, 4971 isVarArg, Outs, Ins, DAG); 4972 else 4973 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, 4974 Ins, DAG); 4975 if (isTailCall) { 4976 ++NumTailCalls; 4977 if (!getTargetMachine().Options.GuaranteedTailCallOpt) 4978 ++NumSiblingCalls; 4979 4980 assert(isa<GlobalAddressSDNode>(Callee) && 4981 "Callee should be an llvm::Function object."); 4982 DEBUG( 4983 const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); 4984 const unsigned Width = 80 - strlen("TCO caller: ") 4985 - strlen(", callee linkage: 0, 0"); 4986 dbgs() << "TCO caller: " 4987 << left_justify(DAG.getMachineFunction().getName(), Width) 4988 << ", callee linkage: " 4989 << GV->getVisibility() << ", " << GV->getLinkage() << "\n" 4990 ); 4991 } 4992 } 4993 4994 if (!isTailCall && CS && CS->isMustTailCall()) 4995 report_fatal_error("failed to perform tail call elimination on a call " 4996 "site marked musttail"); 4997 4998 // When long calls (i.e. indirect calls) are always used, calls are always 4999 // made via function pointer. If we have a function name, first translate it 5000 // into a pointer. 
5001 if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Callee) && 5002 !isTailCall) 5003 Callee = LowerGlobalAddress(Callee, DAG); 5004 5005 if (Subtarget.isSVR4ABI()) { 5006 if (Subtarget.isPPC64()) 5007 return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, 5008 isTailCall, isPatchPoint, Outs, OutVals, Ins, 5009 dl, DAG, InVals, CS); 5010 else 5011 return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, 5012 isTailCall, isPatchPoint, Outs, OutVals, Ins, 5013 dl, DAG, InVals, CS); 5014 } 5015 5016 return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg, 5017 isTailCall, isPatchPoint, Outs, OutVals, Ins, 5018 dl, DAG, InVals, CS); 5019 } 5020 5021 SDValue PPCTargetLowering::LowerCall_32SVR4( 5022 SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, 5023 bool isTailCall, bool isPatchPoint, 5024 const SmallVectorImpl<ISD::OutputArg> &Outs, 5025 const SmallVectorImpl<SDValue> &OutVals, 5026 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 5027 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, 5028 ImmutableCallSite *CS) const { 5029 // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description 5030 // of the 32-bit SVR4 ABI stack frame layout. 5031 5032 assert((CallConv == CallingConv::C || 5033 CallConv == CallingConv::Fast) && "Unknown calling convention!"); 5034 5035 unsigned PtrByteSize = 4; 5036 5037 MachineFunction &MF = DAG.getMachineFunction(); 5038 5039 // Mark this function as potentially containing a function that contains a 5040 // tail call. As a consequence the frame pointer will be used for dynamicalloc 5041 // and restoring the callers stack pointer in this functions epilog. This is 5042 // done because by tail calling the called function might overwrite the value 5043 // in this function's (MF) stack pointer stack slot 0(SP). 5044 if (getTargetMachine().Options.GuaranteedTailCallOpt && 5045 CallConv == CallingConv::Fast) 5046 MF.getInfo<PPCFunctionInfo>()->setHasFastCall(); 5047 5048 // Count how many bytes are to be pushed on the stack, including the linkage 5049 // area, parameter list area and the part of the local variable space which 5050 // contains copies of aggregates which are passed by value. 5051 5052 // Assign locations to all of the outgoing arguments. 5053 SmallVector<CCValAssign, 16> ArgLocs; 5054 PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); 5055 5056 // Reserve space for the linkage area on the stack. 5057 CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(), 5058 PtrByteSize); 5059 if (useSoftFloat()) 5060 CCInfo.PreAnalyzeCallOperands(Outs); 5061 5062 if (isVarArg) { 5063 // Handle fixed and variable vector arguments differently. 5064 // Fixed vector arguments go into registers as long as registers are 5065 // available. Variable vector arguments always go into memory. 5066 unsigned NumArgs = Outs.size(); 5067 5068 for (unsigned i = 0; i != NumArgs; ++i) { 5069 MVT ArgVT = Outs[i].VT; 5070 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 5071 bool Result; 5072 5073 if (Outs[i].IsFixed) { 5074 Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, 5075 CCInfo); 5076 } else { 5077 Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full, 5078 ArgFlags, CCInfo); 5079 } 5080 5081 if (Result) { 5082 #ifndef NDEBUG 5083 errs() << "Call operand #" << i << " has unhandled type " 5084 << EVT(ArgVT).getEVTString() << "\n"; 5085 #endif 5086 llvm_unreachable(nullptr); 5087 } 5088 } 5089 } else { 5090 // All arguments are treated the same. 
5091 CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4); 5092 } 5093 CCInfo.clearWasPPCF128(); 5094 5095 // Assign locations to all of the outgoing aggregate by value arguments. 5096 SmallVector<CCValAssign, 16> ByValArgLocs; 5097 CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext()); 5098 5099 // Reserve stack space for the allocations in CCInfo. 5100 CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); 5101 5102 CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal); 5103 5104 // Size of the linkage area, parameter list area and the part of the local 5105 // space variable where copies of aggregates which are passed by value are 5106 // stored. 5107 unsigned NumBytes = CCByValInfo.getNextStackOffset(); 5108 5109 // Calculate by how many bytes the stack has to be adjusted in case of tail 5110 // call optimization. 5111 int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); 5112 5113 // Adjust the stack pointer for the new arguments... 5114 // These operations are automatically eliminated by the prolog/epilog pass 5115 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); 5116 SDValue CallSeqStart = Chain; 5117 5118 // Load the return address and frame pointer so it can be moved somewhere else 5119 // later. 5120 SDValue LROp, FPOp; 5121 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); 5122 5123 // Set up a copy of the stack pointer for use loading and storing any 5124 // arguments that may not fit in the registers available for argument 5125 // passing. 5126 SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32); 5127 5128 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 5129 SmallVector<TailCallArgumentInfo, 8> TailCallArguments; 5130 SmallVector<SDValue, 8> MemOpChains; 5131 5132 bool seenFloatArg = false; 5133 // Walk the register/memloc assignments, inserting copies/loads. 5134 for (unsigned i = 0, j = 0, e = ArgLocs.size(); 5135 i != e; 5136 ++i) { 5137 CCValAssign &VA = ArgLocs[i]; 5138 SDValue Arg = OutVals[i]; 5139 ISD::ArgFlagsTy Flags = Outs[i].Flags; 5140 5141 if (Flags.isByVal()) { 5142 // Argument is an aggregate which is passed by value, thus we need to 5143 // create a copy of it in the local variable space of the current stack 5144 // frame (which is the stack frame of the caller) and pass the address of 5145 // this copy to the callee. 5146 assert((j < ByValArgLocs.size()) && "Index out of bounds!"); 5147 CCValAssign &ByValVA = ByValArgLocs[j++]; 5148 assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!"); 5149 5150 // Memory reserved in the local variable space of the callers stack frame. 5151 unsigned LocMemOffset = ByValVA.getLocMemOffset(); 5152 5153 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 5154 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), 5155 StackPtr, PtrOff); 5156 5157 // Create a copy of the argument in the local area of the current 5158 // stack frame. 5159 SDValue MemcpyCall = 5160 CreateCopyOfByValArgument(Arg, PtrOff, 5161 CallSeqStart.getNode()->getOperand(0), 5162 Flags, DAG, dl); 5163 5164 // This must go outside the CALLSEQ_START..END. 
5165 SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0, 5166 SDLoc(MemcpyCall)); 5167 DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), 5168 NewCallSeqStart.getNode()); 5169 Chain = CallSeqStart = NewCallSeqStart; 5170 5171 // Pass the address of the aggregate copy on the stack either in a 5172 // physical register or in the parameter list area of the current stack 5173 // frame to the callee. 5174 Arg = PtrOff; 5175 } 5176 5177 if (VA.isRegLoc()) { 5178 if (Arg.getValueType() == MVT::i1) 5179 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Arg); 5180 5181 seenFloatArg |= VA.getLocVT().isFloatingPoint(); 5182 // Put argument in a physical register. 5183 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 5184 } else { 5185 // Put argument in the parameter list area of the current stack frame. 5186 assert(VA.isMemLoc()); 5187 unsigned LocMemOffset = VA.getLocMemOffset(); 5188 5189 if (!isTailCall) { 5190 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 5191 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), 5192 StackPtr, PtrOff); 5193 5194 MemOpChains.push_back( 5195 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo())); 5196 } else { 5197 // Calculate and remember argument location. 5198 CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset, 5199 TailCallArguments); 5200 } 5201 } 5202 } 5203 5204 if (!MemOpChains.empty()) 5205 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 5206 5207 // Build a sequence of copy-to-reg nodes chained together with token chain 5208 // and flag operands which copy the outgoing args into the appropriate regs. 5209 SDValue InFlag; 5210 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 5211 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 5212 RegsToPass[i].second, InFlag); 5213 InFlag = Chain.getValue(1); 5214 } 5215 5216 // Set CR bit 6 to true if this is a vararg call with floating args passed in 5217 // registers. 5218 if (isVarArg) { 5219 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 5220 SDValue Ops[] = { Chain, InFlag }; 5221 5222 Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, 5223 dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1)); 5224 5225 InFlag = Chain.getValue(1); 5226 } 5227 5228 if (isTailCall) 5229 PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, 5230 TailCallArguments); 5231 5232 return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, 5233 /* unused except on PPC64 ELFv1 */ false, DAG, 5234 RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, 5235 NumBytes, Ins, InVals, CS); 5236 } 5237 5238 // Copy an argument into memory, being careful to do this outside the 5239 // call sequence for the call to which the argument belongs. 5240 SDValue PPCTargetLowering::createMemcpyOutsideCallSeq( 5241 SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags, 5242 SelectionDAG &DAG, const SDLoc &dl) const { 5243 SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff, 5244 CallSeqStart.getNode()->getOperand(0), 5245 Flags, DAG, dl); 5246 // The MEMCPY must go outside the CALLSEQ_START..END. 
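  // A brief rationale sketch (assumption, not stated in the original
  // comments): the memcpy created above may itself be lowered to a call to
  // the memcpy library function, and call sequences cannot nest, so the copy
  // is chained to the node preceding the CALLSEQ_START and a fresh
  // CALLSEQ_START is rebuilt after it below.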
  int64_t FrameSize = CallSeqStart.getConstantOperandVal(1);
  SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0,
                                                 SDLoc(MemcpyCall));
  DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
                         NewCallSeqStart.getNode());
  return NewCallSeqStart;
}

SDValue PPCTargetLowering::LowerCall_64SVR4(
    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
    bool isTailCall, bool isPatchPoint,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    ImmutableCallSite *CS) const {
  bool isELFv2ABI = Subtarget.isELFv2ABI();
  bool isLittleEndian = Subtarget.isLittleEndian();
  unsigned NumOps = Outs.size();
  bool hasNest = false;
  bool IsSibCall = false;

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  unsigned PtrByteSize = 8;

  MachineFunction &MF = DAG.getMachineFunction();

  if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
    IsSibCall = true;

  // Mark this function as potentially containing a function that contains a
  // tail call. As a consequence the frame pointer will be used for dynamic
  // stack allocation and for restoring the caller's stack pointer in this
  // function's epilog. This is done because, by tail calling, the called
  // function might overwrite the value in this function's (MF) stack pointer
  // stack slot 0(SP).
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

  assert(!(CallConv == CallingConv::Fast && isVarArg) &&
         "fastcc not supported on varargs functions");

  // Count how many bytes are to be pushed on the stack, including the linkage
  // area and the parameter passing area. On ELFv1, the linkage area is 48
  // bytes reserved for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
  // area is 32 bytes reserved for [SP][CR][LR][TOC].
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  unsigned NumBytes = LinkageSize;
  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
  unsigned &QFPR_idx = FPR_idx;

  static const MCPhysReg GPR[] = {
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };

  const unsigned NumGPRs = array_lengthof(GPR);
  const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
  const unsigned NumVRs = array_lengthof(VR);
  const unsigned NumQFPRs = NumFPRs;

  // On ELFv2, we can avoid allocating the parameter area if all the arguments
  // can be passed to the callee in registers.
  // For the fast calling convention, there is another check below.
5315 // Note: We should keep consistent with LowerFormalArguments_64SVR4() 5316 bool HasParameterArea = !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast; 5317 if (!HasParameterArea) { 5318 unsigned ParamAreaSize = NumGPRs * PtrByteSize; 5319 unsigned AvailableFPRs = NumFPRs; 5320 unsigned AvailableVRs = NumVRs; 5321 unsigned NumBytesTmp = NumBytes; 5322 for (unsigned i = 0; i != NumOps; ++i) { 5323 if (Outs[i].Flags.isNest()) continue; 5324 if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags, 5325 PtrByteSize, LinkageSize, ParamAreaSize, 5326 NumBytesTmp, AvailableFPRs, AvailableVRs, 5327 Subtarget.hasQPX())) 5328 HasParameterArea = true; 5329 } 5330 } 5331 5332 // When using the fast calling convention, we don't provide backing for 5333 // arguments that will be in registers. 5334 unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0; 5335 5336 // Add up all the space actually used. 5337 for (unsigned i = 0; i != NumOps; ++i) { 5338 ISD::ArgFlagsTy Flags = Outs[i].Flags; 5339 EVT ArgVT = Outs[i].VT; 5340 EVT OrigVT = Outs[i].ArgVT; 5341 5342 if (Flags.isNest()) 5343 continue; 5344 5345 if (CallConv == CallingConv::Fast) { 5346 if (Flags.isByVal()) 5347 NumGPRsUsed += (Flags.getByValSize()+7)/8; 5348 else 5349 switch (ArgVT.getSimpleVT().SimpleTy) { 5350 default: llvm_unreachable("Unexpected ValueType for argument!"); 5351 case MVT::i1: 5352 case MVT::i32: 5353 case MVT::i64: 5354 if (++NumGPRsUsed <= NumGPRs) 5355 continue; 5356 break; 5357 case MVT::v4i32: 5358 case MVT::v8i16: 5359 case MVT::v16i8: 5360 case MVT::v2f64: 5361 case MVT::v2i64: 5362 case MVT::v1i128: 5363 if (++NumVRsUsed <= NumVRs) 5364 continue; 5365 break; 5366 case MVT::v4f32: 5367 // When using QPX, this is handled like a FP register, otherwise, it 5368 // is an Altivec register. 5369 if (Subtarget.hasQPX()) { 5370 if (++NumFPRsUsed <= NumFPRs) 5371 continue; 5372 } else { 5373 if (++NumVRsUsed <= NumVRs) 5374 continue; 5375 } 5376 break; 5377 case MVT::f32: 5378 case MVT::f64: 5379 case MVT::v4f64: // QPX 5380 case MVT::v4i1: // QPX 5381 if (++NumFPRsUsed <= NumFPRs) 5382 continue; 5383 break; 5384 } 5385 } 5386 5387 /* Respect alignment of argument on the stack. */ 5388 unsigned Align = 5389 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); 5390 NumBytes = ((NumBytes + Align - 1) / Align) * Align; 5391 5392 NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); 5393 if (Flags.isInConsecutiveRegsLast()) 5394 NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 5395 } 5396 5397 unsigned NumBytesActuallyUsed = NumBytes; 5398 5399 // In the old ELFv1 ABI, 5400 // the prolog code of the callee may store up to 8 GPR argument registers to 5401 // the stack, allowing va_start to index over them in memory if its varargs. 5402 // Because we cannot tell if this is needed on the caller side, we have to 5403 // conservatively assume that it is needed. As such, make sure we have at 5404 // least enough stack space for the caller to store the 8 GPRs. 5405 // In the ELFv2 ABI, we allocate the parameter area iff a callee 5406 // really requires memory operands, e.g. a vararg function. 5407 if (HasParameterArea) 5408 NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); 5409 else 5410 NumBytes = LinkageSize; 5411 5412 // Tail call needs the stack to be aligned. 
5413 if (getTargetMachine().Options.GuaranteedTailCallOpt && 5414 CallConv == CallingConv::Fast) 5415 NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes); 5416 5417 int SPDiff = 0; 5418 5419 // Calculate by how many bytes the stack has to be adjusted in case of tail 5420 // call optimization. 5421 if (!IsSibCall) 5422 SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); 5423 5424 // To protect arguments on the stack from being clobbered in a tail call, 5425 // force all the loads to happen before doing any other lowering. 5426 if (isTailCall) 5427 Chain = DAG.getStackArgumentTokenFactor(Chain); 5428 5429 // Adjust the stack pointer for the new arguments... 5430 // These operations are automatically eliminated by the prolog/epilog pass 5431 if (!IsSibCall) 5432 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); 5433 SDValue CallSeqStart = Chain; 5434 5435 // Load the return address and frame pointer so it can be move somewhere else 5436 // later. 5437 SDValue LROp, FPOp; 5438 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); 5439 5440 // Set up a copy of the stack pointer for use loading and storing any 5441 // arguments that may not fit in the registers available for argument 5442 // passing. 5443 SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64); 5444 5445 // Figure out which arguments are going to go in registers, and which in 5446 // memory. Also, if this is a vararg function, floating point operations 5447 // must be stored to our stack, and loaded into integer regs as well, if 5448 // any integer regs are available for argument passing. 5449 unsigned ArgOffset = LinkageSize; 5450 5451 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 5452 SmallVector<TailCallArgumentInfo, 8> TailCallArguments; 5453 5454 SmallVector<SDValue, 8> MemOpChains; 5455 for (unsigned i = 0; i != NumOps; ++i) { 5456 SDValue Arg = OutVals[i]; 5457 ISD::ArgFlagsTy Flags = Outs[i].Flags; 5458 EVT ArgVT = Outs[i].VT; 5459 EVT OrigVT = Outs[i].ArgVT; 5460 5461 // PtrOff will be used to store the current argument to the stack if a 5462 // register cannot be found for it. 5463 SDValue PtrOff; 5464 5465 // We re-align the argument offset for each argument, except when using the 5466 // fast calling convention, when we need to make sure we do that only when 5467 // we'll actually use a stack slot. 5468 auto ComputePtrOff = [&]() { 5469 /* Respect alignment of argument on the stack. */ 5470 unsigned Align = 5471 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); 5472 ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; 5473 5474 PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType()); 5475 5476 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); 5477 }; 5478 5479 if (CallConv != CallingConv::Fast) { 5480 ComputePtrOff(); 5481 5482 /* Compute GPR index associated with argument offset. */ 5483 GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; 5484 GPR_idx = std::min(GPR_idx, NumGPRs); 5485 } 5486 5487 // Promote integers to 64-bit values. 5488 if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) { 5489 // FIXME: Should this use ANY_EXTEND if neither sext nor zext? 5490 unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 5491 Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); 5492 } 5493 5494 // FIXME memcpy is used way more than necessary. Correctness first. 5495 // Note: "by value" is code for passing a structure by value, not 5496 // basic types. 
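    // Illustrative example (assumed, not from the original comments): on a
    // big-endian target a 3-byte byval aggregate is passed right-justified in
    // its 8-byte parameter doubleword, i.e. it occupies bytes 5..7 of the
    // slot. That is what the (PtrByteSize - Size) / (8 - Size) pointer
    // adjustments below implement, so that loading the whole slot places the
    // aggregate in the low-order bits of the register.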
5497 if (Flags.isByVal()) { 5498 // Note: Size includes alignment padding, so 5499 // struct x { short a; char b; } 5500 // will have Size = 4. With #pragma pack(1), it will have Size = 3. 5501 // These are the proper values we need for right-justifying the 5502 // aggregate in a parameter register. 5503 unsigned Size = Flags.getByValSize(); 5504 5505 // An empty aggregate parameter takes up no storage and no 5506 // registers. 5507 if (Size == 0) 5508 continue; 5509 5510 if (CallConv == CallingConv::Fast) 5511 ComputePtrOff(); 5512 5513 // All aggregates smaller than 8 bytes must be passed right-justified. 5514 if (Size==1 || Size==2 || Size==4) { 5515 EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32); 5516 if (GPR_idx != NumGPRs) { 5517 SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, 5518 MachinePointerInfo(), VT); 5519 MemOpChains.push_back(Load.getValue(1)); 5520 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 5521 5522 ArgOffset += PtrByteSize; 5523 continue; 5524 } 5525 } 5526 5527 if (GPR_idx == NumGPRs && Size < 8) { 5528 SDValue AddPtr = PtrOff; 5529 if (!isLittleEndian) { 5530 SDValue Const = DAG.getConstant(PtrByteSize - Size, dl, 5531 PtrOff.getValueType()); 5532 AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); 5533 } 5534 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, 5535 CallSeqStart, 5536 Flags, DAG, dl); 5537 ArgOffset += PtrByteSize; 5538 continue; 5539 } 5540 // Copy entire object into memory. There are cases where gcc-generated 5541 // code assumes it is there, even if it could be put entirely into 5542 // registers. (This is not what the doc says.) 5543 5544 // FIXME: The above statement is likely due to a misunderstanding of the 5545 // documents. All arguments must be copied into the parameter area BY 5546 // THE CALLEE in the event that the callee takes the address of any 5547 // formal argument. That has not yet been implemented. However, it is 5548 // reasonable to use the stack area as a staging area for the register 5549 // load. 5550 5551 // Skip this for small aggregates, as we will use the same slot for a 5552 // right-justified copy, below. 5553 if (Size >= 8) 5554 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, 5555 CallSeqStart, 5556 Flags, DAG, dl); 5557 5558 // When a register is available, pass a small aggregate right-justified. 5559 if (Size < 8 && GPR_idx != NumGPRs) { 5560 // The easiest way to get this right-justified in a register 5561 // is to copy the structure into the rightmost portion of a 5562 // local variable slot, then load the whole slot into the 5563 // register. 5564 // FIXME: The memcpy seems to produce pretty awful code for 5565 // small aggregates, particularly for packed ones. 5566 // FIXME: It would be preferable to use the slot in the 5567 // parameter save area instead of a new local variable. 5568 SDValue AddPtr = PtrOff; 5569 if (!isLittleEndian) { 5570 SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType()); 5571 AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); 5572 } 5573 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, 5574 CallSeqStart, 5575 Flags, DAG, dl); 5576 5577 // Load the slot into the register. 5578 SDValue Load = 5579 DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo()); 5580 MemOpChains.push_back(Load.getValue(1)); 5581 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 5582 5583 // Done with this argument. 
5584 ArgOffset += PtrByteSize; 5585 continue; 5586 } 5587 5588 // For aggregates larger than PtrByteSize, copy the pieces of the 5589 // object that fit into registers from the parameter save area. 5590 for (unsigned j=0; j<Size; j+=PtrByteSize) { 5591 SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType()); 5592 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 5593 if (GPR_idx != NumGPRs) { 5594 SDValue Load = 5595 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo()); 5596 MemOpChains.push_back(Load.getValue(1)); 5597 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 5598 ArgOffset += PtrByteSize; 5599 } else { 5600 ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize; 5601 break; 5602 } 5603 } 5604 continue; 5605 } 5606 5607 switch (Arg.getSimpleValueType().SimpleTy) { 5608 default: llvm_unreachable("Unexpected ValueType for argument!"); 5609 case MVT::i1: 5610 case MVT::i32: 5611 case MVT::i64: 5612 if (Flags.isNest()) { 5613 // The 'nest' parameter, if any, is passed in R11. 5614 RegsToPass.push_back(std::make_pair(PPC::X11, Arg)); 5615 hasNest = true; 5616 break; 5617 } 5618 5619 // These can be scalar arguments or elements of an integer array type 5620 // passed directly. Clang may use those instead of "byval" aggregate 5621 // types to avoid forcing arguments to memory unnecessarily. 5622 if (GPR_idx != NumGPRs) { 5623 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); 5624 } else { 5625 if (CallConv == CallingConv::Fast) 5626 ComputePtrOff(); 5627 5628 assert(HasParameterArea && 5629 "Parameter area must exist to pass an argument in memory."); 5630 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 5631 true, isTailCall, false, MemOpChains, 5632 TailCallArguments, dl); 5633 if (CallConv == CallingConv::Fast) 5634 ArgOffset += PtrByteSize; 5635 } 5636 if (CallConv != CallingConv::Fast) 5637 ArgOffset += PtrByteSize; 5638 break; 5639 case MVT::f32: 5640 case MVT::f64: { 5641 // These can be scalar arguments or elements of a float array type 5642 // passed directly. The latter are used to implement ELFv2 homogenous 5643 // float aggregates. 5644 5645 // Named arguments go into FPRs first, and once they overflow, the 5646 // remaining arguments go into GPRs and then the parameter save area. 5647 // Unnamed arguments for vararg functions always go to GPRs and 5648 // then the parameter save area. For now, put all arguments to vararg 5649 // routines always in both locations (FPR *and* GPR or stack slot). 5650 bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs; 5651 bool NeededLoad = false; 5652 5653 // First load the argument into the next available FPR. 5654 if (FPR_idx != NumFPRs) 5655 RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); 5656 5657 // Next, load the argument into GPR or stack slot if needed. 5658 if (!NeedGPROrStack) 5659 ; 5660 else if (GPR_idx != NumGPRs && CallConv != CallingConv::Fast) { 5661 // FIXME: We may want to re-enable this for CallingConv::Fast on the P8 5662 // once we support fp <-> gpr moves. 5663 5664 // In the non-vararg case, this can only ever happen in the 5665 // presence of f32 array types, since otherwise we never run 5666 // out of FPRs before running out of GPRs. 5667 SDValue ArgVal; 5668 5669 // Double values are always passed in a single GPR. 5670 if (Arg.getValueType() != MVT::f32) { 5671 ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 5672 5673 // Non-array float values are extended and passed in a GPR. 
5674 } else if (!Flags.isInConsecutiveRegs()) { 5675 ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); 5676 ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); 5677 5678 // If we have an array of floats, we collect every odd element 5679 // together with its predecessor into one GPR. 5680 } else if (ArgOffset % PtrByteSize != 0) { 5681 SDValue Lo, Hi; 5682 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]); 5683 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); 5684 if (!isLittleEndian) 5685 std::swap(Lo, Hi); 5686 ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 5687 5688 // The final element, if even, goes into the first half of a GPR. 5689 } else if (Flags.isInConsecutiveRegsLast()) { 5690 ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); 5691 ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); 5692 if (!isLittleEndian) 5693 ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal, 5694 DAG.getConstant(32, dl, MVT::i32)); 5695 5696 // Non-final even elements are skipped; they will be handled 5697 // together the with subsequent argument on the next go-around. 5698 } else 5699 ArgVal = SDValue(); 5700 5701 if (ArgVal.getNode()) 5702 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal)); 5703 } else { 5704 if (CallConv == CallingConv::Fast) 5705 ComputePtrOff(); 5706 5707 // Single-precision floating-point values are mapped to the 5708 // second (rightmost) word of the stack doubleword. 5709 if (Arg.getValueType() == MVT::f32 && 5710 !isLittleEndian && !Flags.isInConsecutiveRegs()) { 5711 SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType()); 5712 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); 5713 } 5714 5715 assert(HasParameterArea && 5716 "Parameter area must exist to pass an argument in memory."); 5717 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 5718 true, isTailCall, false, MemOpChains, 5719 TailCallArguments, dl); 5720 5721 NeededLoad = true; 5722 } 5723 // When passing an array of floats, the array occupies consecutive 5724 // space in the argument area; only round up to the next doubleword 5725 // at the end of the array. Otherwise, each float takes 8 bytes. 5726 if (CallConv != CallingConv::Fast || NeededLoad) { 5727 ArgOffset += (Arg.getValueType() == MVT::f32 && 5728 Flags.isInConsecutiveRegs()) ? 4 : 8; 5729 if (Flags.isInConsecutiveRegsLast()) 5730 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 5731 } 5732 break; 5733 } 5734 case MVT::v4f32: 5735 case MVT::v4i32: 5736 case MVT::v8i16: 5737 case MVT::v16i8: 5738 case MVT::v2f64: 5739 case MVT::v2i64: 5740 case MVT::v1i128: 5741 if (!Subtarget.hasQPX()) { 5742 // These can be scalar arguments or elements of a vector array type 5743 // passed directly. The latter are used to implement ELFv2 homogenous 5744 // vector aggregates. 5745 5746 // For a varargs call, named arguments go into VRs or on the stack as 5747 // usual; unnamed arguments always go to the stack or the corresponding 5748 // GPRs when within range. For now, we always put the value in both 5749 // locations (or even all three). 5750 if (isVarArg) { 5751 assert(HasParameterArea && 5752 "Parameter area must exist if we have a varargs call."); 5753 // We could elide this store in the case where the object fits 5754 // entirely in R registers. Maybe later. 
5755 SDValue Store = 5756 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); 5757 MemOpChains.push_back(Store); 5758 if (VR_idx != NumVRs) { 5759 SDValue Load = 5760 DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo()); 5761 MemOpChains.push_back(Load.getValue(1)); 5762 RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load)); 5763 } 5764 ArgOffset += 16; 5765 for (unsigned i=0; i<16; i+=PtrByteSize) { 5766 if (GPR_idx == NumGPRs) 5767 break; 5768 SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, 5769 DAG.getConstant(i, dl, PtrVT)); 5770 SDValue Load = 5771 DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); 5772 MemOpChains.push_back(Load.getValue(1)); 5773 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 5774 } 5775 break; 5776 } 5777 5778 // Non-varargs Altivec params go into VRs or on the stack. 5779 if (VR_idx != NumVRs) { 5780 RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg)); 5781 } else { 5782 if (CallConv == CallingConv::Fast) 5783 ComputePtrOff(); 5784 5785 assert(HasParameterArea && 5786 "Parameter area must exist to pass an argument in memory."); 5787 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 5788 true, isTailCall, true, MemOpChains, 5789 TailCallArguments, dl); 5790 if (CallConv == CallingConv::Fast) 5791 ArgOffset += 16; 5792 } 5793 5794 if (CallConv != CallingConv::Fast) 5795 ArgOffset += 16; 5796 break; 5797 } // not QPX 5798 5799 assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 && 5800 "Invalid QPX parameter type"); 5801 5802 /* fall through */ 5803 case MVT::v4f64: 5804 case MVT::v4i1: { 5805 bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32; 5806 if (isVarArg) { 5807 assert(HasParameterArea && 5808 "Parameter area must exist if we have a varargs call."); 5809 // We could elide this store in the case where the object fits 5810 // entirely in R registers. Maybe later. 5811 SDValue Store = 5812 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); 5813 MemOpChains.push_back(Store); 5814 if (QFPR_idx != NumQFPRs) { 5815 SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store, 5816 PtrOff, MachinePointerInfo()); 5817 MemOpChains.push_back(Load.getValue(1)); 5818 RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load)); 5819 } 5820 ArgOffset += (IsF32 ? 16 : 32); 5821 for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) { 5822 if (GPR_idx == NumGPRs) 5823 break; 5824 SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, 5825 DAG.getConstant(i, dl, PtrVT)); 5826 SDValue Load = 5827 DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); 5828 MemOpChains.push_back(Load.getValue(1)); 5829 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 5830 } 5831 break; 5832 } 5833 5834 // Non-varargs QPX params go into registers or on the stack. 5835 if (QFPR_idx != NumQFPRs) { 5836 RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg)); 5837 } else { 5838 if (CallConv == CallingConv::Fast) 5839 ComputePtrOff(); 5840 5841 assert(HasParameterArea && 5842 "Parameter area must exist to pass an argument in memory."); 5843 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 5844 true, isTailCall, true, MemOpChains, 5845 TailCallArguments, dl); 5846 if (CallConv == CallingConv::Fast) 5847 ArgOffset += (IsF32 ? 16 : 32); 5848 } 5849 5850 if (CallConv != CallingConv::Fast) 5851 ArgOffset += (IsF32 ? 
                                              16 : 32);
      break;
    }
    }
  }

  assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&
         "mismatch in size of parameter area");
  (void)NumBytesActuallyUsed;

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // Check if this is an indirect call (MTCTR/BCTRL).
  // See PrepareCall() for more information about calls through function
  // pointers in the 64-bit SVR4 ABI.
  if (!isTailCall && !isPatchPoint &&
      !isFunctionGlobalAddress(Callee) &&
      !isa<ExternalSymbolSDNode>(Callee)) {
    // Load r2 into a virtual register and store it to the TOC save area.
    setUsesTOCBasePtr(DAG);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
    // TOC save area offset.
    unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
    SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
    Chain = DAG.getStore(
        Val.getValue(1), dl, Val, AddPtr,
        MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
    // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
    // This does not mean the MTCTR instruction must use R12; it's easier
    // to model this as an extra parameter, so do that.
    if (isELFv2ABI && !isPatchPoint)
      RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  if (isTailCall && !IsSibCall)
    PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
                    TailCallArguments);

  return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest,
                    DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee,
                    SPDiff, NumBytes, Ins, InVals, CS);
}

SDValue PPCTargetLowering::LowerCall_Darwin(
    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
    bool isTailCall, bool isPatchPoint,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    ImmutableCallSite *CS) const {
  unsigned NumOps = Outs.size();

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  bool isPPC64 = PtrVT == MVT::i64;
  unsigned PtrByteSize = isPPC64 ? 8 : 4;

  MachineFunction &MF = DAG.getMachineFunction();

  // Mark this function as potentially containing a tail call.  As a
  // consequence, the frame pointer will be used for dynamic allocation and
  // for restoring the caller's stack pointer in this function's epilog.  This
  // is done because the tail-called function might overwrite the value in
  // this function's (MF) stack pointer save slot, 0(SP).
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

  // Count how many bytes are to be pushed on the stack, including the linkage
  // area, and parameter passing area.  We start with 24/48 bytes, which is
  // prereserved space for [SP][CR][LR][3 x unused].
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  unsigned NumBytes = LinkageSize;

  // Add up all the space actually used.
  // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
  // they all go in registers, but we must reserve stack space for them for
  // possible use by the caller.  In varargs or 64-bit calls, parameters are
  // assigned stack space in order, with padding so Altivec parameters are
  // 16-byte aligned.
  unsigned nAltivecParamsAtEnd = 0;
  for (unsigned i = 0; i != NumOps; ++i) {
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    EVT ArgVT = Outs[i].VT;
    // Varargs Altivec parameters are padded to a 16 byte boundary.
    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
      if (!isVarArg && !isPPC64) {
        // Non-varargs Altivec parameters go after all the non-Altivec
        // parameters; handle those later so we know how much padding we need.
        nAltivecParamsAtEnd++;
        continue;
      }
      // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
      NumBytes = ((NumBytes+15)/16)*16;
    }
    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
  }

  // Allow for Altivec parameters at the end, if needed.
  if (nAltivecParamsAtEnd) {
    NumBytes = ((NumBytes+15)/16)*16;
    NumBytes += 16*nAltivecParamsAtEnd;
  }

  // The prolog code of the callee may store up to 8 GPR argument registers to
  // the stack, allowing va_start to index over them in memory if it is a
  // varargs function.  Because we cannot tell if this is needed on the caller
  // side, we have to conservatively assume that it is needed.  As such, make
  // sure we have at least enough stack space for the caller to store the 8
  // GPRs.
  NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);

  // Tail call needs the stack to be aligned.
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);

  // Calculate by how many bytes the stack has to be adjusted in case of tail
  // call optimization.
  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);

  // To protect arguments on the stack from being clobbered in a tail call,
  // force all the loads to happen before doing any other lowering.
  if (isTailCall)
    Chain = DAG.getStackArgumentTokenFactor(Chain);

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass.
  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
  SDValue CallSeqStart = Chain;

  // Load the return address and frame pointer so they can be moved somewhere
  // else later.
5996 SDValue LROp, FPOp; 5997 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); 5998 5999 // Set up a copy of the stack pointer for use loading and storing any 6000 // arguments that may not fit in the registers available for argument 6001 // passing. 6002 SDValue StackPtr; 6003 if (isPPC64) 6004 StackPtr = DAG.getRegister(PPC::X1, MVT::i64); 6005 else 6006 StackPtr = DAG.getRegister(PPC::R1, MVT::i32); 6007 6008 // Figure out which arguments are going to go in registers, and which in 6009 // memory. Also, if this is a vararg function, floating point operations 6010 // must be stored to our stack, and loaded into integer regs as well, if 6011 // any integer regs are available for argument passing. 6012 unsigned ArgOffset = LinkageSize; 6013 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; 6014 6015 static const MCPhysReg GPR_32[] = { // 32-bit registers. 6016 PPC::R3, PPC::R4, PPC::R5, PPC::R6, 6017 PPC::R7, PPC::R8, PPC::R9, PPC::R10, 6018 }; 6019 static const MCPhysReg GPR_64[] = { // 64-bit registers. 6020 PPC::X3, PPC::X4, PPC::X5, PPC::X6, 6021 PPC::X7, PPC::X8, PPC::X9, PPC::X10, 6022 }; 6023 static const MCPhysReg VR[] = { 6024 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, 6025 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 6026 }; 6027 const unsigned NumGPRs = array_lengthof(GPR_32); 6028 const unsigned NumFPRs = 13; 6029 const unsigned NumVRs = array_lengthof(VR); 6030 6031 const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; 6032 6033 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 6034 SmallVector<TailCallArgumentInfo, 8> TailCallArguments; 6035 6036 SmallVector<SDValue, 8> MemOpChains; 6037 for (unsigned i = 0; i != NumOps; ++i) { 6038 SDValue Arg = OutVals[i]; 6039 ISD::ArgFlagsTy Flags = Outs[i].Flags; 6040 6041 // PtrOff will be used to store the current argument to the stack if a 6042 // register cannot be found for it. 6043 SDValue PtrOff; 6044 6045 PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType()); 6046 6047 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); 6048 6049 // On PPC64, promote integers to 64-bit values. 6050 if (isPPC64 && Arg.getValueType() == MVT::i32) { 6051 // FIXME: Should this use ANY_EXTEND if neither sext nor zext? 6052 unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 6053 Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); 6054 } 6055 6056 // FIXME memcpy is used way more than necessary. Correctness first. 6057 // Note: "by value" is code for passing a structure by value, not 6058 // basic types. 6059 if (Flags.isByVal()) { 6060 unsigned Size = Flags.getByValSize(); 6061 // Very small objects are passed right-justified. Everything else is 6062 // passed left-justified. 6063 if (Size==1 || Size==2) { 6064 EVT VT = (Size==1) ? MVT::i8 : MVT::i16; 6065 if (GPR_idx != NumGPRs) { 6066 SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, 6067 MachinePointerInfo(), VT); 6068 MemOpChains.push_back(Load.getValue(1)); 6069 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6070 6071 ArgOffset += PtrByteSize; 6072 } else { 6073 SDValue Const = DAG.getConstant(PtrByteSize - Size, dl, 6074 PtrOff.getValueType()); 6075 SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); 6076 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, 6077 CallSeqStart, 6078 Flags, DAG, dl); 6079 ArgOffset += PtrByteSize; 6080 } 6081 continue; 6082 } 6083 // Copy entire object into memory. 
There are cases where gcc-generated 6084 // code assumes it is there, even if it could be put entirely into 6085 // registers. (This is not what the doc says.) 6086 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, 6087 CallSeqStart, 6088 Flags, DAG, dl); 6089 6090 // For small aggregates (Darwin only) and aggregates >= PtrByteSize, 6091 // copy the pieces of the object that fit into registers from the 6092 // parameter save area. 6093 for (unsigned j=0; j<Size; j+=PtrByteSize) { 6094 SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType()); 6095 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 6096 if (GPR_idx != NumGPRs) { 6097 SDValue Load = 6098 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo()); 6099 MemOpChains.push_back(Load.getValue(1)); 6100 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6101 ArgOffset += PtrByteSize; 6102 } else { 6103 ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize; 6104 break; 6105 } 6106 } 6107 continue; 6108 } 6109 6110 switch (Arg.getSimpleValueType().SimpleTy) { 6111 default: llvm_unreachable("Unexpected ValueType for argument!"); 6112 case MVT::i1: 6113 case MVT::i32: 6114 case MVT::i64: 6115 if (GPR_idx != NumGPRs) { 6116 if (Arg.getValueType() == MVT::i1) 6117 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg); 6118 6119 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); 6120 } else { 6121 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6122 isPPC64, isTailCall, false, MemOpChains, 6123 TailCallArguments, dl); 6124 } 6125 ArgOffset += PtrByteSize; 6126 break; 6127 case MVT::f32: 6128 case MVT::f64: 6129 if (FPR_idx != NumFPRs) { 6130 RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); 6131 6132 if (isVarArg) { 6133 SDValue Store = 6134 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); 6135 MemOpChains.push_back(Store); 6136 6137 // Float varargs are always shadowed in available integer registers 6138 if (GPR_idx != NumGPRs) { 6139 SDValue Load = 6140 DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo()); 6141 MemOpChains.push_back(Load.getValue(1)); 6142 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6143 } 6144 if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){ 6145 SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType()); 6146 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); 6147 SDValue Load = 6148 DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo()); 6149 MemOpChains.push_back(Load.getValue(1)); 6150 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6151 } 6152 } else { 6153 // If we have any FPRs remaining, we may also have GPRs remaining. 6154 // Args passed in FPRs consume either 1 (f32) or 2 (f64) available 6155 // GPRs. 6156 if (GPR_idx != NumGPRs) 6157 ++GPR_idx; 6158 if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && 6159 !isPPC64) // PPC64 has 64-bit GPR's obviously :) 6160 ++GPR_idx; 6161 } 6162 } else 6163 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6164 isPPC64, isTailCall, false, MemOpChains, 6165 TailCallArguments, dl); 6166 if (isPPC64) 6167 ArgOffset += 8; 6168 else 6169 ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8; 6170 break; 6171 case MVT::v4f32: 6172 case MVT::v4i32: 6173 case MVT::v8i16: 6174 case MVT::v16i8: 6175 if (isVarArg) { 6176 // These go aligned on the stack, or in the corresponding R registers 6177 // when within range. 
The Darwin PPC ABI doc claims they also go in 6178 // V registers; in fact gcc does this only for arguments that are 6179 // prototyped, not for those that match the ... We do it for all 6180 // arguments, seems to work. 6181 while (ArgOffset % 16 !=0) { 6182 ArgOffset += PtrByteSize; 6183 if (GPR_idx != NumGPRs) 6184 GPR_idx++; 6185 } 6186 // We could elide this store in the case where the object fits 6187 // entirely in R registers. Maybe later. 6188 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, 6189 DAG.getConstant(ArgOffset, dl, PtrVT)); 6190 SDValue Store = 6191 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); 6192 MemOpChains.push_back(Store); 6193 if (VR_idx != NumVRs) { 6194 SDValue Load = 6195 DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo()); 6196 MemOpChains.push_back(Load.getValue(1)); 6197 RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load)); 6198 } 6199 ArgOffset += 16; 6200 for (unsigned i=0; i<16; i+=PtrByteSize) { 6201 if (GPR_idx == NumGPRs) 6202 break; 6203 SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, 6204 DAG.getConstant(i, dl, PtrVT)); 6205 SDValue Load = 6206 DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); 6207 MemOpChains.push_back(Load.getValue(1)); 6208 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6209 } 6210 break; 6211 } 6212 6213 // Non-varargs Altivec params generally go in registers, but have 6214 // stack space allocated at the end. 6215 if (VR_idx != NumVRs) { 6216 // Doesn't have GPR space allocated. 6217 RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg)); 6218 } else if (nAltivecParamsAtEnd==0) { 6219 // We are emitting Altivec params in order. 6220 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6221 isPPC64, isTailCall, true, MemOpChains, 6222 TailCallArguments, dl); 6223 ArgOffset += 16; 6224 } 6225 break; 6226 } 6227 } 6228 // If all Altivec parameters fit in registers, as they usually do, 6229 // they get stack space following the non-Altivec parameters. We 6230 // don't track this here because nobody below needs it. 6231 // If there are more Altivec parameters than fit in registers emit 6232 // the stores here. 6233 if (!isVarArg && nAltivecParamsAtEnd > NumVRs) { 6234 unsigned j = 0; 6235 // Offset is aligned; skip 1st 12 params which go in V registers. 6236 ArgOffset = ((ArgOffset+15)/16)*16; 6237 ArgOffset += 12*16; 6238 for (unsigned i = 0; i != NumOps; ++i) { 6239 SDValue Arg = OutVals[i]; 6240 EVT ArgType = Outs[i].VT; 6241 if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 || 6242 ArgType==MVT::v8i16 || ArgType==MVT::v16i8) { 6243 if (++j > NumVRs) { 6244 SDValue PtrOff; 6245 // We are emitting Altivec params in order. 6246 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6247 isPPC64, isTailCall, true, MemOpChains, 6248 TailCallArguments, dl); 6249 ArgOffset += 16; 6250 } 6251 } 6252 } 6253 } 6254 6255 if (!MemOpChains.empty()) 6256 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 6257 6258 // On Darwin, R12 must contain the address of an indirect callee. This does 6259 // not mean the MTCTR instruction must use R12; it's easier to model this as 6260 // an extra parameter, so do that. 6261 if (!isTailCall && 6262 !isFunctionGlobalAddress(Callee) && 6263 !isa<ExternalSymbolSDNode>(Callee) && 6264 !isBLACompatibleAddress(Callee, DAG)) 6265 RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? 
PPC::X12 : 6266 PPC::R12), Callee)); 6267 6268 // Build a sequence of copy-to-reg nodes chained together with token chain 6269 // and flag operands which copy the outgoing args into the appropriate regs. 6270 SDValue InFlag; 6271 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 6272 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 6273 RegsToPass[i].second, InFlag); 6274 InFlag = Chain.getValue(1); 6275 } 6276 6277 if (isTailCall) 6278 PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, 6279 TailCallArguments); 6280 6281 return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, 6282 /* unused except on PPC64 ELFv1 */ false, DAG, 6283 RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, 6284 NumBytes, Ins, InVals, CS); 6285 } 6286 6287 bool 6288 PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 6289 MachineFunction &MF, bool isVarArg, 6290 const SmallVectorImpl<ISD::OutputArg> &Outs, 6291 LLVMContext &Context) const { 6292 SmallVector<CCValAssign, 16> RVLocs; 6293 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 6294 return CCInfo.CheckReturn(Outs, RetCC_PPC); 6295 } 6296 6297 SDValue 6298 PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 6299 bool isVarArg, 6300 const SmallVectorImpl<ISD::OutputArg> &Outs, 6301 const SmallVectorImpl<SDValue> &OutVals, 6302 const SDLoc &dl, SelectionDAG &DAG) const { 6303 SmallVector<CCValAssign, 16> RVLocs; 6304 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 6305 *DAG.getContext()); 6306 CCInfo.AnalyzeReturn(Outs, RetCC_PPC); 6307 6308 SDValue Flag; 6309 SmallVector<SDValue, 4> RetOps(1, Chain); 6310 6311 // Copy the result values into the output registers. 6312 for (unsigned i = 0; i != RVLocs.size(); ++i) { 6313 CCValAssign &VA = RVLocs[i]; 6314 assert(VA.isRegLoc() && "Can only return in registers!"); 6315 6316 SDValue Arg = OutVals[i]; 6317 6318 switch (VA.getLocInfo()) { 6319 default: llvm_unreachable("Unknown loc info!"); 6320 case CCValAssign::Full: break; 6321 case CCValAssign::AExt: 6322 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 6323 break; 6324 case CCValAssign::ZExt: 6325 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 6326 break; 6327 case CCValAssign::SExt: 6328 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 6329 break; 6330 } 6331 6332 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 6333 Flag = Chain.getValue(1); 6334 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 6335 } 6336 6337 const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); 6338 const MCPhysReg *I = 6339 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 6340 if (I) { 6341 for (; *I; ++I) { 6342 6343 if (PPC::G8RCRegClass.contains(*I)) 6344 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 6345 else if (PPC::F8RCRegClass.contains(*I)) 6346 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 6347 else if (PPC::CRRCRegClass.contains(*I)) 6348 RetOps.push_back(DAG.getRegister(*I, MVT::i1)); 6349 else if (PPC::VRRCRegClass.contains(*I)) 6350 RetOps.push_back(DAG.getRegister(*I, MVT::Other)); 6351 else 6352 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 6353 } 6354 } 6355 6356 RetOps[0] = Chain; // Update chain. 6357 6358 // Add the flag if we have it. 
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
}

SDValue
PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc dl(Op);

  // Get the correct type for integers.
  EVT IntVT = Op.getValueType();

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
  // Build a DYNAREAOFFSET node.
  SDValue Ops[2] = {Chain, FPSIdx};
  SDVTList VTs = DAG.getVTList(IntVT);
  return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops);
}

SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
                                             SelectionDAG &DAG) const {
  // When we pop the dynamic allocation we need to restore the SP link.
  SDLoc dl(Op);

  // Get the correct type for pointers.
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  // Construct the stack pointer operand.
  bool isPPC64 = Subtarget.isPPC64();
  unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
  SDValue StackPtr = DAG.getRegister(SP, PtrVT);

  // Get the operands for the STACKRESTORE.
  SDValue Chain = Op.getOperand(0);
  SDValue SaveSP = Op.getOperand(1);

  // Load the old link SP.
  SDValue LoadLinkSP =
      DAG.getLoad(PtrVT, dl, Chain, StackPtr, MachinePointerInfo());

  // Restore the stack pointer.
  Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);

  // Store the old link SP.
  return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo());
}

SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool isPPC64 = Subtarget.isPPC64();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Get the current return address save index.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  int RASI = FI->getReturnAddrSaveIndex();

  // If the return address save index hasn't been defined yet.
  if (!RASI) {
    // Find out the fixed offset of the return address (LR) save area.
    int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
    // Allocate the frame index for the return address save area.
    RASI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, LROffset, false);
    // Save the result.
    FI->setReturnAddrSaveIndex(RASI);
  }
  return DAG.getFrameIndex(RASI, PtrVT);
}

SDValue
PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool isPPC64 = Subtarget.isPPC64();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Get the current frame pointer save index.  The users of this index will
  // be primarily DYNALLOC instructions.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  int FPSI = FI->getFramePointerSaveIndex();

  // If the frame pointer save index hasn't been defined yet.
  if (!FPSI) {
    // Find out the fixed offset of the frame pointer save area.
    int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
    // Allocate the frame index for the frame pointer save area.
    FPSI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
    // Save the result.
6450 FI->setFramePointerSaveIndex(FPSI); 6451 } 6452 return DAG.getFrameIndex(FPSI, PtrVT); 6453 } 6454 6455 SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 6456 SelectionDAG &DAG) const { 6457 // Get the inputs. 6458 SDValue Chain = Op.getOperand(0); 6459 SDValue Size = Op.getOperand(1); 6460 SDLoc dl(Op); 6461 6462 // Get the corect type for pointers. 6463 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 6464 // Negate the size. 6465 SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT, 6466 DAG.getConstant(0, dl, PtrVT), Size); 6467 // Construct a node for the frame pointer save index. 6468 SDValue FPSIdx = getFramePointerFrameIndex(DAG); 6469 // Build a DYNALLOC node. 6470 SDValue Ops[3] = { Chain, NegSize, FPSIdx }; 6471 SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other); 6472 return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops); 6473 } 6474 6475 SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op, 6476 SelectionDAG &DAG) const { 6477 MachineFunction &MF = DAG.getMachineFunction(); 6478 6479 bool isPPC64 = Subtarget.isPPC64(); 6480 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 6481 6482 int FI = MF.getFrameInfo().CreateFixedObject(isPPC64 ? 8 : 4, 0, false); 6483 return DAG.getFrameIndex(FI, PtrVT); 6484 } 6485 6486 SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, 6487 SelectionDAG &DAG) const { 6488 SDLoc DL(Op); 6489 return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL, 6490 DAG.getVTList(MVT::i32, MVT::Other), 6491 Op.getOperand(0), Op.getOperand(1)); 6492 } 6493 6494 SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, 6495 SelectionDAG &DAG) const { 6496 SDLoc DL(Op); 6497 return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other, 6498 Op.getOperand(0), Op.getOperand(1)); 6499 } 6500 6501 SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 6502 if (Op.getValueType().isVector()) 6503 return LowerVectorLoad(Op, DAG); 6504 6505 assert(Op.getValueType() == MVT::i1 && 6506 "Custom lowering only for i1 loads"); 6507 6508 // First, load 8 bits into 32 bits, then truncate to 1 bit. 6509 6510 SDLoc dl(Op); 6511 LoadSDNode *LD = cast<LoadSDNode>(Op); 6512 6513 SDValue Chain = LD->getChain(); 6514 SDValue BasePtr = LD->getBasePtr(); 6515 MachineMemOperand *MMO = LD->getMemOperand(); 6516 6517 SDValue NewLD = 6518 DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain, 6519 BasePtr, MVT::i8, MMO); 6520 SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD); 6521 6522 SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) }; 6523 return DAG.getMergeValues(Ops, dl); 6524 } 6525 6526 SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 6527 if (Op.getOperand(1).getValueType().isVector()) 6528 return LowerVectorStore(Op, DAG); 6529 6530 assert(Op.getOperand(1).getValueType() == MVT::i1 && 6531 "Custom lowering only for i1 stores"); 6532 6533 // First, zero extend to 32 bits, then use a truncating store to 8 bits. 
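  // (Strictly speaking, the extension below is to the pointer type, so on a
  // 64-bit subtarget the value is widened to i64 rather than i32; the
  // truncating store still writes only a single byte.)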
6534 6535 SDLoc dl(Op); 6536 StoreSDNode *ST = cast<StoreSDNode>(Op); 6537 6538 SDValue Chain = ST->getChain(); 6539 SDValue BasePtr = ST->getBasePtr(); 6540 SDValue Value = ST->getValue(); 6541 MachineMemOperand *MMO = ST->getMemOperand(); 6542 6543 Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()), 6544 Value); 6545 return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO); 6546 } 6547 6548 // FIXME: Remove this once the ANDI glue bug is fixed: 6549 SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { 6550 assert(Op.getValueType() == MVT::i1 && 6551 "Custom lowering only for i1 results"); 6552 6553 SDLoc DL(Op); 6554 return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, DL, MVT::i1, 6555 Op.getOperand(0)); 6556 } 6557 6558 /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when 6559 /// possible. 6560 SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 6561 // Not FP? Not a fsel. 6562 if (!Op.getOperand(0).getValueType().isFloatingPoint() || 6563 !Op.getOperand(2).getValueType().isFloatingPoint()) 6564 return Op; 6565 6566 // We might be able to do better than this under some circumstances, but in 6567 // general, fsel-based lowering of select is a finite-math-only optimization. 6568 // For more information, see section F.3 of the 2.06 ISA specification. 6569 if (!DAG.getTarget().Options.NoInfsFPMath || 6570 !DAG.getTarget().Options.NoNaNsFPMath) 6571 return Op; 6572 // TODO: Propagate flags from the select rather than global settings. 6573 SDNodeFlags Flags; 6574 Flags.setNoInfs(true); 6575 Flags.setNoNaNs(true); 6576 6577 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 6578 6579 EVT ResVT = Op.getValueType(); 6580 EVT CmpVT = Op.getOperand(0).getValueType(); 6581 SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); 6582 SDValue TV = Op.getOperand(2), FV = Op.getOperand(3); 6583 SDLoc dl(Op); 6584 6585 // If the RHS of the comparison is a 0.0, we don't need to do the 6586 // subtraction at all. 6587 SDValue Sel1; 6588 if (isFloatingPointZero(RHS)) 6589 switch (CC) { 6590 default: break; // SETUO etc aren't handled by fsel. 
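    // Several of the cases below intentionally fall through after swapping
    // TV and FV; e.g. SETNE is handled as SETEQ with the select operands
    // exchanged.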
6591 case ISD::SETNE: 6592 std::swap(TV, FV); 6593 case ISD::SETEQ: 6594 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits 6595 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); 6596 Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); 6597 if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits 6598 Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); 6599 return DAG.getNode(PPCISD::FSEL, dl, ResVT, 6600 DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV); 6601 case ISD::SETULT: 6602 case ISD::SETLT: 6603 std::swap(TV, FV); // fsel is natively setge, swap operands for setlt 6604 case ISD::SETOGE: 6605 case ISD::SETGE: 6606 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits 6607 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); 6608 return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); 6609 case ISD::SETUGT: 6610 case ISD::SETGT: 6611 std::swap(TV, FV); // fsel is natively setge, swap operands for setlt 6612 case ISD::SETOLE: 6613 case ISD::SETLE: 6614 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits 6615 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); 6616 return DAG.getNode(PPCISD::FSEL, dl, ResVT, 6617 DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV); 6618 } 6619 6620 SDValue Cmp; 6621 switch (CC) { 6622 default: break; // SETUO etc aren't handled by fsel. 6623 case ISD::SETNE: 6624 std::swap(TV, FV); 6625 case ISD::SETEQ: 6626 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); 6627 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6628 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6629 Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); 6630 if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits 6631 Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); 6632 return DAG.getNode(PPCISD::FSEL, dl, ResVT, 6633 DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV); 6634 case ISD::SETULT: 6635 case ISD::SETLT: 6636 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); 6637 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6638 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6639 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); 6640 case ISD::SETOGE: 6641 case ISD::SETGE: 6642 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); 6643 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6644 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6645 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); 6646 case ISD::SETUGT: 6647 case ISD::SETGT: 6648 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); 6649 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6650 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6651 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); 6652 case ISD::SETOLE: 6653 case ISD::SETLE: 6654 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); 6655 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6656 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6657 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); 6658 } 6659 return Op; 6660 } 6661 6662 void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, 6663 SelectionDAG &DAG, 6664 const SDLoc &dl) const { 6665 assert(Op.getOperand(0).getValueType().isFloatingPoint()); 6666 SDValue Src = Op.getOperand(0); 6667 if (Src.getValueType() == MVT::f32) 6668 Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); 6669 6670 SDValue Tmp; 
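  // Select the truncating conversion node: FCTIWZ/FCTIWUZ for i32 results,
  // FCTIDZ/FCTIDUZ for i64 results.  Without FPCVT there is no FCTIWUZ, so an
  // unsigned i32 result falls back to FCTIDZ.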
6671 switch (Op.getSimpleValueType().SimpleTy) { 6672 default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); 6673 case MVT::i32: 6674 Tmp = DAG.getNode( 6675 Op.getOpcode() == ISD::FP_TO_SINT 6676 ? PPCISD::FCTIWZ 6677 : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), 6678 dl, MVT::f64, Src); 6679 break; 6680 case MVT::i64: 6681 assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && 6682 "i64 FP_TO_UINT is supported only with FPCVT"); 6683 Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : 6684 PPCISD::FCTIDUZ, 6685 dl, MVT::f64, Src); 6686 break; 6687 } 6688 6689 // Convert the FP value to an int value through memory. 6690 bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() && 6691 (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); 6692 SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); 6693 int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex(); 6694 MachinePointerInfo MPI = 6695 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); 6696 6697 // Emit a store to the stack slot. 6698 SDValue Chain; 6699 if (i32Stack) { 6700 MachineFunction &MF = DAG.getMachineFunction(); 6701 MachineMemOperand *MMO = 6702 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4); 6703 SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr }; 6704 Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, 6705 DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO); 6706 } else 6707 Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI); 6708 6709 // Result is a load from the stack slot. If loading 4 bytes, make sure to 6710 // add in a bias on big endian. 6711 if (Op.getValueType() == MVT::i32 && !i32Stack) { 6712 FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, 6713 DAG.getConstant(4, dl, FIPtr.getValueType())); 6714 MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4); 6715 } 6716 6717 RLI.Chain = Chain; 6718 RLI.Ptr = FIPtr; 6719 RLI.MPI = MPI; 6720 } 6721 6722 /// \brief Custom lowers floating point to integer conversions to use 6723 /// the direct move instructions available in ISA 2.07 to avoid the 6724 /// need for load/store combinations. 6725 SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op, 6726 SelectionDAG &DAG, 6727 const SDLoc &dl) const { 6728 assert(Op.getOperand(0).getValueType().isFloatingPoint()); 6729 SDValue Src = Op.getOperand(0); 6730 6731 if (Src.getValueType() == MVT::f32) 6732 Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); 6733 6734 SDValue Tmp; 6735 switch (Op.getSimpleValueType().SimpleTy) { 6736 default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); 6737 case MVT::i32: 6738 Tmp = DAG.getNode( 6739 Op.getOpcode() == ISD::FP_TO_SINT 6740 ? PPCISD::FCTIWZ 6741 : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), 6742 dl, MVT::f64, Src); 6743 Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp); 6744 break; 6745 case MVT::i64: 6746 assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && 6747 "i64 FP_TO_UINT is supported only with FPCVT"); 6748 Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? 
PPCISD::FCTIDZ : 6749 PPCISD::FCTIDUZ, 6750 dl, MVT::f64, Src); 6751 Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp); 6752 break; 6753 } 6754 return Tmp; 6755 } 6756 6757 SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, 6758 const SDLoc &dl) const { 6759 if (Subtarget.hasDirectMove() && Subtarget.isPPC64()) 6760 return LowerFP_TO_INTDirectMove(Op, DAG, dl); 6761 6762 ReuseLoadInfo RLI; 6763 LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); 6764 6765 return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI, 6766 RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges); 6767 } 6768 6769 // We're trying to insert a regular store, S, and then a load, L. If the 6770 // incoming value, O, is a load, we might just be able to have our load use the 6771 // address used by O. However, we don't know if anything else will store to 6772 // that address before we can load from it. To prevent this situation, we need 6773 // to insert our load, L, into the chain as a peer of O. To do this, we give L 6774 // the same chain operand as O, we create a token factor from the chain results 6775 // of O and L, and we replace all uses of O's chain result with that token 6776 // factor (see spliceIntoChain below for this last part). 6777 bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT, 6778 ReuseLoadInfo &RLI, 6779 SelectionDAG &DAG, 6780 ISD::LoadExtType ET) const { 6781 SDLoc dl(Op); 6782 if (ET == ISD::NON_EXTLOAD && 6783 (Op.getOpcode() == ISD::FP_TO_UINT || 6784 Op.getOpcode() == ISD::FP_TO_SINT) && 6785 isOperationLegalOrCustom(Op.getOpcode(), 6786 Op.getOperand(0).getValueType())) { 6787 6788 LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); 6789 return true; 6790 } 6791 6792 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op); 6793 if (!LD || LD->getExtensionType() != ET || LD->isVolatile() || 6794 LD->isNonTemporal()) 6795 return false; 6796 if (LD->getMemoryVT() != MemVT) 6797 return false; 6798 6799 RLI.Ptr = LD->getBasePtr(); 6800 if (LD->isIndexed() && !LD->getOffset().isUndef()) { 6801 assert(LD->getAddressingMode() == ISD::PRE_INC && 6802 "Non-pre-inc AM on PPC?"); 6803 RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr, 6804 LD->getOffset()); 6805 } 6806 6807 RLI.Chain = LD->getChain(); 6808 RLI.MPI = LD->getPointerInfo(); 6809 RLI.IsDereferenceable = LD->isDereferenceable(); 6810 RLI.IsInvariant = LD->isInvariant(); 6811 RLI.Alignment = LD->getAlignment(); 6812 RLI.AAInfo = LD->getAAInfo(); 6813 RLI.Ranges = LD->getRanges(); 6814 6815 RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1); 6816 return true; 6817 } 6818 6819 // Given the head of the old chain, ResChain, insert a token factor containing 6820 // it and NewResChain, and make users of ResChain now be users of that token 6821 // factor. 
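// The token factor is first built with an UNDEF placeholder operand and only
// rewired to (ResChain, NewResChain) afterwards via UpdateNodeOperands; that
// way the new node is not yet a user of ResChain when
// ReplaceAllUsesOfValueWith runs, so the replacement cannot loop back into
// the token factor itself.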
6822 void PPCTargetLowering::spliceIntoChain(SDValue ResChain, 6823 SDValue NewResChain, 6824 SelectionDAG &DAG) const { 6825 if (!ResChain) 6826 return; 6827 6828 SDLoc dl(NewResChain); 6829 6830 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6831 NewResChain, DAG.getUNDEF(MVT::Other)); 6832 assert(TF.getNode() != NewResChain.getNode() && 6833 "A new TF really is required here"); 6834 6835 DAG.ReplaceAllUsesOfValueWith(ResChain, TF); 6836 DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain); 6837 } 6838 6839 /// \brief Analyze profitability of direct move 6840 /// prefer float load to int load plus direct move 6841 /// when there is no integer use of int load 6842 bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const { 6843 SDNode *Origin = Op.getOperand(0).getNode(); 6844 if (Origin->getOpcode() != ISD::LOAD) 6845 return true; 6846 6847 // If there is no LXSIBZX/LXSIHZX, like Power8, 6848 // prefer direct move if the memory size is 1 or 2 bytes. 6849 MachineMemOperand *MMO = cast<LoadSDNode>(Origin)->getMemOperand(); 6850 if (!Subtarget.hasP9Vector() && MMO->getSize() <= 2) 6851 return true; 6852 6853 for (SDNode::use_iterator UI = Origin->use_begin(), 6854 UE = Origin->use_end(); 6855 UI != UE; ++UI) { 6856 6857 // Only look at the users of the loaded value. 6858 if (UI.getUse().get().getResNo() != 0) 6859 continue; 6860 6861 if (UI->getOpcode() != ISD::SINT_TO_FP && 6862 UI->getOpcode() != ISD::UINT_TO_FP) 6863 return true; 6864 } 6865 6866 return false; 6867 } 6868 6869 /// \brief Custom lowers integer to floating point conversions to use 6870 /// the direct move instructions available in ISA 2.07 to avoid the 6871 /// need for load/store combinations. 6872 SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op, 6873 SelectionDAG &DAG, 6874 const SDLoc &dl) const { 6875 assert((Op.getValueType() == MVT::f32 || 6876 Op.getValueType() == MVT::f64) && 6877 "Invalid floating point type as target of conversion"); 6878 assert(Subtarget.hasFPCVT() && 6879 "Int to FP conversions with direct moves require FPCVT"); 6880 SDValue FP; 6881 SDValue Src = Op.getOperand(0); 6882 bool SinglePrec = Op.getValueType() == MVT::f32; 6883 bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32; 6884 bool Signed = Op.getOpcode() == ISD::SINT_TO_FP; 6885 unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) : 6886 (SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU); 6887 6888 if (WordInt) { 6889 FP = DAG.getNode(Signed ? PPCISD::MTVSRA : PPCISD::MTVSRZ, 6890 dl, MVT::f64, Src); 6891 FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); 6892 } 6893 else { 6894 FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src); 6895 FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); 6896 } 6897 6898 return FP; 6899 } 6900 6901 SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, 6902 SelectionDAG &DAG) const { 6903 SDLoc dl(Op); 6904 6905 if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) { 6906 if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64) 6907 return SDValue(); 6908 6909 SDValue Value = Op.getOperand(0); 6910 // The values are now known to be -1 (false) or 1 (true). To convert this 6911 // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). 
6912 // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 6913 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); 6914 6915 SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); 6916 6917 Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); 6918 6919 if (Op.getValueType() != MVT::v4f64) 6920 Value = DAG.getNode(ISD::FP_ROUND, dl, 6921 Op.getValueType(), Value, 6922 DAG.getIntPtrConstant(1, dl)); 6923 return Value; 6924 } 6925 6926 // Don't handle ppc_fp128 here; let it be lowered to a libcall. 6927 if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) 6928 return SDValue(); 6929 6930 if (Op.getOperand(0).getValueType() == MVT::i1) 6931 return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0), 6932 DAG.getConstantFP(1.0, dl, Op.getValueType()), 6933 DAG.getConstantFP(0.0, dl, Op.getValueType())); 6934 6935 // If we have direct moves, we can do all the conversion, skip the store/load 6936 // however, without FPCVT we can't do most conversions. 6937 if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) && 6938 Subtarget.isPPC64() && Subtarget.hasFPCVT()) 6939 return LowerINT_TO_FPDirectMove(Op, DAG, dl); 6940 6941 assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && 6942 "UINT_TO_FP is supported only with FPCVT"); 6943 6944 // If we have FCFIDS, then use it when converting to single-precision. 6945 // Otherwise, convert to double-precision and then round. 6946 unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 6947 ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS 6948 : PPCISD::FCFIDS) 6949 : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU 6950 : PPCISD::FCFID); 6951 MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 6952 ? MVT::f32 6953 : MVT::f64; 6954 6955 if (Op.getOperand(0).getValueType() == MVT::i64) { 6956 SDValue SINT = Op.getOperand(0); 6957 // When converting to single-precision, we actually need to convert 6958 // to double-precision first and then round to single-precision. 6959 // To avoid double-rounding effects during that operation, we have 6960 // to prepare the input operand. Bits that might be truncated when 6961 // converting to double-precision are replaced by a bit that won't 6962 // be lost at this stage, but is below the single-precision rounding 6963 // position. 6964 // 6965 // However, if -enable-unsafe-fp-math is in effect, accept double 6966 // rounding to avoid the extra overhead. 6967 if (Op.getValueType() == MVT::f32 && 6968 !Subtarget.hasFPCVT() && 6969 !DAG.getTarget().Options.UnsafeFPMath) { 6970 6971 // Twiddle input to make sure the low 11 bits are zero. (If this 6972 // is the case, we are guaranteed the value will fit into the 53 bit 6973 // mantissa of an IEEE double-precision value without rounding.) 6974 // If any of those low 11 bits were not zero originally, make sure 6975 // bit 12 (value 2048) is set instead, so that the final rounding 6976 // to single-precision gets the correct result. 
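      // In short, the sequence below computes SINT with its low 11 bits
      // cleared and bit 11 (value 2048) forced on whenever any of those low
      // bits were nonzero -- a sticky bit that survives the conversion to
      // double and steers the final round to single precision correctly.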
6977 SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64, 6978 SINT, DAG.getConstant(2047, dl, MVT::i64)); 6979 Round = DAG.getNode(ISD::ADD, dl, MVT::i64, 6980 Round, DAG.getConstant(2047, dl, MVT::i64)); 6981 Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT); 6982 Round = DAG.getNode(ISD::AND, dl, MVT::i64, 6983 Round, DAG.getConstant(-2048, dl, MVT::i64)); 6984 6985 // However, we cannot use that value unconditionally: if the magnitude 6986 // of the input value is small, the bit-twiddling we did above might 6987 // end up visibly changing the output. Fortunately, in that case, we 6988 // don't need to twiddle bits since the original input will convert 6989 // exactly to double-precision floating-point already. Therefore, 6990 // construct a conditional to use the original value if the top 11 6991 // bits are all sign-bit copies, and use the rounded value computed 6992 // above otherwise. 6993 SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64, 6994 SINT, DAG.getConstant(53, dl, MVT::i32)); 6995 Cond = DAG.getNode(ISD::ADD, dl, MVT::i64, 6996 Cond, DAG.getConstant(1, dl, MVT::i64)); 6997 Cond = DAG.getSetCC(dl, MVT::i32, 6998 Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT); 6999 7000 SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT); 7001 } 7002 7003 ReuseLoadInfo RLI; 7004 SDValue Bits; 7005 7006 MachineFunction &MF = DAG.getMachineFunction(); 7007 if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) { 7008 Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI, 7009 RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges); 7010 spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); 7011 } else if (Subtarget.hasLFIWAX() && 7012 canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) { 7013 MachineMemOperand *MMO = 7014 MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, 7015 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7016 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7017 Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl, 7018 DAG.getVTList(MVT::f64, MVT::Other), 7019 Ops, MVT::i32, MMO); 7020 spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); 7021 } else if (Subtarget.hasFPCVT() && 7022 canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) { 7023 MachineMemOperand *MMO = 7024 MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, 7025 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7026 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7027 Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl, 7028 DAG.getVTList(MVT::f64, MVT::Other), 7029 Ops, MVT::i32, MMO); 7030 spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); 7031 } else if (((Subtarget.hasLFIWAX() && 7032 SINT.getOpcode() == ISD::SIGN_EXTEND) || 7033 (Subtarget.hasFPCVT() && 7034 SINT.getOpcode() == ISD::ZERO_EXTEND)) && 7035 SINT.getOperand(0).getValueType() == MVT::i32) { 7036 MachineFrameInfo &MFI = MF.getFrameInfo(); 7037 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 7038 7039 int FrameIdx = MFI.CreateStackObject(4, 4, false); 7040 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 7041 7042 SDValue Store = 7043 DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, 7044 MachinePointerInfo::getFixedStack( 7045 DAG.getMachineFunction(), FrameIdx)); 7046 7047 assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && 7048 "Expected an i32 store"); 7049 7050 RLI.Ptr = FIdx; 7051 RLI.Chain = Store; 7052 RLI.MPI = 7053 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 7054 RLI.Alignment = 4; 7055 7056 MachineMemOperand *MMO = 7057 MF.getMachineMemOperand(RLI.MPI, 
MachineMemOperand::MOLoad, 4, 7058 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7059 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7060 Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ? 7061 PPCISD::LFIWZX : PPCISD::LFIWAX, 7062 dl, DAG.getVTList(MVT::f64, MVT::Other), 7063 Ops, MVT::i32, MMO); 7064 } else 7065 Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); 7066 7067 SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); 7068 7069 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) 7070 FP = DAG.getNode(ISD::FP_ROUND, dl, 7071 MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); 7072 return FP; 7073 } 7074 7075 assert(Op.getOperand(0).getValueType() == MVT::i32 && 7076 "Unhandled INT_TO_FP type in custom expander!"); 7077 // Since we only generate this in 64-bit mode, we can take advantage of 7078 // 64-bit registers. In particular, sign extend the input value into the 7079 // 64-bit register with extsw, store the WHOLE 64-bit value into the stack 7080 // then lfd it and fcfid it. 7081 MachineFunction &MF = DAG.getMachineFunction(); 7082 MachineFrameInfo &MFI = MF.getFrameInfo(); 7083 EVT PtrVT = getPointerTy(MF.getDataLayout()); 7084 7085 SDValue Ld; 7086 if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { 7087 ReuseLoadInfo RLI; 7088 bool ReusingLoad; 7089 if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI, 7090 DAG))) { 7091 int FrameIdx = MFI.CreateStackObject(4, 4, false); 7092 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 7093 7094 SDValue Store = 7095 DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, 7096 MachinePointerInfo::getFixedStack( 7097 DAG.getMachineFunction(), FrameIdx)); 7098 7099 assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && 7100 "Expected an i32 store"); 7101 7102 RLI.Ptr = FIdx; 7103 RLI.Chain = Store; 7104 RLI.MPI = 7105 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 7106 RLI.Alignment = 4; 7107 } 7108 7109 MachineMemOperand *MMO = 7110 MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, 7111 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7112 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7113 Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ? 7114 PPCISD::LFIWZX : PPCISD::LFIWAX, 7115 dl, DAG.getVTList(MVT::f64, MVT::Other), 7116 Ops, MVT::i32, MMO); 7117 if (ReusingLoad) 7118 spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG); 7119 } else { 7120 assert(Subtarget.isPPC64() && 7121 "i32->FP without LFIWAX supported only on PPC64"); 7122 7123 int FrameIdx = MFI.CreateStackObject(8, 8, false); 7124 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 7125 7126 SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, 7127 Op.getOperand(0)); 7128 7129 // STD the extended value into the stack slot. 7130 SDValue Store = DAG.getStore( 7131 DAG.getEntryNode(), dl, Ext64, FIdx, 7132 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); 7133 7134 // Load the value as a double. 7135 Ld = DAG.getLoad( 7136 MVT::f64, dl, Store, FIdx, 7137 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); 7138 } 7139 7140 // FCFID it and return it. 
7141 SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld); 7142 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) 7143 FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, 7144 DAG.getIntPtrConstant(0, dl)); 7145 return FP; 7146 } 7147 7148 SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 7149 SelectionDAG &DAG) const { 7150 SDLoc dl(Op); 7151 /* 7152 The rounding mode is in bits 30:31 of FPSR, and has the following 7153 settings: 7154 00 Round to nearest 7155 01 Round to 0 7156 10 Round to +inf 7157 11 Round to -inf 7158 7159 FLT_ROUNDS, on the other hand, expects the following: 7160 -1 Undefined 7161 0 Round to 0 7162 1 Round to nearest 7163 2 Round to +inf 7164 3 Round to -inf 7165 7166 To perform the conversion, we do: 7167 ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1)) 7168 */ 7169 7170 MachineFunction &MF = DAG.getMachineFunction(); 7171 EVT VT = Op.getValueType(); 7172 EVT PtrVT = getPointerTy(MF.getDataLayout()); 7173 7174 // Save FP Control Word to register 7175 EVT NodeTys[] = { 7176 MVT::f64, // return register 7177 MVT::Glue // unused in this context 7178 }; 7179 SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None); 7180 7181 // Save FP register to stack slot 7182 int SSFI = MF.getFrameInfo().CreateStackObject(8, 8, false); 7183 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); 7184 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot, 7185 MachinePointerInfo()); 7186 7187 // Load FP Control Word from low 32 bits of stack slot. 7188 SDValue Four = DAG.getConstant(4, dl, PtrVT); 7189 SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four); 7190 SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo()); 7191 7192 // Transform as necessary 7193 SDValue CWD1 = 7194 DAG.getNode(ISD::AND, dl, MVT::i32, 7195 CWD, DAG.getConstant(3, dl, MVT::i32)); 7196 SDValue CWD2 = 7197 DAG.getNode(ISD::SRL, dl, MVT::i32, 7198 DAG.getNode(ISD::AND, dl, MVT::i32, 7199 DAG.getNode(ISD::XOR, dl, MVT::i32, 7200 CWD, DAG.getConstant(3, dl, MVT::i32)), 7201 DAG.getConstant(3, dl, MVT::i32)), 7202 DAG.getConstant(1, dl, MVT::i32)); 7203 7204 SDValue RetVal = 7205 DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2); 7206 7207 return DAG.getNode((VT.getSizeInBits() < 16 ? 7208 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 7209 } 7210 7211 SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { 7212 EVT VT = Op.getValueType(); 7213 unsigned BitWidth = VT.getSizeInBits(); 7214 SDLoc dl(Op); 7215 assert(Op.getNumOperands() == 3 && 7216 VT == Op.getOperand(1).getValueType() && 7217 "Unexpected SHL!"); 7218 7219 // Expand into a bunch of logical ops. Note that these ops 7220 // depend on the PPC behavior for oversized shift amounts. 
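  // PPC shifts by an amount in [BitWidth, 2*BitWidth) produce zero rather
  // than being undefined, so the decomposition below is valid for any shift
  // amount in [0, 2*BitWidth):
  //   OutLo = Lo << Amt
  //   OutHi = (Hi << Amt) | (Lo >> (BitWidth - Amt)) | (Lo << (Amt - BitWidth))
  // Any OR'ed term whose effective shift is out of range simply contributes
  // zero.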
7221 SDValue Lo = Op.getOperand(0); 7222 SDValue Hi = Op.getOperand(1); 7223 SDValue Amt = Op.getOperand(2); 7224 EVT AmtVT = Amt.getValueType(); 7225 7226 SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, 7227 DAG.getConstant(BitWidth, dl, AmtVT), Amt); 7228 SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt); 7229 SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1); 7230 SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3); 7231 SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, 7232 DAG.getConstant(-BitWidth, dl, AmtVT)); 7233 SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5); 7234 SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); 7235 SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt); 7236 SDValue OutOps[] = { OutLo, OutHi }; 7237 return DAG.getMergeValues(OutOps, dl); 7238 } 7239 7240 SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { 7241 EVT VT = Op.getValueType(); 7242 SDLoc dl(Op); 7243 unsigned BitWidth = VT.getSizeInBits(); 7244 assert(Op.getNumOperands() == 3 && 7245 VT == Op.getOperand(1).getValueType() && 7246 "Unexpected SRL!"); 7247 7248 // Expand into a bunch of logical ops. Note that these ops 7249 // depend on the PPC behavior for oversized shift amounts. 7250 SDValue Lo = Op.getOperand(0); 7251 SDValue Hi = Op.getOperand(1); 7252 SDValue Amt = Op.getOperand(2); 7253 EVT AmtVT = Amt.getValueType(); 7254 7255 SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, 7256 DAG.getConstant(BitWidth, dl, AmtVT), Amt); 7257 SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); 7258 SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); 7259 SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); 7260 SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, 7261 DAG.getConstant(-BitWidth, dl, AmtVT)); 7262 SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5); 7263 SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); 7264 SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt); 7265 SDValue OutOps[] = { OutLo, OutHi }; 7266 return DAG.getMergeValues(OutOps, dl); 7267 } 7268 7269 SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { 7270 SDLoc dl(Op); 7271 EVT VT = Op.getValueType(); 7272 unsigned BitWidth = VT.getSizeInBits(); 7273 assert(Op.getNumOperands() == 3 && 7274 VT == Op.getOperand(1).getValueType() && 7275 "Unexpected SRA!"); 7276 7277 // Expand into a bunch of logical ops, followed by a select_cc. 7278 SDValue Lo = Op.getOperand(0); 7279 SDValue Hi = Op.getOperand(1); 7280 SDValue Amt = Op.getOperand(2); 7281 EVT AmtVT = Amt.getValueType(); 7282 7283 SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, 7284 DAG.getConstant(BitWidth, dl, AmtVT), Amt); 7285 SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); 7286 SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); 7287 SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); 7288 SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, 7289 DAG.getConstant(-BitWidth, dl, AmtVT)); 7290 SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5); 7291 SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt); 7292 SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT), 7293 Tmp4, Tmp6, ISD::SETLE); 7294 SDValue OutOps[] = { OutLo, OutHi }; 7295 return DAG.getMergeValues(OutOps, dl); 7296 } 7297 7298 //===----------------------------------------------------------------------===// 7299 // Vector related lowering. 
7300 // 7301 7302 /// BuildSplatI - Build a canonical splati of Val with an element size of 7303 /// SplatSize. Cast the result to VT. 7304 static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT, 7305 SelectionDAG &DAG, const SDLoc &dl) { 7306 assert(Val >= -16 && Val <= 15 && "vsplti is out of range!"); 7307 7308 static const MVT VTys[] = { // canonical VT to use for each size. 7309 MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32 7310 }; 7311 7312 EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1]; 7313 7314 // Force vspltis[hw] -1 to vspltisb -1 to canonicalize. 7315 if (Val == -1) 7316 SplatSize = 1; 7317 7318 EVT CanonicalVT = VTys[SplatSize-1]; 7319 7320 // Build a canonical splat for this value. 7321 return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT)); 7322 } 7323 7324 /// BuildIntrinsicOp - Return a unary operator intrinsic node with the 7325 /// specified intrinsic ID. 7326 static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG, 7327 const SDLoc &dl, EVT DestVT = MVT::Other) { 7328 if (DestVT == MVT::Other) DestVT = Op.getValueType(); 7329 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, 7330 DAG.getConstant(IID, dl, MVT::i32), Op); 7331 } 7332 7333 /// BuildIntrinsicOp - Return a binary operator intrinsic node with the 7334 /// specified intrinsic ID. 7335 static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS, 7336 SelectionDAG &DAG, const SDLoc &dl, 7337 EVT DestVT = MVT::Other) { 7338 if (DestVT == MVT::Other) DestVT = LHS.getValueType(); 7339 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, 7340 DAG.getConstant(IID, dl, MVT::i32), LHS, RHS); 7341 } 7342 7343 /// BuildIntrinsicOp - Return a ternary operator intrinsic node with the 7344 /// specified intrinsic ID. 7345 static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1, 7346 SDValue Op2, SelectionDAG &DAG, const SDLoc &dl, 7347 EVT DestVT = MVT::Other) { 7348 if (DestVT == MVT::Other) DestVT = Op0.getValueType(); 7349 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, 7350 DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2); 7351 } 7352 7353 /// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified 7354 /// amount. The result has the specified value type. 7355 static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT, 7356 SelectionDAG &DAG, const SDLoc &dl) { 7357 // Force LHS/RHS to be the right type. 7358 LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS); 7359 RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS); 7360 7361 int Ops[16]; 7362 for (unsigned i = 0; i != 16; ++i) 7363 Ops[i] = i + Amt; 7364 SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops); 7365 return DAG.getNode(ISD::BITCAST, dl, VT, T); 7366 } 7367 7368 /// Do we have an efficient pattern in a .td file for this node? 7369 /// 7370 /// \param V - pointer to the BuildVectorSDNode being matched 7371 /// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves? 7372 /// 7373 /// There are some patterns where it is beneficial to keep a BUILD_VECTOR 7374 /// node as a BUILD_VECTOR node rather than expanding it. The patterns where 7375 /// the opposite is true (expansion is beneficial) are: 7376 /// - The node builds a vector out of integers that are not 32 or 64-bits 7377 /// - The node builds a vector out of constants 7378 /// - The node is a "load-and-splat" 7379 /// In all other cases, we will choose to keep the BUILD_VECTOR. 
7380 static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V, 7381 bool HasDirectMove) { 7382 EVT VecVT = V->getValueType(0); 7383 bool RightType = VecVT == MVT::v2f64 || VecVT == MVT::v4f32 || 7384 (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32)); 7385 if (!RightType) 7386 return false; 7387 7388 bool IsSplat = true; 7389 bool IsLoad = false; 7390 SDValue Op0 = V->getOperand(0); 7391 7392 // This function is called in a block that confirms the node is not a constant 7393 // splat. So a constant BUILD_VECTOR here means the vector is built out of 7394 // different constants. 7395 if (V->isConstant()) 7396 return false; 7397 for (int i = 0, e = V->getNumOperands(); i < e; ++i) { 7398 if (V->getOperand(i).isUndef()) 7399 return false; 7400 // We want to expand nodes that represent load-and-splat even if the 7401 // loaded value is a floating point truncation or conversion to int. 7402 if (V->getOperand(i).getOpcode() == ISD::LOAD || 7403 (V->getOperand(i).getOpcode() == ISD::FP_ROUND && 7404 V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) || 7405 (V->getOperand(i).getOpcode() == ISD::FP_TO_SINT && 7406 V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) || 7407 (V->getOperand(i).getOpcode() == ISD::FP_TO_UINT && 7408 V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD)) 7409 IsLoad = true; 7410 // If the operands are different or the input is not a load and has more 7411 // uses than just this BV node, then it isn't a splat. 7412 if (V->getOperand(i) != Op0 || 7413 (!IsLoad && !V->isOnlyUserOf(V->getOperand(i).getNode()))) 7414 IsSplat = false; 7415 } 7416 return !(IsSplat && IsLoad); 7417 } 7418 7419 // If this is a case we can't handle, return null and let the default 7420 // expansion code take care of it. If we CAN select this case, and if it 7421 // selects to a single instruction, return Op. Otherwise, if we can codegen 7422 // this case more efficiently than a constant pool load, lower it to the 7423 // sequence of ops that should be used. 7424 SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, 7425 SelectionDAG &DAG) const { 7426 SDLoc dl(Op); 7427 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); 7428 assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); 7429 7430 if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) { 7431 // We first build an i32 vector, load it into a QPX register, 7432 // then convert it to a floating-point vector and compare it 7433 // to a zero vector to get the boolean result. 
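    // Roughly: an all-constant input is materialized as a v4f32
    // constant-pool vector of +/-1.0 values and loaded with QVLFSb;
    // otherwise the elements are stored to a 16-byte stack slot as i32
    // words, reloaded with qvlfiwz, converted to floating point with
    // qvfcfidu, and compared against a zero vector.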
7434 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 7435 int FrameIdx = MFI.CreateStackObject(16, 16, false); 7436 MachinePointerInfo PtrInfo = 7437 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 7438 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 7439 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 7440 7441 assert(BVN->getNumOperands() == 4 && 7442 "BUILD_VECTOR for v4i1 does not have 4 operands"); 7443 7444 bool IsConst = true; 7445 for (unsigned i = 0; i < 4; ++i) { 7446 if (BVN->getOperand(i).isUndef()) continue; 7447 if (!isa<ConstantSDNode>(BVN->getOperand(i))) { 7448 IsConst = false; 7449 break; 7450 } 7451 } 7452 7453 if (IsConst) { 7454 Constant *One = 7455 ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0); 7456 Constant *NegOne = 7457 ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0); 7458 7459 Constant *CV[4]; 7460 for (unsigned i = 0; i < 4; ++i) { 7461 if (BVN->getOperand(i).isUndef()) 7462 CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext())); 7463 else if (isNullConstant(BVN->getOperand(i))) 7464 CV[i] = NegOne; 7465 else 7466 CV[i] = One; 7467 } 7468 7469 Constant *CP = ConstantVector::get(CV); 7470 SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), 7471 16 /* alignment */); 7472 7473 SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; 7474 SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other}); 7475 return DAG.getMemIntrinsicNode( 7476 PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32, 7477 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 7478 } 7479 7480 SmallVector<SDValue, 4> Stores; 7481 for (unsigned i = 0; i < 4; ++i) { 7482 if (BVN->getOperand(i).isUndef()) continue; 7483 7484 unsigned Offset = 4*i; 7485 SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); 7486 Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); 7487 7488 unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize(); 7489 if (StoreSize > 4) { 7490 Stores.push_back( 7491 DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx, 7492 PtrInfo.getWithOffset(Offset), MVT::i32)); 7493 } else { 7494 SDValue StoreValue = BVN->getOperand(i); 7495 if (StoreSize < 4) 7496 StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue); 7497 7498 Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx, 7499 PtrInfo.getWithOffset(Offset))); 7500 } 7501 } 7502 7503 SDValue StoreChain; 7504 if (!Stores.empty()) 7505 StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); 7506 else 7507 StoreChain = DAG.getEntryNode(); 7508 7509 // Now load from v4i32 into the QPX register; this will extend it to 7510 // v4i64 but not yet convert it to a floating point. Nevertheless, this 7511 // is typed as v4f64 because the QPX register integer states are not 7512 // explicitly represented. 7513 7514 SDValue Ops[] = {StoreChain, 7515 DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32), 7516 FIdx}; 7517 SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other}); 7518 7519 SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, 7520 dl, VTs, Ops, MVT::v4i32, PtrInfo); 7521 LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, 7522 DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32), 7523 LoadedVect); 7524 7525 SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64); 7526 7527 return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ); 7528 } 7529 7530 // All other QPX vectors are handled by generic code. 
7531 if (Subtarget.hasQPX()) 7532 return SDValue(); 7533 7534 // Check if this is a splat of a constant value. 7535 APInt APSplatBits, APSplatUndef; 7536 unsigned SplatBitSize; 7537 bool HasAnyUndefs; 7538 if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, 7539 HasAnyUndefs, 0, !Subtarget.isLittleEndian()) || 7540 SplatBitSize > 32) { 7541 // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be 7542 // lowered to VSX instructions under certain conditions. 7543 // Without VSX, there is no pattern more efficient than expanding the node. 7544 if (Subtarget.hasVSX() && 7545 haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove())) 7546 return Op; 7547 return SDValue(); 7548 } 7549 7550 unsigned SplatBits = APSplatBits.getZExtValue(); 7551 unsigned SplatUndef = APSplatUndef.getZExtValue(); 7552 unsigned SplatSize = SplatBitSize / 8; 7553 7554 // First, handle single instruction cases. 7555 7556 // All zeros? 7557 if (SplatBits == 0) { 7558 // Canonicalize all zero vectors to be v4i32. 7559 if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) { 7560 SDValue Z = DAG.getConstant(0, dl, MVT::v4i32); 7561 Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z); 7562 } 7563 return Op; 7564 } 7565 7566 // We have XXSPLTIB for constant splats one byte wide 7567 if (Subtarget.hasP9Vector() && SplatSize == 1) { 7568 // This is a splat of 1-byte elements with some elements potentially undef. 7569 // Rather than trying to match undef in the SDAG patterns, ensure that all 7570 // elements are the same constant. 7571 if (HasAnyUndefs || ISD::isBuildVectorAllOnes(BVN)) { 7572 SmallVector<SDValue, 16> Ops(16, DAG.getConstant(SplatBits, 7573 dl, MVT::i32)); 7574 SDValue NewBV = DAG.getBuildVector(MVT::v16i8, dl, Ops); 7575 if (Op.getValueType() != MVT::v16i8) 7576 return DAG.getBitcast(Op.getValueType(), NewBV); 7577 return NewBV; 7578 } 7579 return Op; 7580 } 7581 7582 // If the sign extended value is in the range [-16,15], use VSPLTI[bhw]. 7583 int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >> 7584 (32-SplatBitSize)); 7585 if (SextVal >= -16 && SextVal <= 15) 7586 return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl); 7587 7588 // Two instruction sequences. 7589 7590 // If this value is in the range [-32,30] and is even, use: 7591 // VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2) 7592 // If this value is in the range [17,31] and is odd, use: 7593 // VSPLTI[bhw](val-16) - VSPLTI[bhw](-16) 7594 // If this value is in the range [-31,-17] and is odd, use: 7595 // VSPLTI[bhw](val+16) + VSPLTI[bhw](-16) 7596 // Note the last two are three-instruction sequences. 7597 if (SextVal >= -32 && SextVal <= 31) { 7598 // To avoid having these optimizations undone by constant folding, 7599 // we convert to a pseudo that will be expanded later into one of 7600 // the above forms. 7601 SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32); 7602 EVT VT = (SplatSize == 1 ? MVT::v16i8 : 7603 (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32)); 7604 SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32); 7605 SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize); 7606 if (VT == Op.getValueType()) 7607 return RetVal; 7608 else 7609 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal); 7610 } 7611 7612 // If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is 7613 // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important 7614 // for fneg/fabs. 
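  // For example, vspltisw -1 gives 0xFFFFFFFF in each word; vslw by that
  // same vector shifts each word left by 31 (only the low 5 bits of the
  // shift amount are used), giving 0x8000_0000; xoring with the all-ones
  // vector then yields 0x7FFF_FFFF.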
7615 if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) { 7616 // Make -1 and vspltisw -1: 7617 SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl); 7618 7619 // Make the VSLW intrinsic, computing 0x8000_0000. 7620 SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV, 7621 OnesV, DAG, dl); 7622 7623 // xor by OnesV to invert it. 7624 Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV); 7625 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); 7626 } 7627 7628 // Check to see if this is a wide variety of vsplti*, binop self cases. 7629 static const signed char SplatCsts[] = { 7630 -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, 7631 -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16 7632 }; 7633 7634 for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) { 7635 // Indirect through the SplatCsts array so that we favor 'vsplti -1' for 7636 // cases which are ambiguous (e.g. formation of 0x8000_0000). 'vsplti -1' 7637 int i = SplatCsts[idx]; 7638 7639 // Figure out what shift amount will be used by altivec if shifted by i in 7640 // this splat size. 7641 unsigned TypeShiftAmt = i & (SplatBitSize-1); 7642 7643 // vsplti + shl self. 7644 if (SextVal == (int)((unsigned)i << TypeShiftAmt)) { 7645 SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); 7646 static const unsigned IIDs[] = { // Intrinsic to use for each size. 7647 Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0, 7648 Intrinsic::ppc_altivec_vslw 7649 }; 7650 Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); 7651 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); 7652 } 7653 7654 // vsplti + srl self. 7655 if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { 7656 SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); 7657 static const unsigned IIDs[] = { // Intrinsic to use for each size. 7658 Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0, 7659 Intrinsic::ppc_altivec_vsrw 7660 }; 7661 Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); 7662 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); 7663 } 7664 7665 // vsplti + sra self. 7666 if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { 7667 SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); 7668 static const unsigned IIDs[] = { // Intrinsic to use for each size. 7669 Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0, 7670 Intrinsic::ppc_altivec_vsraw 7671 }; 7672 Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); 7673 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); 7674 } 7675 7676 // vsplti + rol self. 7677 if (SextVal == (int)(((unsigned)i << TypeShiftAmt) | 7678 ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) { 7679 SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); 7680 static const unsigned IIDs[] = { // Intrinsic to use for each size. 7681 Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0, 7682 Intrinsic::ppc_altivec_vrlw 7683 }; 7684 Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); 7685 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); 7686 } 7687 7688 // t = vsplti c, result = vsldoi t, t, 1 7689 if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) { 7690 SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); 7691 unsigned Amt = Subtarget.isLittleEndian() ? 
15 : 1; 7692 return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); 7693 } 7694 // t = vsplti c, result = vsldoi t, t, 2 7695 if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) { 7696 SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); 7697 unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2; 7698 return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); 7699 } 7700 // t = vsplti c, result = vsldoi t, t, 3 7701 if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) { 7702 SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); 7703 unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3; 7704 return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); 7705 } 7706 } 7707 7708 return SDValue(); 7709 } 7710 7711 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 7712 /// the specified operations to build the shuffle. 7713 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 7714 SDValue RHS, SelectionDAG &DAG, 7715 const SDLoc &dl) { 7716 unsigned OpNum = (PFEntry >> 26) & 0x0F; 7717 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 7718 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 7719 7720 enum { 7721 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 7722 OP_VMRGHW, 7723 OP_VMRGLW, 7724 OP_VSPLTISW0, 7725 OP_VSPLTISW1, 7726 OP_VSPLTISW2, 7727 OP_VSPLTISW3, 7728 OP_VSLDOI4, 7729 OP_VSLDOI8, 7730 OP_VSLDOI12 7731 }; 7732 7733 if (OpNum == OP_COPY) { 7734 if (LHSID == (1*9+2)*9+3) return LHS; 7735 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 7736 return RHS; 7737 } 7738 7739 SDValue OpLHS, OpRHS; 7740 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 7741 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 7742 7743 int ShufIdxs[16]; 7744 switch (OpNum) { 7745 default: llvm_unreachable("Unknown i32 permute!"); 7746 case OP_VMRGHW: 7747 ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3; 7748 ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19; 7749 ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7; 7750 ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23; 7751 break; 7752 case OP_VMRGLW: 7753 ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11; 7754 ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27; 7755 ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15; 7756 ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31; 7757 break; 7758 case OP_VSPLTISW0: 7759 for (unsigned i = 0; i != 16; ++i) 7760 ShufIdxs[i] = (i&3)+0; 7761 break; 7762 case OP_VSPLTISW1: 7763 for (unsigned i = 0; i != 16; ++i) 7764 ShufIdxs[i] = (i&3)+4; 7765 break; 7766 case OP_VSPLTISW2: 7767 for (unsigned i = 0; i != 16; ++i) 7768 ShufIdxs[i] = (i&3)+8; 7769 break; 7770 case OP_VSPLTISW3: 7771 for (unsigned i = 0; i != 16; ++i) 7772 ShufIdxs[i] = (i&3)+12; 7773 break; 7774 case OP_VSLDOI4: 7775 return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl); 7776 case OP_VSLDOI8: 7777 return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl); 7778 case OP_VSLDOI12: 7779 return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl); 7780 } 7781 EVT VT = OpLHS.getValueType(); 7782 OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS); 7783 OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS); 7784 SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, 
OpRHS, ShufIdxs); 7785 return DAG.getNode(ISD::BITCAST, dl, VT, T); 7786 } 7787 7788 /// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this 7789 /// is a shuffle we can handle in a single instruction, return it. Otherwise, 7790 /// return the code it can be lowered into. Worst case, it can always be 7791 /// lowered into a vperm. 7792 SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 7793 SelectionDAG &DAG) const { 7794 SDLoc dl(Op); 7795 SDValue V1 = Op.getOperand(0); 7796 SDValue V2 = Op.getOperand(1); 7797 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 7798 EVT VT = Op.getValueType(); 7799 bool isLittleEndian = Subtarget.isLittleEndian(); 7800 7801 unsigned ShiftElts, InsertAtByte; 7802 bool Swap; 7803 if (Subtarget.hasP9Vector() && 7804 PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, 7805 isLittleEndian)) { 7806 if (Swap) 7807 std::swap(V1, V2); 7808 SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 7809 SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2); 7810 if (ShiftElts) { 7811 SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2, 7812 DAG.getConstant(ShiftElts, dl, MVT::i32)); 7813 SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Shl, 7814 DAG.getConstant(InsertAtByte, dl, MVT::i32)); 7815 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); 7816 } 7817 SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Conv2, 7818 DAG.getConstant(InsertAtByte, dl, MVT::i32)); 7819 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); 7820 } 7821 7822 7823 if (Subtarget.hasVSX() && 7824 PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { 7825 if (Swap) 7826 std::swap(V1, V2); 7827 SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 7828 SDValue Conv2 = 7829 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2); 7830 7831 SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2, 7832 DAG.getConstant(ShiftElts, dl, MVT::i32)); 7833 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl); 7834 } 7835 7836 if (Subtarget.hasVSX() && 7837 PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { 7838 if (Swap) 7839 std::swap(V1, V2); 7840 SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); 7841 SDValue Conv2 = 7842 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2); 7843 7844 SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2, 7845 DAG.getConstant(ShiftElts, dl, MVT::i32)); 7846 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI); 7847 } 7848 7849 if (Subtarget.hasVSX()) { 7850 if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { 7851 int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); 7852 7853 // If the source for the shuffle is a scalar_to_vector that came from a 7854 // 32-bit load, it will have used LXVWSX so we don't need to splat again. 
7855 if (Subtarget.hasP9Vector() && 7856 ((isLittleEndian && SplatIdx == 3) || 7857 (!isLittleEndian && SplatIdx == 0))) { 7858 SDValue Src = V1.getOperand(0); 7859 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR && 7860 Src.getOperand(0).getOpcode() == ISD::LOAD && 7861 Src.getOperand(0).hasOneUse()) 7862 return V1; 7863 } 7864 SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 7865 SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv, 7866 DAG.getConstant(SplatIdx, dl, MVT::i32)); 7867 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat); 7868 } 7869 7870 // Left shifts of 8 bytes are actually swaps. Convert accordingly. 7871 if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) { 7872 SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); 7873 SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv); 7874 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap); 7875 } 7876 } 7877 7878 if (Subtarget.hasQPX()) { 7879 if (VT.getVectorNumElements() != 4) 7880 return SDValue(); 7881 7882 if (V2.isUndef()) V2 = V1; 7883 7884 int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp); 7885 if (AlignIdx != -1) { 7886 return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2, 7887 DAG.getConstant(AlignIdx, dl, MVT::i32)); 7888 } else if (SVOp->isSplat()) { 7889 int SplatIdx = SVOp->getSplatIndex(); 7890 if (SplatIdx >= 4) { 7891 std::swap(V1, V2); 7892 SplatIdx -= 4; 7893 } 7894 7895 return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1, 7896 DAG.getConstant(SplatIdx, dl, MVT::i32)); 7897 } 7898 7899 // Lower this into a qvgpci/qvfperm pair. 7900 7901 // Compute the qvgpci literal 7902 unsigned idx = 0; 7903 for (unsigned i = 0; i < 4; ++i) { 7904 int m = SVOp->getMaskElt(i); 7905 unsigned mm = m >= 0 ? (unsigned) m : i; 7906 idx |= mm << (3-i)*3; 7907 } 7908 7909 SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64, 7910 DAG.getConstant(idx, dl, MVT::i32)); 7911 return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3); 7912 } 7913 7914 // Cases that are handled by instructions that take permute immediates 7915 // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be 7916 // selected by the instruction selector. 7917 if (V2.isUndef()) { 7918 if (PPC::isSplatShuffleMask(SVOp, 1) || 7919 PPC::isSplatShuffleMask(SVOp, 2) || 7920 PPC::isSplatShuffleMask(SVOp, 4) || 7921 PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) || 7922 PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) || 7923 PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 || 7924 PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) || 7925 PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) || 7926 PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) || 7927 PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) || 7928 PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) || 7929 PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) || 7930 (Subtarget.hasP8Altivec() && ( 7931 PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) || 7932 PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) || 7933 PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) { 7934 return Op; 7935 } 7936 } 7937 7938 // Altivec has a variety of "shuffle immediates" that take two vector inputs 7939 // and produce a fixed permutation. If any of these match, do not lower to 7940 // VPERM. 7941 unsigned int ShuffleKind = isLittleEndian ? 
2 : 0; 7942 if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) || 7943 PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) || 7944 PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 || 7945 PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) || 7946 PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) || 7947 PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) || 7948 PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) || 7949 PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) || 7950 PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) || 7951 (Subtarget.hasP8Altivec() && ( 7952 PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) || 7953 PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) || 7954 PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG)))) 7955 return Op; 7956 7957 // Check to see if this is a shuffle of 4-byte values. If so, we can use our 7958 // perfect shuffle table to emit an optimal matching sequence. 7959 ArrayRef<int> PermMask = SVOp->getMask(); 7960 7961 unsigned PFIndexes[4]; 7962 bool isFourElementShuffle = true; 7963 for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number 7964 unsigned EltNo = 8; // Start out undef. 7965 for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. 7966 if (PermMask[i*4+j] < 0) 7967 continue; // Undef, ignore it. 7968 7969 unsigned ByteSource = PermMask[i*4+j]; 7970 if ((ByteSource & 3) != j) { 7971 isFourElementShuffle = false; 7972 break; 7973 } 7974 7975 if (EltNo == 8) { 7976 EltNo = ByteSource/4; 7977 } else if (EltNo != ByteSource/4) { 7978 isFourElementShuffle = false; 7979 break; 7980 } 7981 } 7982 PFIndexes[i] = EltNo; 7983 } 7984 7985 // If this shuffle can be expressed as a shuffle of 4-byte elements, use the 7986 // perfect shuffle vector to determine if it is cost effective to do this as 7987 // discrete instructions, or whether we should use a vperm. 7988 // For now, we skip this for little endian until such time as we have a 7989 // little-endian perfect shuffle table. 7990 if (isFourElementShuffle && !isLittleEndian) { 7991 // Compute the index in the perfect shuffle table. 7992 unsigned PFTableIndex = 7993 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 7994 7995 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 7996 unsigned Cost = (PFEntry >> 30); 7997 7998 // Determining when to avoid vperm is tricky. Many things affect the cost 7999 // of vperm, particularly how many times the perm mask needs to be computed. 8000 // For example, if the perm mask can be hoisted out of a loop or is already 8001 // used (perhaps because there are multiple permutes with the same shuffle 8002 // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of 8003 // the loop requires an extra register. 8004 // 8005 // As a compromise, we only emit discrete instructions if the shuffle can be 8006 // generated in 3 or fewer operations. When we have loop information 8007 // available, if this block is within a loop, we should avoid using vperm 8008 // for 3-operation perms and use a constant pool load instead. 8009 if (Cost < 3) 8010 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8011 } 8012 8013 // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant 8014 // vector that will get spilled to the constant pool. 8015 if (V2.isUndef()) V2 = V1; 8016 8017 // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except 8018 // that it is in input element units, not in bytes. Convert now. 
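  // As a concrete instance of the little-endian adjustment described below:
  // with v4i32 inputs (BytesPerElement == 4), mask element 5 (element 1 of
  // the second input) expands to bytes 20, 21, 22, 23 on big-endian, but to
  // 31-20 .. 31-23 = 11, 10, 9, 8 on little-endian, with the two inputs
  // swapped when the VPERM node is built.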
8019 8020 // For little endian, the order of the input vectors is reversed, and 8021 // the permutation mask is complemented with respect to 31. This is 8022 // necessary to produce proper semantics with the big-endian-biased vperm 8023 // instruction. 8024 EVT EltVT = V1.getValueType().getVectorElementType(); 8025 unsigned BytesPerElement = EltVT.getSizeInBits()/8; 8026 8027 SmallVector<SDValue, 16> ResultMask; 8028 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { 8029 unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i]; 8030 8031 for (unsigned j = 0; j != BytesPerElement; ++j) 8032 if (isLittleEndian) 8033 ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j), 8034 dl, MVT::i32)); 8035 else 8036 ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl, 8037 MVT::i32)); 8038 } 8039 8040 SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask); 8041 if (isLittleEndian) 8042 return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), 8043 V2, V1, VPermMask); 8044 else 8045 return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), 8046 V1, V2, VPermMask); 8047 } 8048 8049 /// getVectorCompareInfo - Given an intrinsic, return false if it is not a 8050 /// vector comparison. If it is, return true and fill in Opc/isDot with 8051 /// information about the intrinsic. 8052 static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, 8053 bool &isDot, const PPCSubtarget &Subtarget) { 8054 unsigned IntrinsicID = 8055 cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue(); 8056 CompareOpc = -1; 8057 isDot = false; 8058 switch (IntrinsicID) { 8059 default: 8060 return false; 8061 // Comparison predicates. 8062 case Intrinsic::ppc_altivec_vcmpbfp_p: 8063 CompareOpc = 966; 8064 isDot = true; 8065 break; 8066 case Intrinsic::ppc_altivec_vcmpeqfp_p: 8067 CompareOpc = 198; 8068 isDot = true; 8069 break; 8070 case Intrinsic::ppc_altivec_vcmpequb_p: 8071 CompareOpc = 6; 8072 isDot = true; 8073 break; 8074 case Intrinsic::ppc_altivec_vcmpequh_p: 8075 CompareOpc = 70; 8076 isDot = true; 8077 break; 8078 case Intrinsic::ppc_altivec_vcmpequw_p: 8079 CompareOpc = 134; 8080 isDot = true; 8081 break; 8082 case Intrinsic::ppc_altivec_vcmpequd_p: 8083 if (Subtarget.hasP8Altivec()) { 8084 CompareOpc = 199; 8085 isDot = true; 8086 } else 8087 return false; 8088 break; 8089 case Intrinsic::ppc_altivec_vcmpneb_p: 8090 case Intrinsic::ppc_altivec_vcmpneh_p: 8091 case Intrinsic::ppc_altivec_vcmpnew_p: 8092 case Intrinsic::ppc_altivec_vcmpnezb_p: 8093 case Intrinsic::ppc_altivec_vcmpnezh_p: 8094 case Intrinsic::ppc_altivec_vcmpnezw_p: 8095 if (Subtarget.hasP9Altivec()) { 8096 switch (IntrinsicID) { 8097 default: 8098 llvm_unreachable("Unknown comparison intrinsic."); 8099 case Intrinsic::ppc_altivec_vcmpneb_p: 8100 CompareOpc = 7; 8101 break; 8102 case Intrinsic::ppc_altivec_vcmpneh_p: 8103 CompareOpc = 71; 8104 break; 8105 case Intrinsic::ppc_altivec_vcmpnew_p: 8106 CompareOpc = 135; 8107 break; 8108 case Intrinsic::ppc_altivec_vcmpnezb_p: 8109 CompareOpc = 263; 8110 break; 8111 case Intrinsic::ppc_altivec_vcmpnezh_p: 8112 CompareOpc = 327; 8113 break; 8114 case Intrinsic::ppc_altivec_vcmpnezw_p: 8115 CompareOpc = 391; 8116 break; 8117 } 8118 isDot = true; 8119 } else 8120 return false; 8121 break; 8122 case Intrinsic::ppc_altivec_vcmpgefp_p: 8123 CompareOpc = 454; 8124 isDot = true; 8125 break; 8126 case Intrinsic::ppc_altivec_vcmpgtfp_p: 8127 CompareOpc = 710; 8128 isDot = true; 8129 break; 8130 case Intrinsic::ppc_altivec_vcmpgtsb_p: 8131 CompareOpc = 774; 
8132 isDot = true; 8133 break; 8134 case Intrinsic::ppc_altivec_vcmpgtsh_p: 8135 CompareOpc = 838; 8136 isDot = true; 8137 break; 8138 case Intrinsic::ppc_altivec_vcmpgtsw_p: 8139 CompareOpc = 902; 8140 isDot = true; 8141 break; 8142 case Intrinsic::ppc_altivec_vcmpgtsd_p: 8143 if (Subtarget.hasP8Altivec()) { 8144 CompareOpc = 967; 8145 isDot = true; 8146 } else 8147 return false; 8148 break; 8149 case Intrinsic::ppc_altivec_vcmpgtub_p: 8150 CompareOpc = 518; 8151 isDot = true; 8152 break; 8153 case Intrinsic::ppc_altivec_vcmpgtuh_p: 8154 CompareOpc = 582; 8155 isDot = true; 8156 break; 8157 case Intrinsic::ppc_altivec_vcmpgtuw_p: 8158 CompareOpc = 646; 8159 isDot = true; 8160 break; 8161 case Intrinsic::ppc_altivec_vcmpgtud_p: 8162 if (Subtarget.hasP8Altivec()) { 8163 CompareOpc = 711; 8164 isDot = true; 8165 } else 8166 return false; 8167 break; 8168 8169 // VSX predicate comparisons use the same infrastructure 8170 case Intrinsic::ppc_vsx_xvcmpeqdp_p: 8171 case Intrinsic::ppc_vsx_xvcmpgedp_p: 8172 case Intrinsic::ppc_vsx_xvcmpgtdp_p: 8173 case Intrinsic::ppc_vsx_xvcmpeqsp_p: 8174 case Intrinsic::ppc_vsx_xvcmpgesp_p: 8175 case Intrinsic::ppc_vsx_xvcmpgtsp_p: 8176 if (Subtarget.hasVSX()) { 8177 switch (IntrinsicID) { 8178 case Intrinsic::ppc_vsx_xvcmpeqdp_p: 8179 CompareOpc = 99; 8180 break; 8181 case Intrinsic::ppc_vsx_xvcmpgedp_p: 8182 CompareOpc = 115; 8183 break; 8184 case Intrinsic::ppc_vsx_xvcmpgtdp_p: 8185 CompareOpc = 107; 8186 break; 8187 case Intrinsic::ppc_vsx_xvcmpeqsp_p: 8188 CompareOpc = 67; 8189 break; 8190 case Intrinsic::ppc_vsx_xvcmpgesp_p: 8191 CompareOpc = 83; 8192 break; 8193 case Intrinsic::ppc_vsx_xvcmpgtsp_p: 8194 CompareOpc = 75; 8195 break; 8196 } 8197 isDot = true; 8198 } else 8199 return false; 8200 break; 8201 8202 // Normal Comparisons. 
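  // These use the same opcode values as the corresponding *_p predicates
  // above; only isDot is left false, so the caller emits a plain VCMP node
  // instead of VCMPo plus a CR6 read.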
8203 case Intrinsic::ppc_altivec_vcmpbfp: 8204 CompareOpc = 966; 8205 break; 8206 case Intrinsic::ppc_altivec_vcmpeqfp: 8207 CompareOpc = 198; 8208 break; 8209 case Intrinsic::ppc_altivec_vcmpequb: 8210 CompareOpc = 6; 8211 break; 8212 case Intrinsic::ppc_altivec_vcmpequh: 8213 CompareOpc = 70; 8214 break; 8215 case Intrinsic::ppc_altivec_vcmpequw: 8216 CompareOpc = 134; 8217 break; 8218 case Intrinsic::ppc_altivec_vcmpequd: 8219 if (Subtarget.hasP8Altivec()) 8220 CompareOpc = 199; 8221 else 8222 return false; 8223 break; 8224 case Intrinsic::ppc_altivec_vcmpneb: 8225 case Intrinsic::ppc_altivec_vcmpneh: 8226 case Intrinsic::ppc_altivec_vcmpnew: 8227 case Intrinsic::ppc_altivec_vcmpnezb: 8228 case Intrinsic::ppc_altivec_vcmpnezh: 8229 case Intrinsic::ppc_altivec_vcmpnezw: 8230 if (Subtarget.hasP9Altivec()) 8231 switch (IntrinsicID) { 8232 default: 8233 llvm_unreachable("Unknown comparison intrinsic."); 8234 case Intrinsic::ppc_altivec_vcmpneb: 8235 CompareOpc = 7; 8236 break; 8237 case Intrinsic::ppc_altivec_vcmpneh: 8238 CompareOpc = 71; 8239 break; 8240 case Intrinsic::ppc_altivec_vcmpnew: 8241 CompareOpc = 135; 8242 break; 8243 case Intrinsic::ppc_altivec_vcmpnezb: 8244 CompareOpc = 263; 8245 break; 8246 case Intrinsic::ppc_altivec_vcmpnezh: 8247 CompareOpc = 327; 8248 break; 8249 case Intrinsic::ppc_altivec_vcmpnezw: 8250 CompareOpc = 391; 8251 break; 8252 } 8253 else 8254 return false; 8255 break; 8256 case Intrinsic::ppc_altivec_vcmpgefp: 8257 CompareOpc = 454; 8258 break; 8259 case Intrinsic::ppc_altivec_vcmpgtfp: 8260 CompareOpc = 710; 8261 break; 8262 case Intrinsic::ppc_altivec_vcmpgtsb: 8263 CompareOpc = 774; 8264 break; 8265 case Intrinsic::ppc_altivec_vcmpgtsh: 8266 CompareOpc = 838; 8267 break; 8268 case Intrinsic::ppc_altivec_vcmpgtsw: 8269 CompareOpc = 902; 8270 break; 8271 case Intrinsic::ppc_altivec_vcmpgtsd: 8272 if (Subtarget.hasP8Altivec()) 8273 CompareOpc = 967; 8274 else 8275 return false; 8276 break; 8277 case Intrinsic::ppc_altivec_vcmpgtub: 8278 CompareOpc = 518; 8279 break; 8280 case Intrinsic::ppc_altivec_vcmpgtuh: 8281 CompareOpc = 582; 8282 break; 8283 case Intrinsic::ppc_altivec_vcmpgtuw: 8284 CompareOpc = 646; 8285 break; 8286 case Intrinsic::ppc_altivec_vcmpgtud: 8287 if (Subtarget.hasP8Altivec()) 8288 CompareOpc = 711; 8289 else 8290 return false; 8291 break; 8292 } 8293 return true; 8294 } 8295 8296 /// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom 8297 /// lower, do it, otherwise return null. 8298 SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 8299 SelectionDAG &DAG) const { 8300 unsigned IntrinsicID = 8301 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8302 8303 if (IntrinsicID == Intrinsic::thread_pointer) { 8304 // Reads the thread pointer register, used for __builtin_thread_pointer. 8305 bool is64bit = Subtarget.isPPC64(); 8306 return DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, 8307 is64bit ? MVT::i64 : MVT::i32); 8308 } 8309 8310 // If this is a lowered altivec predicate compare, CompareOpc is set to the 8311 // opcode number of the comparison. 8312 SDLoc dl(Op); 8313 int CompareOpc; 8314 bool isDot; 8315 if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget)) 8316 return SDValue(); // Don't custom lower most intrinsics. 8317 8318 // If this is a non-dot comparison, make the VCMP node and we are done. 
8319 if (!isDot) { 8320 SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(), 8321 Op.getOperand(1), Op.getOperand(2), 8322 DAG.getConstant(CompareOpc, dl, MVT::i32)); 8323 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp); 8324 } 8325 8326 // Create the PPCISD altivec 'dot' comparison node. 8327 SDValue Ops[] = { 8328 Op.getOperand(2), // LHS 8329 Op.getOperand(3), // RHS 8330 DAG.getConstant(CompareOpc, dl, MVT::i32) 8331 }; 8332 EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue }; 8333 SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); 8334 8335 // Now that we have the comparison, emit a copy from the CR to a GPR. 8336 // This is flagged to the above dot comparison. 8337 SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32, 8338 DAG.getRegister(PPC::CR6, MVT::i32), 8339 CompNode.getValue(1)); 8340 8341 // Unpack the result based on how the target uses it. 8342 unsigned BitNo; // Bit # of CR6. 8343 bool InvertBit; // Invert result? 8344 switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) { 8345 default: // Can't happen, don't crash on invalid number though. 8346 case 0: // Return the value of the EQ bit of CR6. 8347 BitNo = 0; InvertBit = false; 8348 break; 8349 case 1: // Return the inverted value of the EQ bit of CR6. 8350 BitNo = 0; InvertBit = true; 8351 break; 8352 case 2: // Return the value of the LT bit of CR6. 8353 BitNo = 2; InvertBit = false; 8354 break; 8355 case 3: // Return the inverted value of the LT bit of CR6. 8356 BitNo = 2; InvertBit = true; 8357 break; 8358 } 8359 8360 // Shift the bit into the low position. 8361 Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags, 8362 DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32)); 8363 // Isolate the bit. 8364 Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags, 8365 DAG.getConstant(1, dl, MVT::i32)); 8366 8367 // If we are supposed to, toggle the bit. 8368 if (InvertBit) 8369 Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags, 8370 DAG.getConstant(1, dl, MVT::i32)); 8371 return Flags; 8372 } 8373 8374 SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, 8375 SelectionDAG &DAG) const { 8376 // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to 8377 // the beginning of the argument list. 8378 int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1; 8379 SDLoc DL(Op); 8380 switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) { 8381 case Intrinsic::ppc_cfence: { 8382 assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument."); 8383 assert(Subtarget.isPPC64() && "Only 64-bit is supported for now."); 8384 return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other, 8385 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, 8386 Op.getOperand(ArgStart + 1)), 8387 Op.getOperand(0)), 8388 0); 8389 } 8390 default: 8391 break; 8392 } 8393 return SDValue(); 8394 } 8395 8396 SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 8397 SelectionDAG &DAG) const { 8398 SDLoc dl(Op); 8399 // For v2i64 (VSX), we can pattern match the v2i32 case (using fp <-> int 8400 // instructions), but for smaller types, we need to first extend up to v2i32 8401 // before going any farther.
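  // For example, with ExtVT == MVT::v2i8 the value is first sign-extended
  // from i8 within each 32-bit lane (as a v4i32), then, after a bitcast back
  // to v2i64, sign-extended from the low 32 bits of each 64-bit lane.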
8402 if (Op.getValueType() == MVT::v2i64) { 8403 EVT ExtVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 8404 if (ExtVT != MVT::v2i32) { 8405 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)); 8406 Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Op, 8407 DAG.getValueType(EVT::getVectorVT(*DAG.getContext(), 8408 ExtVT.getVectorElementType(), 4))); 8409 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Op); 8410 Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v2i64, Op, 8411 DAG.getValueType(MVT::v2i32)); 8412 } 8413 8414 return Op; 8415 } 8416 8417 return SDValue(); 8418 } 8419 8420 SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, 8421 SelectionDAG &DAG) const { 8422 SDLoc dl(Op); 8423 // Create a stack slot that is 16-byte aligned. 8424 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 8425 int FrameIdx = MFI.CreateStackObject(16, 16, false); 8426 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 8427 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 8428 8429 // Store the input value into Value#0 of the stack slot. 8430 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, 8431 MachinePointerInfo()); 8432 // Load it out. 8433 return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo()); 8434 } 8435 8436 SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 8437 SelectionDAG &DAG) const { 8438 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && 8439 "Should only be called for ISD::INSERT_VECTOR_ELT"); 8440 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 8441 // We have legal lowering for constant indices but not for variable ones. 8442 if (C) 8443 return Op; 8444 return SDValue(); 8445 } 8446 8447 SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 8448 SelectionDAG &DAG) const { 8449 SDLoc dl(Op); 8450 SDNode *N = Op.getNode(); 8451 8452 assert(N->getOperand(0).getValueType() == MVT::v4i1 && 8453 "Unknown extract_vector_elt type"); 8454 8455 SDValue Value = N->getOperand(0); 8456 8457 // The first part of this is like the store lowering except that we don't 8458 // need to track the chain. 8459 8460 // The values are now known to be -1 (false) or 1 (true). To convert this 8461 // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). 8462 // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 8463 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); 8464 8465 // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to 8466 // understand how to form the extending load. 8467 SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); 8468 8469 Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); 8470 8471 // Now convert to an integer and store. 
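  // After qvfctiwu each lane holds the value 0 or 1 as an integer word;
  // qvstfiw then writes those words to a 16-byte stack slot, so lane N of
  // the original v4i1 can be re-read below as the i32 at byte offset 4*N.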
8472 Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, 8473 DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), 8474 Value); 8475 8476 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 8477 int FrameIdx = MFI.CreateStackObject(16, 16, false); 8478 MachinePointerInfo PtrInfo = 8479 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 8480 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 8481 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 8482 8483 SDValue StoreChain = DAG.getEntryNode(); 8484 SDValue Ops[] = {StoreChain, 8485 DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), 8486 Value, FIdx}; 8487 SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); 8488 8489 StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, 8490 dl, VTs, Ops, MVT::v4i32, PtrInfo); 8491 8492 // Extract the value requested. 8493 unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 8494 SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); 8495 Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); 8496 8497 SDValue IntVal = 8498 DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset)); 8499 8500 if (!Subtarget.useCRBits()) 8501 return IntVal; 8502 8503 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal); 8504 } 8505 8506 /// Lowering for QPX v4i1 loads 8507 SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, 8508 SelectionDAG &DAG) const { 8509 SDLoc dl(Op); 8510 LoadSDNode *LN = cast<LoadSDNode>(Op.getNode()); 8511 SDValue LoadChain = LN->getChain(); 8512 SDValue BasePtr = LN->getBasePtr(); 8513 8514 if (Op.getValueType() == MVT::v4f64 || 8515 Op.getValueType() == MVT::v4f32) { 8516 EVT MemVT = LN->getMemoryVT(); 8517 unsigned Alignment = LN->getAlignment(); 8518 8519 // If this load is properly aligned, then it is legal. 
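    // Otherwise it is split below into four scalar element loads at
    // successive multiples of the element store size, each given the best
    // alignment that can be guaranteed at its offset (MinAlign), and the
    // results are recombined with a BUILD_VECTOR; for a pre-increment load,
    // only the first element load carries the index update.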
8520 if (Alignment >= MemVT.getStoreSize()) 8521 return Op; 8522 8523 EVT ScalarVT = Op.getValueType().getScalarType(), 8524 ScalarMemVT = MemVT.getScalarType(); 8525 unsigned Stride = ScalarMemVT.getStoreSize(); 8526 8527 SDValue Vals[4], LoadChains[4]; 8528 for (unsigned Idx = 0; Idx < 4; ++Idx) { 8529 SDValue Load; 8530 if (ScalarVT != ScalarMemVT) 8531 Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain, 8532 BasePtr, 8533 LN->getPointerInfo().getWithOffset(Idx * Stride), 8534 ScalarMemVT, MinAlign(Alignment, Idx * Stride), 8535 LN->getMemOperand()->getFlags(), LN->getAAInfo()); 8536 else 8537 Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr, 8538 LN->getPointerInfo().getWithOffset(Idx * Stride), 8539 MinAlign(Alignment, Idx * Stride), 8540 LN->getMemOperand()->getFlags(), LN->getAAInfo()); 8541 8542 if (Idx == 0 && LN->isIndexed()) { 8543 assert(LN->getAddressingMode() == ISD::PRE_INC && 8544 "Unknown addressing mode on vector load"); 8545 Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(), 8546 LN->getAddressingMode()); 8547 } 8548 8549 Vals[Idx] = Load; 8550 LoadChains[Idx] = Load.getValue(1); 8551 8552 BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, 8553 DAG.getConstant(Stride, dl, 8554 BasePtr.getValueType())); 8555 } 8556 8557 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); 8558 SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals); 8559 8560 if (LN->isIndexed()) { 8561 SDValue RetOps[] = { Value, Vals[0].getValue(1), TF }; 8562 return DAG.getMergeValues(RetOps, dl); 8563 } 8564 8565 SDValue RetOps[] = { Value, TF }; 8566 return DAG.getMergeValues(RetOps, dl); 8567 } 8568 8569 assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower"); 8570 assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported"); 8571 8572 // To lower v4i1 from a byte array, we load the byte elements of the 8573 // vector and then reuse the BUILD_VECTOR logic. 8574 8575 SDValue VectElmts[4], VectElmtChains[4]; 8576 for (unsigned i = 0; i < 4; ++i) { 8577 SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); 8578 Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); 8579 8580 VectElmts[i] = DAG.getExtLoad( 8581 ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx, 8582 LN->getPointerInfo().getWithOffset(i), MVT::i8, 8583 /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo()); 8584 VectElmtChains[i] = VectElmts[i].getValue(1); 8585 } 8586 8587 LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains); 8588 SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts); 8589 8590 SDValue RVals[] = { Value, LoadChain }; 8591 return DAG.getMergeValues(RVals, dl); 8592 } 8593 8594 /// Lowering for QPX v4i1 stores 8595 SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, 8596 SelectionDAG &DAG) const { 8597 SDLoc dl(Op); 8598 StoreSDNode *SN = cast<StoreSDNode>(Op.getNode()); 8599 SDValue StoreChain = SN->getChain(); 8600 SDValue BasePtr = SN->getBasePtr(); 8601 SDValue Value = SN->getValue(); 8602 8603 if (Value.getValueType() == MVT::v4f64 || 8604 Value.getValueType() == MVT::v4f32) { 8605 EVT MemVT = SN->getMemoryVT(); 8606 unsigned Alignment = SN->getAlignment(); 8607 8608 // If this store is properly aligned, then it is legal. 
8609 if (Alignment >= MemVT.getStoreSize()) 8610 return Op; 8611 8612 EVT ScalarVT = Value.getValueType().getScalarType(), 8613 ScalarMemVT = MemVT.getScalarType(); 8614 unsigned Stride = ScalarMemVT.getStoreSize(); 8615 8616 SDValue Stores[4]; 8617 for (unsigned Idx = 0; Idx < 4; ++Idx) { 8618 SDValue Ex = DAG.getNode( 8619 ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, 8620 DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout()))); 8621 SDValue Store; 8622 if (ScalarVT != ScalarMemVT) 8623 Store = 8624 DAG.getTruncStore(StoreChain, dl, Ex, BasePtr, 8625 SN->getPointerInfo().getWithOffset(Idx * Stride), 8626 ScalarMemVT, MinAlign(Alignment, Idx * Stride), 8627 SN->getMemOperand()->getFlags(), SN->getAAInfo()); 8628 else 8629 Store = DAG.getStore(StoreChain, dl, Ex, BasePtr, 8630 SN->getPointerInfo().getWithOffset(Idx * Stride), 8631 MinAlign(Alignment, Idx * Stride), 8632 SN->getMemOperand()->getFlags(), SN->getAAInfo()); 8633 8634 if (Idx == 0 && SN->isIndexed()) { 8635 assert(SN->getAddressingMode() == ISD::PRE_INC && 8636 "Unknown addressing mode on vector store"); 8637 Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(), 8638 SN->getAddressingMode()); 8639 } 8640 8641 BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, 8642 DAG.getConstant(Stride, dl, 8643 BasePtr.getValueType())); 8644 Stores[Idx] = Store; 8645 } 8646 8647 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); 8648 8649 if (SN->isIndexed()) { 8650 SDValue RetOps[] = { TF, Stores[0].getValue(1) }; 8651 return DAG.getMergeValues(RetOps, dl); 8652 } 8653 8654 return TF; 8655 } 8656 8657 assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported"); 8658 assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower"); 8659 8660 // The values are now known to be -1 (false) or 1 (true). To convert this 8661 // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). 8662 // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 8663 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); 8664 8665 // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to 8666 // understand how to form the extending load. 8667 SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); 8668 8669 Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); 8670 8671 // Now convert to an integer and store. 8672 Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, 8673 DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), 8674 Value); 8675 8676 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 8677 int FrameIdx = MFI.CreateStackObject(16, 16, false); 8678 MachinePointerInfo PtrInfo = 8679 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 8680 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 8681 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 8682 8683 SDValue Ops[] = {StoreChain, 8684 DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), 8685 Value, FIdx}; 8686 SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); 8687 8688 StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, 8689 dl, VTs, Ops, MVT::v4i32, PtrInfo); 8690 8691 // Move data into the byte array. 
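  // The qvstfiw above wrote one 32-bit word per lane at offsets 0, 4, 8 and
  // 12 of the stack slot; each word is now re-loaded and truncate-stored as
  // a single byte at offset i of the original store address.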
8692 SDValue Loads[4], LoadChains[4]; 8693 for (unsigned i = 0; i < 4; ++i) { 8694 unsigned Offset = 4*i; 8695 SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); 8696 Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); 8697 8698 Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx, 8699 PtrInfo.getWithOffset(Offset)); 8700 LoadChains[i] = Loads[i].getValue(1); 8701 } 8702 8703 StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); 8704 8705 SDValue Stores[4]; 8706 for (unsigned i = 0; i < 4; ++i) { 8707 SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); 8708 Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); 8709 8710 Stores[i] = DAG.getTruncStore( 8711 StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i), 8712 MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(), 8713 SN->getAAInfo()); 8714 } 8715 8716 StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); 8717 8718 return StoreChain; 8719 } 8720 8721 SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { 8722 SDLoc dl(Op); 8723 if (Op.getValueType() == MVT::v4i32) { 8724 SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); 8725 8726 SDValue Zero = BuildSplatI( 0, 1, MVT::v4i32, DAG, dl); 8727 SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt. 8728 8729 SDValue RHSSwap = // = vrlw RHS, 16 8730 BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl); 8731 8732 // Shrinkify inputs to v8i16. 8733 LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS); 8734 RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS); 8735 RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap); 8736 8737 // Low parts multiplied together, generating 32-bit results (we ignore the 8738 // top parts). 8739 SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh, 8740 LHS, RHS, DAG, dl, MVT::v4i32); 8741 8742 SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm, 8743 LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32); 8744 // Shift the high parts up 16 bits. 8745 HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd, 8746 Neg16, DAG, dl); 8747 return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd); 8748 } else if (Op.getValueType() == MVT::v8i16) { 8749 SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); 8750 8751 SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl); 8752 8753 return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm, 8754 LHS, RHS, Zero, DAG, dl); 8755 } else if (Op.getValueType() == MVT::v16i8) { 8756 SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); 8757 bool isLittleEndian = Subtarget.isLittleEndian(); 8758 8759 // Multiply the even 8-bit parts, producing 16-bit sums. 8760 SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub, 8761 LHS, RHS, DAG, dl, MVT::v8i16); 8762 EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts); 8763 8764 // Multiply the odd 8-bit parts, producing 16-bit sums. 8765 SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub, 8766 LHS, RHS, DAG, dl, MVT::v8i16); 8767 OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts); 8768 8769 // Merge the results together. Because vmuleub and vmuloub are 8770 // instructions with a big-endian bias, we must reverse the 8771 // element numbering and reverse the meaning of "odd" and "even" 8772 // when generating little endian code. 
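    // Each 16-bit product keeps the 8-bit result in its low byte: with
    // big-endian numbering that is byte 2*i+1 of halfword i, with
    // little-endian numbering it is byte 2*i. So on big-endian, result byte
    // 0 comes from byte 1 of EvenParts; on little-endian it comes from byte
    // 0 of OddParts (with the shuffle operands swapped accordingly).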
8773 int Ops[16]; 8774 for (unsigned i = 0; i != 8; ++i) { 8775 if (isLittleEndian) { 8776 Ops[i*2 ] = 2*i; 8777 Ops[i*2+1] = 2*i+16; 8778 } else { 8779 Ops[i*2 ] = 2*i+1; 8780 Ops[i*2+1] = 2*i+1+16; 8781 } 8782 } 8783 if (isLittleEndian) 8784 return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops); 8785 else 8786 return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops); 8787 } else { 8788 llvm_unreachable("Unknown mul to lower!"); 8789 } 8790 } 8791 8792 /// LowerOperation - Provide custom lowering hooks for some operations. 8793 /// 8794 SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 8795 switch (Op.getOpcode()) { 8796 default: llvm_unreachable("Wasn't expecting to be able to lower this!"); 8797 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 8798 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 8799 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 8800 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 8801 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 8802 case ISD::SETCC: return LowerSETCC(Op, DAG); 8803 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); 8804 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); 8805 case ISD::VASTART: 8806 return LowerVASTART(Op, DAG); 8807 8808 case ISD::VAARG: 8809 return LowerVAARG(Op, DAG); 8810 8811 case ISD::VACOPY: 8812 return LowerVACOPY(Op, DAG); 8813 8814 case ISD::STACKRESTORE: 8815 return LowerSTACKRESTORE(Op, DAG); 8816 8817 case ISD::DYNAMIC_STACKALLOC: 8818 return LowerDYNAMIC_STACKALLOC(Op, DAG); 8819 8820 case ISD::GET_DYNAMIC_AREA_OFFSET: 8821 return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG); 8822 8823 case ISD::EH_DWARF_CFA: 8824 return LowerEH_DWARF_CFA(Op, DAG); 8825 8826 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); 8827 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); 8828 8829 case ISD::LOAD: return LowerLOAD(Op, DAG); 8830 case ISD::STORE: return LowerSTORE(Op, DAG); 8831 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); 8832 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 8833 case ISD::FP_TO_UINT: 8834 case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, 8835 SDLoc(Op)); 8836 case ISD::UINT_TO_FP: 8837 case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 8838 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 8839 8840 // Lower 64-bit shifts. 8841 case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG); 8842 case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG); 8843 case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG); 8844 8845 // Vector-related lowering. 8846 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 8847 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 8848 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 8849 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 8850 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); 8851 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 8852 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 8853 case ISD::MUL: return LowerMUL(Op, DAG); 8854 8855 // For counter-based loop handling. 8856 case ISD::INTRINSIC_W_CHAIN: return SDValue(); 8857 8858 // Frame & Return address. 
8859 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 8860 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 8861 8862 case ISD::INTRINSIC_VOID: 8863 return LowerINTRINSIC_VOID(Op, DAG); 8864 } 8865 } 8866 8867 void PPCTargetLowering::ReplaceNodeResults(SDNode *N, 8868 SmallVectorImpl<SDValue>&Results, 8869 SelectionDAG &DAG) const { 8870 SDLoc dl(N); 8871 switch (N->getOpcode()) { 8872 default: 8873 llvm_unreachable("Do not know how to custom type legalize this operation!"); 8874 case ISD::READCYCLECOUNTER: { 8875 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 8876 SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0)); 8877 8878 Results.push_back(RTB); 8879 Results.push_back(RTB.getValue(1)); 8880 Results.push_back(RTB.getValue(2)); 8881 break; 8882 } 8883 case ISD::INTRINSIC_W_CHAIN: { 8884 if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 8885 Intrinsic::ppc_is_decremented_ctr_nonzero) 8886 break; 8887 8888 assert(N->getValueType(0) == MVT::i1 && 8889 "Unexpected result type for CTR decrement intrinsic"); 8890 EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), 8891 N->getValueType(0)); 8892 SDVTList VTs = DAG.getVTList(SVT, MVT::Other); 8893 SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0), 8894 N->getOperand(1)); 8895 8896 Results.push_back(NewInt); 8897 Results.push_back(NewInt.getValue(1)); 8898 break; 8899 } 8900 case ISD::VAARG: { 8901 if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64()) 8902 return; 8903 8904 EVT VT = N->getValueType(0); 8905 8906 if (VT == MVT::i64) { 8907 SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG); 8908 8909 Results.push_back(NewNode); 8910 Results.push_back(NewNode.getValue(1)); 8911 } 8912 return; 8913 } 8914 case ISD::FP_ROUND_INREG: { 8915 assert(N->getValueType(0) == MVT::ppcf128); 8916 assert(N->getOperand(0).getValueType() == MVT::ppcf128); 8917 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, 8918 MVT::f64, N->getOperand(0), 8919 DAG.getIntPtrConstant(0, dl)); 8920 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, 8921 MVT::f64, N->getOperand(0), 8922 DAG.getIntPtrConstant(1, dl)); 8923 8924 // Add the two halves of the long double in round-to-zero mode. 8925 SDValue FPreg = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi); 8926 8927 // We know the low half is about to be thrown away, so just use something 8928 // convenient. 8929 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128, 8930 FPreg, FPreg)); 8931 return; 8932 } 8933 case ISD::FP_TO_SINT: 8934 case ISD::FP_TO_UINT: 8935 // LowerFP_TO_INT() can only handle f32 and f64. 
8936 if (N->getOperand(0).getValueType() == MVT::ppcf128) 8937 return; 8938 Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl)); 8939 return; 8940 } 8941 } 8942 8943 //===----------------------------------------------------------------------===// 8944 // Other Lowering Code 8945 //===----------------------------------------------------------------------===// 8946 8947 static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) { 8948 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 8949 Function *Func = Intrinsic::getDeclaration(M, Id); 8950 return Builder.CreateCall(Func, {}); 8951 } 8952 8953 // The mappings for emitLeading/TrailingFence is taken from 8954 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html 8955 Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, 8956 Instruction *Inst, 8957 AtomicOrdering Ord) const { 8958 if (Ord == AtomicOrdering::SequentiallyConsistent) 8959 return callIntrinsic(Builder, Intrinsic::ppc_sync); 8960 if (isReleaseOrStronger(Ord)) 8961 return callIntrinsic(Builder, Intrinsic::ppc_lwsync); 8962 return nullptr; 8963 } 8964 8965 Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, 8966 Instruction *Inst, 8967 AtomicOrdering Ord) const { 8968 if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) { 8969 // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and 8970 // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html 8971 // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. 8972 if (isa<LoadInst>(Inst) && Subtarget.isPPC64()) 8973 return Builder.CreateCall( 8974 Intrinsic::getDeclaration( 8975 Builder.GetInsertBlock()->getParent()->getParent(), 8976 Intrinsic::ppc_cfence, {Inst->getType()}), 8977 {Inst}); 8978 // FIXME: Can use isync for rmw operation. 8979 return callIntrinsic(Builder, Intrinsic::ppc_lwsync); 8980 } 8981 return nullptr; 8982 } 8983 8984 MachineBasicBlock * 8985 PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, 8986 unsigned AtomicSize, 8987 unsigned BinOpcode, 8988 unsigned CmpOpcode, 8989 unsigned CmpPred) const { 8990 // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. 8991 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 8992 8993 auto LoadMnemonic = PPC::LDARX; 8994 auto StoreMnemonic = PPC::STDCX; 8995 switch (AtomicSize) { 8996 default: 8997 llvm_unreachable("Unexpected size of atomic entity"); 8998 case 1: 8999 LoadMnemonic = PPC::LBARX; 9000 StoreMnemonic = PPC::STBCX; 9001 assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4"); 9002 break; 9003 case 2: 9004 LoadMnemonic = PPC::LHARX; 9005 StoreMnemonic = PPC::STHCX; 9006 assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4"); 9007 break; 9008 case 4: 9009 LoadMnemonic = PPC::LWARX; 9010 StoreMnemonic = PPC::STWCX; 9011 break; 9012 case 8: 9013 LoadMnemonic = PPC::LDARX; 9014 StoreMnemonic = PPC::STDCX; 9015 break; 9016 } 9017 9018 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9019 MachineFunction *F = BB->getParent(); 9020 MachineFunction::iterator It = ++BB->getIterator(); 9021 9022 unsigned dest = MI.getOperand(0).getReg(); 9023 unsigned ptrA = MI.getOperand(1).getReg(); 9024 unsigned ptrB = MI.getOperand(2).getReg(); 9025 unsigned incr = MI.getOperand(3).getReg(); 9026 DebugLoc dl = MI.getDebugLoc(); 9027 9028 MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); 9029 MachineBasicBlock *loop2MBB = 9030 CmpOpcode ? 
F->CreateMachineBasicBlock(LLVM_BB) : nullptr; 9031 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); 9032 F->insert(It, loopMBB); 9033 if (CmpOpcode) 9034 F->insert(It, loop2MBB); 9035 F->insert(It, exitMBB); 9036 exitMBB->splice(exitMBB->begin(), BB, 9037 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9038 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 9039 9040 MachineRegisterInfo &RegInfo = F->getRegInfo(); 9041 unsigned TmpReg = (!BinOpcode) ? incr : 9042 RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass 9043 : &PPC::GPRCRegClass); 9044 9045 // thisMBB: 9046 // ... 9047 // fallthrough --> loopMBB 9048 BB->addSuccessor(loopMBB); 9049 9050 // loopMBB: 9051 // l[wd]arx dest, ptr 9052 // add r0, dest, incr 9053 // st[wd]cx. r0, ptr 9054 // bne- loopMBB 9055 // fallthrough --> exitMBB 9056 9057 // For max/min... 9058 // loopMBB: 9059 // l[wd]arx dest, ptr 9060 // cmpl?[wd] incr, dest 9061 // bgt exitMBB 9062 // loop2MBB: 9063 // st[wd]cx. dest, ptr 9064 // bne- loopMBB 9065 // fallthrough --> exitMBB 9066 9067 BB = loopMBB; 9068 BuildMI(BB, dl, TII->get(LoadMnemonic), dest) 9069 .addReg(ptrA).addReg(ptrB); 9070 if (BinOpcode) 9071 BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest); 9072 if (CmpOpcode) { 9073 // Signed comparisons of byte or halfword values must be sign-extended. 9074 if (CmpOpcode == PPC::CMPW && AtomicSize < 4) { 9075 unsigned ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); 9076 BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH), 9077 ExtReg).addReg(dest); 9078 BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) 9079 .addReg(incr).addReg(ExtReg); 9080 } else 9081 BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) 9082 .addReg(incr).addReg(dest); 9083 9084 BuildMI(BB, dl, TII->get(PPC::BCC)) 9085 .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB); 9086 BB->addSuccessor(loop2MBB); 9087 BB->addSuccessor(exitMBB); 9088 BB = loop2MBB; 9089 } 9090 BuildMI(BB, dl, TII->get(StoreMnemonic)) 9091 .addReg(TmpReg).addReg(ptrA).addReg(ptrB); 9092 BuildMI(BB, dl, TII->get(PPC::BCC)) 9093 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); 9094 BB->addSuccessor(loopMBB); 9095 BB->addSuccessor(exitMBB); 9096 9097 // exitMBB: 9098 // ... 9099 BB = exitMBB; 9100 return BB; 9101 } 9102 9103 MachineBasicBlock * 9104 PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr &MI, 9105 MachineBasicBlock *BB, 9106 bool is8bit, // operation 9107 unsigned BinOpcode, 9108 unsigned CmpOpcode, 9109 unsigned CmpPred) const { 9110 // If we support part-word atomic mnemonics, just use them 9111 if (Subtarget.hasPartwordAtomics()) 9112 return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, 9113 CmpOpcode, CmpPred); 9114 9115 // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. 9116 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 9117 // In 64 bit mode we have to use 64 bits for addresses, even though the 9118 // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address 9119 // registers without caring whether they're 32 or 64, but here we're 9120 // doing actual arithmetic on the addresses. 9121 bool is64bit = Subtarget.isPPC64(); 9122 bool isLittleEndian = Subtarget.isLittleEndian(); 9123 unsigned ZeroReg = is64bit ? 
PPC::ZERO8 : PPC::ZERO; 9124 9125 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9126 MachineFunction *F = BB->getParent(); 9127 MachineFunction::iterator It = ++BB->getIterator(); 9128 9129 unsigned dest = MI.getOperand(0).getReg(); 9130 unsigned ptrA = MI.getOperand(1).getReg(); 9131 unsigned ptrB = MI.getOperand(2).getReg(); 9132 unsigned incr = MI.getOperand(3).getReg(); 9133 DebugLoc dl = MI.getDebugLoc(); 9134 9135 MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); 9136 MachineBasicBlock *loop2MBB = 9137 CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr; 9138 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); 9139 F->insert(It, loopMBB); 9140 if (CmpOpcode) 9141 F->insert(It, loop2MBB); 9142 F->insert(It, exitMBB); 9143 exitMBB->splice(exitMBB->begin(), BB, 9144 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9145 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 9146 9147 MachineRegisterInfo &RegInfo = F->getRegInfo(); 9148 const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass 9149 : &PPC::GPRCRegClass; 9150 unsigned PtrReg = RegInfo.createVirtualRegister(RC); 9151 unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); 9152 unsigned ShiftReg = 9153 isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC); 9154 unsigned Incr2Reg = RegInfo.createVirtualRegister(RC); 9155 unsigned MaskReg = RegInfo.createVirtualRegister(RC); 9156 unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); 9157 unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); 9158 unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); 9159 unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC); 9160 unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); 9161 unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); 9162 unsigned Ptr1Reg; 9163 unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC); 9164 9165 // thisMBB: 9166 // ... 9167 // fallthrough --> loopMBB 9168 BB->addSuccessor(loopMBB); 9169 9170 // The 4-byte load must be aligned, while a char or short may be 9171 // anywhere in the word. Hence all this nasty bookkeeping code. 9172 // add ptr1, ptrA, ptrB [copy if ptrA==0] 9173 // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] 9174 // xori shift, shift1, 24 [16] 9175 // rlwinm ptr, ptr1, 0, 0, 29 9176 // slw incr2, incr, shift 9177 // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] 9178 // slw mask, mask2, shift 9179 // loopMBB: 9180 // lwarx tmpDest, ptr 9181 // add tmp, tmpDest, incr2 9182 // andc tmp2, tmpDest, mask 9183 // and tmp3, tmp, mask 9184 // or tmp4, tmp3, tmp2 9185 // stwcx. tmp4, ptr 9186 // bne- loopMBB 9187 // fallthrough --> exitMBB 9188 // srw dest, tmpDest, shift 9189 if (ptrA != ZeroReg) { 9190 Ptr1Reg = RegInfo.createVirtualRegister(RC); 9191 BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) 9192 .addReg(ptrA).addReg(ptrB); 9193 } else { 9194 Ptr1Reg = ptrB; 9195 } 9196 BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) 9197 .addImm(3).addImm(27).addImm(is8bit ? 28 : 27); 9198 if (!isLittleEndian) 9199 BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) 9200 .addReg(Shift1Reg).addImm(is8bit ? 
24 : 16); 9201 if (is64bit) 9202 BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) 9203 .addReg(Ptr1Reg).addImm(0).addImm(61); 9204 else 9205 BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) 9206 .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); 9207 BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg) 9208 .addReg(incr).addReg(ShiftReg); 9209 if (is8bit) 9210 BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); 9211 else { 9212 BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); 9213 BuildMI(BB, dl, TII->get(PPC::ORI),Mask2Reg).addReg(Mask3Reg).addImm(65535); 9214 } 9215 BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) 9216 .addReg(Mask2Reg).addReg(ShiftReg); 9217 9218 BB = loopMBB; 9219 BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) 9220 .addReg(ZeroReg).addReg(PtrReg); 9221 if (BinOpcode) 9222 BuildMI(BB, dl, TII->get(BinOpcode), TmpReg) 9223 .addReg(Incr2Reg).addReg(TmpDestReg); 9224 BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg) 9225 .addReg(TmpDestReg).addReg(MaskReg); 9226 BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg) 9227 .addReg(TmpReg).addReg(MaskReg); 9228 if (CmpOpcode) { 9229 // For unsigned comparisons, we can directly compare the shifted values. 9230 // For signed comparisons we shift and sign extend. 9231 unsigned SReg = RegInfo.createVirtualRegister(RC); 9232 BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), SReg) 9233 .addReg(TmpDestReg).addReg(MaskReg); 9234 unsigned ValueReg = SReg; 9235 unsigned CmpReg = Incr2Reg; 9236 if (CmpOpcode == PPC::CMPW) { 9237 ValueReg = RegInfo.createVirtualRegister(RC); 9238 BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg) 9239 .addReg(SReg).addReg(ShiftReg); 9240 unsigned ValueSReg = RegInfo.createVirtualRegister(RC); 9241 BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg) 9242 .addReg(ValueReg); 9243 ValueReg = ValueSReg; 9244 CmpReg = incr; 9245 } 9246 BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) 9247 .addReg(CmpReg).addReg(ValueReg); 9248 BuildMI(BB, dl, TII->get(PPC::BCC)) 9249 .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB); 9250 BB->addSuccessor(loop2MBB); 9251 BB->addSuccessor(exitMBB); 9252 BB = loop2MBB; 9253 } 9254 BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg) 9255 .addReg(Tmp3Reg).addReg(Tmp2Reg); 9256 BuildMI(BB, dl, TII->get(PPC::STWCX)) 9257 .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg); 9258 BuildMI(BB, dl, TII->get(PPC::BCC)) 9259 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); 9260 BB->addSuccessor(loopMBB); 9261 BB->addSuccessor(exitMBB); 9262 9263 // exitMBB: 9264 // ... 
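  // TmpDestReg still holds the word as loaded by lwarx (the value from before
  // the operation), so shifting it right by ShiftReg places the old
  // byte/halfword in the low bits, which is what an atomic read-modify-write
  // must return. For illustration: on big endian, a byte at offset 1 within
  // its word gives shift1 = 8 and shift = 8 ^ 24 = 16, i.e. the field lives in
  // bits 23:16, and the srw by 16 below recovers it.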
9265 BB = exitMBB; 9266 BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg) 9267 .addReg(ShiftReg); 9268 return BB; 9269 } 9270 9271 llvm::MachineBasicBlock * 9272 PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, 9273 MachineBasicBlock *MBB) const { 9274 DebugLoc DL = MI.getDebugLoc(); 9275 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 9276 const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); 9277 9278 MachineFunction *MF = MBB->getParent(); 9279 MachineRegisterInfo &MRI = MF->getRegInfo(); 9280 9281 const BasicBlock *BB = MBB->getBasicBlock(); 9282 MachineFunction::iterator I = ++MBB->getIterator(); 9283 9284 // Memory Reference 9285 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); 9286 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); 9287 9288 unsigned DstReg = MI.getOperand(0).getReg(); 9289 const TargetRegisterClass *RC = MRI.getRegClass(DstReg); 9290 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); 9291 unsigned mainDstReg = MRI.createVirtualRegister(RC); 9292 unsigned restoreDstReg = MRI.createVirtualRegister(RC); 9293 9294 MVT PVT = getPointerTy(MF->getDataLayout()); 9295 assert((PVT == MVT::i64 || PVT == MVT::i32) && 9296 "Invalid Pointer Size!"); 9297 // For v = setjmp(buf), we generate 9298 // 9299 // thisMBB: 9300 // SjLjSetup mainMBB 9301 // bl mainMBB 9302 // v_restore = 1 9303 // b sinkMBB 9304 // 9305 // mainMBB: 9306 // buf[LabelOffset] = LR 9307 // v_main = 0 9308 // 9309 // sinkMBB: 9310 // v = phi(main, restore) 9311 // 9312 9313 MachineBasicBlock *thisMBB = MBB; 9314 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 9315 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 9316 MF->insert(I, mainMBB); 9317 MF->insert(I, sinkMBB); 9318 9319 MachineInstrBuilder MIB; 9320 9321 // Transfer the remainder of BB and its successor edges to sinkMBB. 9322 sinkMBB->splice(sinkMBB->begin(), MBB, 9323 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 9324 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 9325 9326 // Note that the structure of the jmp_buf used here is not compatible 9327 // with that used by libc, and is not designed to be. Specifically, it 9328 // stores only those 'reserved' registers that LLVM does not otherwise 9329 // understand how to spill. Also, by convention, by the time this 9330 // intrinsic is called, Clang has already stored the frame address in the 9331 // first slot of the buffer and stack address in the third. Following the 9332 // X86 target code, we'll store the jump address in the second slot. We also 9333 // need to save the TOC pointer (R2) to handle jumps between shared 9334 // libraries, and that will be stored in the fourth slot. The thread 9335 // identifier (R13) is not affected. 9336 9337 // thisMBB: 9338 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 9339 const int64_t TOCOffset = 3 * PVT.getStoreSize(); 9340 const int64_t BPOffset = 4 * PVT.getStoreSize(); 9341 9342 // Prepare IP either in reg. 9343 const TargetRegisterClass *PtrRC = getRegClassFor(PVT); 9344 unsigned LabelReg = MRI.createVirtualRegister(PtrRC); 9345 unsigned BufReg = MI.getOperand(1).getReg(); 9346 9347 if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) { 9348 setUsesTOCBasePtr(*MBB->getParent()); 9349 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) 9350 .addReg(PPC::X2) 9351 .addImm(TOCOffset) 9352 .addReg(BufReg); 9353 MIB.setMemRefs(MMOBegin, MMOEnd); 9354 } 9355 9356 // Naked functions never have a base pointer, and so we use r1. 
For all 9357 // other functions, this decision must be delayed until during PEI. 9358 unsigned BaseReg; 9359 if (MF->getFunction()->hasFnAttribute(Attribute::Naked)) 9360 BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1; 9361 else 9362 BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP; 9363 9364 MIB = BuildMI(*thisMBB, MI, DL, 9365 TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW)) 9366 .addReg(BaseReg) 9367 .addImm(BPOffset) 9368 .addReg(BufReg); 9369 MIB.setMemRefs(MMOBegin, MMOEnd); 9370 9371 // Setup 9372 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); 9373 MIB.addRegMask(TRI->getNoPreservedMask()); 9374 9375 BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); 9376 9377 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup)) 9378 .addMBB(mainMBB); 9379 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB); 9380 9381 thisMBB->addSuccessor(mainMBB, BranchProbability::getZero()); 9382 thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne()); 9383 9384 // mainMBB: 9385 // mainDstReg = 0 9386 MIB = 9387 BuildMI(mainMBB, DL, 9388 TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); 9389 9390 // Store IP 9391 if (Subtarget.isPPC64()) { 9392 MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD)) 9393 .addReg(LabelReg) 9394 .addImm(LabelOffset) 9395 .addReg(BufReg); 9396 } else { 9397 MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW)) 9398 .addReg(LabelReg) 9399 .addImm(LabelOffset) 9400 .addReg(BufReg); 9401 } 9402 9403 MIB.setMemRefs(MMOBegin, MMOEnd); 9404 9405 BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0); 9406 mainMBB->addSuccessor(sinkMBB); 9407 9408 // sinkMBB: 9409 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 9410 TII->get(PPC::PHI), DstReg) 9411 .addReg(mainDstReg).addMBB(mainMBB) 9412 .addReg(restoreDstReg).addMBB(thisMBB); 9413 9414 MI.eraseFromParent(); 9415 return sinkMBB; 9416 } 9417 9418 MachineBasicBlock * 9419 PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, 9420 MachineBasicBlock *MBB) const { 9421 DebugLoc DL = MI.getDebugLoc(); 9422 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 9423 9424 MachineFunction *MF = MBB->getParent(); 9425 MachineRegisterInfo &MRI = MF->getRegInfo(); 9426 9427 // Memory Reference 9428 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); 9429 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); 9430 9431 MVT PVT = getPointerTy(MF->getDataLayout()); 9432 assert((PVT == MVT::i64 || PVT == MVT::i32) && 9433 "Invalid Pointer Size!"); 9434 9435 const TargetRegisterClass *RC = 9436 (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; 9437 unsigned Tmp = MRI.createVirtualRegister(RC); 9438 // Since FP is only updated here but NOT referenced, it's treated as GPR. 9439 unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; 9440 unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1; 9441 unsigned BP = 9442 (PVT == MVT::i64) 9443 ? PPC::X30 9444 : (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29 9445 : PPC::R30); 9446 9447 MachineInstrBuilder MIB; 9448 9449 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 9450 const int64_t SPOffset = 2 * PVT.getStoreSize(); 9451 const int64_t TOCOffset = 3 * PVT.getStoreSize(); 9452 const int64_t BPOffset = 4 * PVT.getStoreSize(); 9453 9454 unsigned BufReg = MI.getOperand(0).getReg(); 9455 9456 // Reload FP (the jumped-to function may not have had a 9457 // frame pointer, and if so, then its r31 will be restored 9458 // as necessary). 
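  // Buffer layout assumed here (pointer-sized slots, matching the stores in
  // emitEHSjLjSetJmp above): slot 0 = frame address and slot 2 = stack pointer
  // (already written by the front end), slot 1 = saved IP, slot 3 = TOC
  // pointer (64-bit SVR4 only), and slot 4 = base pointer. The reloads below
  // read those slots back.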
9459 if (PVT == MVT::i64) { 9460 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP) 9461 .addImm(0) 9462 .addReg(BufReg); 9463 } else { 9464 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP) 9465 .addImm(0) 9466 .addReg(BufReg); 9467 } 9468 MIB.setMemRefs(MMOBegin, MMOEnd); 9469 9470 // Reload IP 9471 if (PVT == MVT::i64) { 9472 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp) 9473 .addImm(LabelOffset) 9474 .addReg(BufReg); 9475 } else { 9476 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp) 9477 .addImm(LabelOffset) 9478 .addReg(BufReg); 9479 } 9480 MIB.setMemRefs(MMOBegin, MMOEnd); 9481 9482 // Reload SP 9483 if (PVT == MVT::i64) { 9484 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP) 9485 .addImm(SPOffset) 9486 .addReg(BufReg); 9487 } else { 9488 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP) 9489 .addImm(SPOffset) 9490 .addReg(BufReg); 9491 } 9492 MIB.setMemRefs(MMOBegin, MMOEnd); 9493 9494 // Reload BP 9495 if (PVT == MVT::i64) { 9496 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP) 9497 .addImm(BPOffset) 9498 .addReg(BufReg); 9499 } else { 9500 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP) 9501 .addImm(BPOffset) 9502 .addReg(BufReg); 9503 } 9504 MIB.setMemRefs(MMOBegin, MMOEnd); 9505 9506 // Reload TOC 9507 if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) { 9508 setUsesTOCBasePtr(*MBB->getParent()); 9509 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2) 9510 .addImm(TOCOffset) 9511 .addReg(BufReg); 9512 9513 MIB.setMemRefs(MMOBegin, MMOEnd); 9514 } 9515 9516 // Jump 9517 BuildMI(*MBB, MI, DL, 9518 TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp); 9519 BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR)); 9520 9521 MI.eraseFromParent(); 9522 return MBB; 9523 } 9524 9525 MachineBasicBlock * 9526 PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 9527 MachineBasicBlock *BB) const { 9528 if (MI.getOpcode() == TargetOpcode::STACKMAP || 9529 MI.getOpcode() == TargetOpcode::PATCHPOINT) { 9530 if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() && 9531 MI.getOpcode() == TargetOpcode::PATCHPOINT) { 9532 // Call lowering should have added an r2 operand to indicate a dependence 9533 // on the TOC base pointer value. It can't however, because there is no 9534 // way to mark the dependence as implicit there, and so the stackmap code 9535 // will confuse it with a regular operand. Instead, add the dependence 9536 // here. 9537 setUsesTOCBasePtr(*BB->getParent()); 9538 MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true)); 9539 } 9540 9541 return emitPatchPoint(MI, BB); 9542 } 9543 9544 if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 || 9545 MI.getOpcode() == PPC::EH_SjLj_SetJmp64) { 9546 return emitEHSjLjSetJmp(MI, BB); 9547 } else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 || 9548 MI.getOpcode() == PPC::EH_SjLj_LongJmp64) { 9549 return emitEHSjLjLongJmp(MI, BB); 9550 } 9551 9552 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 9553 9554 // To "insert" these instructions we actually have to insert their 9555 // control-flow patterns. 
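  // The block-splitting cases below all follow the same recipe: create the new
  // basic blocks, splice the code that followed the pseudo into a sink block,
  // wire up the CFG successors, emit the real instructions, and, where a value
  // is produced along multiple paths, merge it with a PHI in the sink block.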
9556 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9557 MachineFunction::iterator It = ++BB->getIterator(); 9558 9559 MachineFunction *F = BB->getParent(); 9560 9561 if (MI.getOpcode() == PPC::SELECT_CC_I4 || 9562 MI.getOpcode() == PPC::SELECT_CC_I8 || 9563 MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8) { 9564 SmallVector<MachineOperand, 2> Cond; 9565 if (MI.getOpcode() == PPC::SELECT_CC_I4 || 9566 MI.getOpcode() == PPC::SELECT_CC_I8) 9567 Cond.push_back(MI.getOperand(4)); 9568 else 9569 Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET)); 9570 Cond.push_back(MI.getOperand(1)); 9571 9572 DebugLoc dl = MI.getDebugLoc(); 9573 TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond, 9574 MI.getOperand(2).getReg(), MI.getOperand(3).getReg()); 9575 } else if (MI.getOpcode() == PPC::SELECT_CC_I4 || 9576 MI.getOpcode() == PPC::SELECT_CC_I8 || 9577 MI.getOpcode() == PPC::SELECT_CC_F4 || 9578 MI.getOpcode() == PPC::SELECT_CC_F8 || 9579 MI.getOpcode() == PPC::SELECT_CC_QFRC || 9580 MI.getOpcode() == PPC::SELECT_CC_QSRC || 9581 MI.getOpcode() == PPC::SELECT_CC_QBRC || 9582 MI.getOpcode() == PPC::SELECT_CC_VRRC || 9583 MI.getOpcode() == PPC::SELECT_CC_VSFRC || 9584 MI.getOpcode() == PPC::SELECT_CC_VSSRC || 9585 MI.getOpcode() == PPC::SELECT_CC_VSRC || 9586 MI.getOpcode() == PPC::SELECT_I4 || 9587 MI.getOpcode() == PPC::SELECT_I8 || 9588 MI.getOpcode() == PPC::SELECT_F4 || 9589 MI.getOpcode() == PPC::SELECT_F8 || 9590 MI.getOpcode() == PPC::SELECT_QFRC || 9591 MI.getOpcode() == PPC::SELECT_QSRC || 9592 MI.getOpcode() == PPC::SELECT_QBRC || 9593 MI.getOpcode() == PPC::SELECT_VRRC || 9594 MI.getOpcode() == PPC::SELECT_VSFRC || 9595 MI.getOpcode() == PPC::SELECT_VSSRC || 9596 MI.getOpcode() == PPC::SELECT_VSRC) { 9597 // The incoming instruction knows the destination vreg to set, the 9598 // condition code register to branch on, the true/false values to 9599 // select between, and a branch opcode to use. 9600 9601 // thisMBB: 9602 // ... 9603 // TrueVal = ... 9604 // cmpTY ccX, r1, r2 9605 // bCC copy1MBB 9606 // fallthrough --> copy0MBB 9607 MachineBasicBlock *thisMBB = BB; 9608 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 9609 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 9610 DebugLoc dl = MI.getDebugLoc(); 9611 F->insert(It, copy0MBB); 9612 F->insert(It, sinkMBB); 9613 9614 // Transfer the remainder of BB and its successor edges to sinkMBB. 9615 sinkMBB->splice(sinkMBB->begin(), BB, 9616 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9617 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 9618 9619 // Next, add the true and fallthrough blocks as its successors. 
9620 BB->addSuccessor(copy0MBB); 9621 BB->addSuccessor(sinkMBB); 9622 9623 if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 || 9624 MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || 9625 MI.getOpcode() == PPC::SELECT_QFRC || 9626 MI.getOpcode() == PPC::SELECT_QSRC || 9627 MI.getOpcode() == PPC::SELECT_QBRC || 9628 MI.getOpcode() == PPC::SELECT_VRRC || 9629 MI.getOpcode() == PPC::SELECT_VSFRC || 9630 MI.getOpcode() == PPC::SELECT_VSSRC || 9631 MI.getOpcode() == PPC::SELECT_VSRC) { 9632 BuildMI(BB, dl, TII->get(PPC::BC)) 9633 .addReg(MI.getOperand(1).getReg()) 9634 .addMBB(sinkMBB); 9635 } else { 9636 unsigned SelectPred = MI.getOperand(4).getImm(); 9637 BuildMI(BB, dl, TII->get(PPC::BCC)) 9638 .addImm(SelectPred) 9639 .addReg(MI.getOperand(1).getReg()) 9640 .addMBB(sinkMBB); 9641 } 9642 9643 // copy0MBB: 9644 // %FalseValue = ... 9645 // # fallthrough to sinkMBB 9646 BB = copy0MBB; 9647 9648 // Update machine-CFG edges 9649 BB->addSuccessor(sinkMBB); 9650 9651 // sinkMBB: 9652 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 9653 // ... 9654 BB = sinkMBB; 9655 BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg()) 9656 .addReg(MI.getOperand(3).getReg()) 9657 .addMBB(copy0MBB) 9658 .addReg(MI.getOperand(2).getReg()) 9659 .addMBB(thisMBB); 9660 } else if (MI.getOpcode() == PPC::ReadTB) { 9661 // To read the 64-bit time-base register on a 32-bit target, we read the 9662 // two halves. Should the counter have wrapped while it was being read, we 9663 // need to try again. 9664 // ... 9665 // readLoop: 9666 // mfspr Rx,TBU # load from TBU 9667 // mfspr Ry,TB # load from TB 9668 // mfspr Rz,TBU # load from TBU 9669 // cmpw crX,Rx,Rz # check if 'old'='new' 9670 // bne readLoop # branch if they're not equal 9671 // ... 9672 9673 MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB); 9674 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 9675 DebugLoc dl = MI.getDebugLoc(); 9676 F->insert(It, readMBB); 9677 F->insert(It, sinkMBB); 9678 9679 // Transfer the remainder of BB and its successor edges to sinkMBB. 
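    // (About the read loop emitted below: if the time base ticks over a TBU
    // boundary between the two TBU reads, the cmpw fails and we branch back to
    // re-read, so the {TBU, TB} pair always forms a consistent 64-bit
    // snapshot of the time base.)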
9680 sinkMBB->splice(sinkMBB->begin(), BB, 9681 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9682 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 9683 9684 BB->addSuccessor(readMBB); 9685 BB = readMBB; 9686 9687 MachineRegisterInfo &RegInfo = F->getRegInfo(); 9688 unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); 9689 unsigned LoReg = MI.getOperand(0).getReg(); 9690 unsigned HiReg = MI.getOperand(1).getReg(); 9691 9692 BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269); 9693 BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268); 9694 BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269); 9695 9696 unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); 9697 9698 BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg) 9699 .addReg(HiReg).addReg(ReadAgainReg); 9700 BuildMI(BB, dl, TII->get(PPC::BCC)) 9701 .addImm(PPC::PRED_NE).addReg(CmpReg).addMBB(readMBB); 9702 9703 BB->addSuccessor(readMBB); 9704 BB->addSuccessor(sinkMBB); 9705 } else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8) 9706 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4); 9707 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16) 9708 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4); 9709 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32) 9710 BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4); 9711 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64) 9712 BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8); 9713 9714 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8) 9715 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND); 9716 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16) 9717 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND); 9718 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32) 9719 BB = EmitAtomicBinary(MI, BB, 4, PPC::AND); 9720 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64) 9721 BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8); 9722 9723 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8) 9724 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR); 9725 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16) 9726 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR); 9727 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32) 9728 BB = EmitAtomicBinary(MI, BB, 4, PPC::OR); 9729 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64) 9730 BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8); 9731 9732 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8) 9733 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR); 9734 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16) 9735 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR); 9736 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32) 9737 BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR); 9738 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64) 9739 BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8); 9740 9741 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8) 9742 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND); 9743 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16) 9744 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND); 9745 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32) 9746 BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND); 9747 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64) 9748 BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8); 9749 9750 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8) 9751 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF); 9752 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16) 9753 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF); 9754 else if (MI.getOpcode() == 
PPC::ATOMIC_LOAD_SUB_I32) 9755 BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF); 9756 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64) 9757 BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8); 9758 9759 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8) 9760 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GE); 9761 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16) 9762 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GE); 9763 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32) 9764 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GE); 9765 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64) 9766 BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GE); 9767 9768 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8) 9769 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LE); 9770 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16) 9771 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LE); 9772 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32) 9773 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LE); 9774 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64) 9775 BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LE); 9776 9777 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8) 9778 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GE); 9779 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16) 9780 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GE); 9781 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32) 9782 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GE); 9783 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64) 9784 BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GE); 9785 9786 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8) 9787 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LE); 9788 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16) 9789 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LE); 9790 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32) 9791 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LE); 9792 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64) 9793 BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LE); 9794 9795 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8) 9796 BB = EmitPartwordAtomicBinary(MI, BB, true, 0); 9797 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16) 9798 BB = EmitPartwordAtomicBinary(MI, BB, false, 0); 9799 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32) 9800 BB = EmitAtomicBinary(MI, BB, 4, 0); 9801 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64) 9802 BB = EmitAtomicBinary(MI, BB, 8, 0); 9803 else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 || 9804 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 || 9805 (Subtarget.hasPartwordAtomics() && 9806 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) || 9807 (Subtarget.hasPartwordAtomics() && 9808 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) { 9809 bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64; 9810 9811 auto LoadMnemonic = PPC::LDARX; 9812 auto StoreMnemonic = PPC::STDCX; 9813 switch (MI.getOpcode()) { 9814 default: 9815 llvm_unreachable("Compare and swap of unknown size"); 9816 case PPC::ATOMIC_CMP_SWAP_I8: 9817 LoadMnemonic = PPC::LBARX; 9818 StoreMnemonic = PPC::STBCX; 9819 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); 9820 break; 9821 case PPC::ATOMIC_CMP_SWAP_I16: 9822 LoadMnemonic = PPC::LHARX; 9823 StoreMnemonic = PPC::STHCX; 
9824 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); 9825 break; 9826 case PPC::ATOMIC_CMP_SWAP_I32: 9827 LoadMnemonic = PPC::LWARX; 9828 StoreMnemonic = PPC::STWCX; 9829 break; 9830 case PPC::ATOMIC_CMP_SWAP_I64: 9831 LoadMnemonic = PPC::LDARX; 9832 StoreMnemonic = PPC::STDCX; 9833 break; 9834 } 9835 unsigned dest = MI.getOperand(0).getReg(); 9836 unsigned ptrA = MI.getOperand(1).getReg(); 9837 unsigned ptrB = MI.getOperand(2).getReg(); 9838 unsigned oldval = MI.getOperand(3).getReg(); 9839 unsigned newval = MI.getOperand(4).getReg(); 9840 DebugLoc dl = MI.getDebugLoc(); 9841 9842 MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); 9843 MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); 9844 MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); 9845 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); 9846 F->insert(It, loop1MBB); 9847 F->insert(It, loop2MBB); 9848 F->insert(It, midMBB); 9849 F->insert(It, exitMBB); 9850 exitMBB->splice(exitMBB->begin(), BB, 9851 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9852 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 9853 9854 // thisMBB: 9855 // ... 9856 // fallthrough --> loopMBB 9857 BB->addSuccessor(loop1MBB); 9858 9859 // loop1MBB: 9860 // l[bhwd]arx dest, ptr 9861 // cmp[wd] dest, oldval 9862 // bne- midMBB 9863 // loop2MBB: 9864 // st[bhwd]cx. newval, ptr 9865 // bne- loopMBB 9866 // b exitBB 9867 // midMBB: 9868 // st[bhwd]cx. dest, ptr 9869 // exitBB: 9870 BB = loop1MBB; 9871 BuildMI(BB, dl, TII->get(LoadMnemonic), dest) 9872 .addReg(ptrA).addReg(ptrB); 9873 BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0) 9874 .addReg(oldval).addReg(dest); 9875 BuildMI(BB, dl, TII->get(PPC::BCC)) 9876 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); 9877 BB->addSuccessor(loop2MBB); 9878 BB->addSuccessor(midMBB); 9879 9880 BB = loop2MBB; 9881 BuildMI(BB, dl, TII->get(StoreMnemonic)) 9882 .addReg(newval).addReg(ptrA).addReg(ptrB); 9883 BuildMI(BB, dl, TII->get(PPC::BCC)) 9884 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); 9885 BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); 9886 BB->addSuccessor(loop1MBB); 9887 BB->addSuccessor(exitMBB); 9888 9889 BB = midMBB; 9890 BuildMI(BB, dl, TII->get(StoreMnemonic)) 9891 .addReg(dest).addReg(ptrA).addReg(ptrB); 9892 BB->addSuccessor(exitMBB); 9893 9894 // exitMBB: 9895 // ... 9896 BB = exitMBB; 9897 } else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 || 9898 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) { 9899 // We must use 64-bit registers for addresses when targeting 64-bit, 9900 // since we're actually doing arithmetic on them. Other registers 9901 // can be 32-bit. 
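    // Note: this path is only reached when the subtarget lacks the partword
    // lbarx/lharx and stbcx./sthcx. instructions (that case was handled
    // above), so the byte/halfword compare-and-swap is emulated on the
    // containing aligned word using lwarx/stwcx. and the masking code below.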
9902 bool is64bit = Subtarget.isPPC64(); 9903 bool isLittleEndian = Subtarget.isLittleEndian(); 9904 bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; 9905 9906 unsigned dest = MI.getOperand(0).getReg(); 9907 unsigned ptrA = MI.getOperand(1).getReg(); 9908 unsigned ptrB = MI.getOperand(2).getReg(); 9909 unsigned oldval = MI.getOperand(3).getReg(); 9910 unsigned newval = MI.getOperand(4).getReg(); 9911 DebugLoc dl = MI.getDebugLoc(); 9912 9913 MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); 9914 MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); 9915 MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); 9916 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); 9917 F->insert(It, loop1MBB); 9918 F->insert(It, loop2MBB); 9919 F->insert(It, midMBB); 9920 F->insert(It, exitMBB); 9921 exitMBB->splice(exitMBB->begin(), BB, 9922 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9923 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 9924 9925 MachineRegisterInfo &RegInfo = F->getRegInfo(); 9926 const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass 9927 : &PPC::GPRCRegClass; 9928 unsigned PtrReg = RegInfo.createVirtualRegister(RC); 9929 unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); 9930 unsigned ShiftReg = 9931 isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC); 9932 unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC); 9933 unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC); 9934 unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC); 9935 unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC); 9936 unsigned MaskReg = RegInfo.createVirtualRegister(RC); 9937 unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); 9938 unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); 9939 unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); 9940 unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); 9941 unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); 9942 unsigned Ptr1Reg; 9943 unsigned TmpReg = RegInfo.createVirtualRegister(RC); 9944 unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; 9945 // thisMBB: 9946 // ... 9947 // fallthrough --> loopMBB 9948 BB->addSuccessor(loop1MBB); 9949 9950 // The 4-byte load must be aligned, while a char or short may be 9951 // anywhere in the word. Hence all this nasty bookkeeping code. 9952 // add ptr1, ptrA, ptrB [copy if ptrA==0] 9953 // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] 9954 // xori shift, shift1, 24 [16] 9955 // rlwinm ptr, ptr1, 0, 0, 29 9956 // slw newval2, newval, shift 9957 // slw oldval2, oldval,shift 9958 // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] 9959 // slw mask, mask2, shift 9960 // and newval3, newval2, mask 9961 // and oldval3, oldval2, mask 9962 // loop1MBB: 9963 // lwarx tmpDest, ptr 9964 // and tmp, tmpDest, mask 9965 // cmpw tmp, oldval3 9966 // bne- midMBB 9967 // loop2MBB: 9968 // andc tmp2, tmpDest, mask 9969 // or tmp4, tmp2, newval3 9970 // stwcx. tmp4, ptr 9971 // bne- loop1MBB 9972 // b exitBB 9973 // midMBB: 9974 // stwcx. tmpDest, ptr 9975 // exitBB: 9976 // srw dest, tmpDest, shift 9977 if (ptrA != ZeroReg) { 9978 Ptr1Reg = RegInfo.createVirtualRegister(RC); 9979 BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) 9980 .addReg(ptrA).addReg(ptrB); 9981 } else { 9982 Ptr1Reg = ptrB; 9983 } 9984 BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) 9985 .addImm(3).addImm(27).addImm(is8bit ? 28 : 27); 9986 if (!isLittleEndian) 9987 BuildMI(BB, dl, TII->get(is64bit ? 
PPC::XORI8 : PPC::XORI), ShiftReg) 9988 .addReg(Shift1Reg).addImm(is8bit ? 24 : 16); 9989 if (is64bit) 9990 BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) 9991 .addReg(Ptr1Reg).addImm(0).addImm(61); 9992 else 9993 BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) 9994 .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); 9995 BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg) 9996 .addReg(newval).addReg(ShiftReg); 9997 BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg) 9998 .addReg(oldval).addReg(ShiftReg); 9999 if (is8bit) 10000 BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); 10001 else { 10002 BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); 10003 BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg) 10004 .addReg(Mask3Reg).addImm(65535); 10005 } 10006 BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) 10007 .addReg(Mask2Reg).addReg(ShiftReg); 10008 BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg) 10009 .addReg(NewVal2Reg).addReg(MaskReg); 10010 BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg) 10011 .addReg(OldVal2Reg).addReg(MaskReg); 10012 10013 BB = loop1MBB; 10014 BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) 10015 .addReg(ZeroReg).addReg(PtrReg); 10016 BuildMI(BB, dl, TII->get(PPC::AND),TmpReg) 10017 .addReg(TmpDestReg).addReg(MaskReg); 10018 BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0) 10019 .addReg(TmpReg).addReg(OldVal3Reg); 10020 BuildMI(BB, dl, TII->get(PPC::BCC)) 10021 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); 10022 BB->addSuccessor(loop2MBB); 10023 BB->addSuccessor(midMBB); 10024 10025 BB = loop2MBB; 10026 BuildMI(BB, dl, TII->get(PPC::ANDC),Tmp2Reg) 10027 .addReg(TmpDestReg).addReg(MaskReg); 10028 BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg) 10029 .addReg(Tmp2Reg).addReg(NewVal3Reg); 10030 BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg) 10031 .addReg(ZeroReg).addReg(PtrReg); 10032 BuildMI(BB, dl, TII->get(PPC::BCC)) 10033 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); 10034 BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); 10035 BB->addSuccessor(loop1MBB); 10036 BB->addSuccessor(exitMBB); 10037 10038 BB = midMBB; 10039 BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg) 10040 .addReg(ZeroReg).addReg(PtrReg); 10041 BB->addSuccessor(exitMBB); 10042 10043 // exitMBB: 10044 // ... 10045 BB = exitMBB; 10046 BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg) 10047 .addReg(ShiftReg); 10048 } else if (MI.getOpcode() == PPC::FADDrtz) { 10049 // This pseudo performs an FADD with rounding mode temporarily forced 10050 // to round-to-zero. We emit this via custom inserter since the FPSCR 10051 // is not modeled at the SelectionDAG level. 10052 unsigned Dest = MI.getOperand(0).getReg(); 10053 unsigned Src1 = MI.getOperand(1).getReg(); 10054 unsigned Src2 = MI.getOperand(2).getReg(); 10055 DebugLoc dl = MI.getDebugLoc(); 10056 10057 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10058 unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); 10059 10060 // Save FPSCR value. 10061 BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg); 10062 10063 // Set rounding mode to round-to-zero. 10064 BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31); 10065 BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30); 10066 10067 // Perform addition. 10068 BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2); 10069 10070 // Restore FPSCR value. 
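    // For reference: FPSCR bits 30-31 form the rounding-control field RN; the
    // mtfsb1 31 / mtfsb0 30 pair above selects RN = 0b01 (round toward zero),
    // and the mtfsf below (field mask 1) restores the saved copy of the low
    // FPSCR field, bringing back the original rounding mode.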
10071 BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg); 10072 } else if (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || 10073 MI.getOpcode() == PPC::ANDIo_1_GT_BIT || 10074 MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || 10075 MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) { 10076 unsigned Opcode = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || 10077 MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) 10078 ? PPC::ANDIo8 10079 : PPC::ANDIo; 10080 bool isEQ = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || 10081 MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8); 10082 10083 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10084 unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ? 10085 &PPC::GPRCRegClass : 10086 &PPC::G8RCRegClass); 10087 10088 DebugLoc dl = MI.getDebugLoc(); 10089 BuildMI(*BB, MI, dl, TII->get(Opcode), Dest) 10090 .addReg(MI.getOperand(1).getReg()) 10091 .addImm(1); 10092 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), 10093 MI.getOperand(0).getReg()) 10094 .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT); 10095 } else if (MI.getOpcode() == PPC::TCHECK_RET) { 10096 DebugLoc Dl = MI.getDebugLoc(); 10097 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10098 unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); 10099 BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg); 10100 return BB; 10101 } else { 10102 llvm_unreachable("Unexpected instr type to insert"); 10103 } 10104 10105 MI.eraseFromParent(); // The pseudo instruction is gone now. 10106 return BB; 10107 } 10108 10109 //===----------------------------------------------------------------------===// 10110 // Target Optimization Hooks 10111 //===----------------------------------------------------------------------===// 10112 10113 static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) { 10114 // For the estimates, convergence is quadratic, so we essentially double the 10115 // number of digits correct after every iteration. For both FRE and FRSQRTE, 10116 // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(), 10117 // this is 2^-14. IEEE float has 23 digits and double has 52 digits. 10118 int RefinementSteps = Subtarget.hasRecipPrec() ? 
1 : 3; 10119 if (VT.getScalarType() == MVT::f64) 10120 RefinementSteps++; 10121 return RefinementSteps; 10122 } 10123 10124 SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, 10125 int Enabled, int &RefinementSteps, 10126 bool &UseOneConstNR, 10127 bool Reciprocal) const { 10128 EVT VT = Operand.getValueType(); 10129 if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || 10130 (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || 10131 (VT == MVT::v4f32 && Subtarget.hasAltivec()) || 10132 (VT == MVT::v2f64 && Subtarget.hasVSX()) || 10133 (VT == MVT::v4f32 && Subtarget.hasQPX()) || 10134 (VT == MVT::v4f64 && Subtarget.hasQPX())) { 10135 if (RefinementSteps == ReciprocalEstimate::Unspecified) 10136 RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); 10137 10138 UseOneConstNR = true; 10139 return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand); 10140 } 10141 return SDValue(); 10142 } 10143 10144 SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG, 10145 int Enabled, 10146 int &RefinementSteps) const { 10147 EVT VT = Operand.getValueType(); 10148 if ((VT == MVT::f32 && Subtarget.hasFRES()) || 10149 (VT == MVT::f64 && Subtarget.hasFRE()) || 10150 (VT == MVT::v4f32 && Subtarget.hasAltivec()) || 10151 (VT == MVT::v2f64 && Subtarget.hasVSX()) || 10152 (VT == MVT::v4f32 && Subtarget.hasQPX()) || 10153 (VT == MVT::v4f64 && Subtarget.hasQPX())) { 10154 if (RefinementSteps == ReciprocalEstimate::Unspecified) 10155 RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); 10156 return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand); 10157 } 10158 return SDValue(); 10159 } 10160 10161 unsigned PPCTargetLowering::combineRepeatedFPDivisors() const { 10162 // Note: This functionality is used only when unsafe-fp-math is enabled, and 10163 // on cores with reciprocal estimates (which are used when unsafe-fp-math is 10164 // enabled for division), this functionality is redundant with the default 10165 // combiner logic (once the division -> reciprocal/multiply transformation 10166 // has taken place). As a result, this matters more for older cores than for 10167 // newer ones. 10168 10169 // Combine multiple FDIVs with the same divisor into multiple FMULs by the 10170 // reciprocal if there are two or more FDIVs (for embedded cores with only 10171 // one FP pipeline) for three or more FDIVs (for generic OOO cores). 10172 switch (Subtarget.getDarwinDirective()) { 10173 default: 10174 return 3; 10175 case PPC::DIR_440: 10176 case PPC::DIR_A2: 10177 case PPC::DIR_E500mc: 10178 case PPC::DIR_E5500: 10179 return 2; 10180 } 10181 } 10182 10183 // isConsecutiveLSLoc needs to work even if all adds have not yet been 10184 // collapsed, and so we need to look through chains of them. 10185 static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base, 10186 int64_t& Offset, SelectionDAG &DAG) { 10187 if (DAG.isBaseWithConstantOffset(Loc)) { 10188 Base = Loc.getOperand(0); 10189 Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue(); 10190 10191 // The base might itself be a base plus an offset, and if so, accumulate 10192 // that as well. 
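    // For example, with Loc = (add (add X, 8), 16) the recursion below leaves
    // Base = X and Offset = 24 (X being whatever non-add base is found).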
10193 getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG); 10194 } 10195 } 10196 10197 static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base, 10198 unsigned Bytes, int Dist, 10199 SelectionDAG &DAG) { 10200 if (VT.getSizeInBits() / 8 != Bytes) 10201 return false; 10202 10203 SDValue BaseLoc = Base->getBasePtr(); 10204 if (Loc.getOpcode() == ISD::FrameIndex) { 10205 if (BaseLoc.getOpcode() != ISD::FrameIndex) 10206 return false; 10207 const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 10208 int FI = cast<FrameIndexSDNode>(Loc)->getIndex(); 10209 int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex(); 10210 int FS = MFI.getObjectSize(FI); 10211 int BFS = MFI.getObjectSize(BFI); 10212 if (FS != BFS || FS != (int)Bytes) return false; 10213 return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes); 10214 } 10215 10216 SDValue Base1 = Loc, Base2 = BaseLoc; 10217 int64_t Offset1 = 0, Offset2 = 0; 10218 getBaseWithConstantOffset(Loc, Base1, Offset1, DAG); 10219 getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG); 10220 if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes)) 10221 return true; 10222 10223 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10224 const GlobalValue *GV1 = nullptr; 10225 const GlobalValue *GV2 = nullptr; 10226 Offset1 = 0; 10227 Offset2 = 0; 10228 bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1); 10229 bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); 10230 if (isGA1 && isGA2 && GV1 == GV2) 10231 return Offset1 == (Offset2 + Dist*Bytes); 10232 return false; 10233 } 10234 10235 // Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does 10236 // not enforce equality of the chain operands. 10237 static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, 10238 unsigned Bytes, int Dist, 10239 SelectionDAG &DAG) { 10240 if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) { 10241 EVT VT = LS->getMemoryVT(); 10242 SDValue Loc = LS->getBasePtr(); 10243 return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG); 10244 } 10245 10246 if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { 10247 EVT VT; 10248 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 10249 default: return false; 10250 case Intrinsic::ppc_qpx_qvlfd: 10251 case Intrinsic::ppc_qpx_qvlfda: 10252 VT = MVT::v4f64; 10253 break; 10254 case Intrinsic::ppc_qpx_qvlfs: 10255 case Intrinsic::ppc_qpx_qvlfsa: 10256 VT = MVT::v4f32; 10257 break; 10258 case Intrinsic::ppc_qpx_qvlfcd: 10259 case Intrinsic::ppc_qpx_qvlfcda: 10260 VT = MVT::v2f64; 10261 break; 10262 case Intrinsic::ppc_qpx_qvlfcs: 10263 case Intrinsic::ppc_qpx_qvlfcsa: 10264 VT = MVT::v2f32; 10265 break; 10266 case Intrinsic::ppc_qpx_qvlfiwa: 10267 case Intrinsic::ppc_qpx_qvlfiwz: 10268 case Intrinsic::ppc_altivec_lvx: 10269 case Intrinsic::ppc_altivec_lvxl: 10270 case Intrinsic::ppc_vsx_lxvw4x: 10271 case Intrinsic::ppc_vsx_lxvw4x_be: 10272 VT = MVT::v4i32; 10273 break; 10274 case Intrinsic::ppc_vsx_lxvd2x: 10275 case Intrinsic::ppc_vsx_lxvd2x_be: 10276 VT = MVT::v2f64; 10277 break; 10278 case Intrinsic::ppc_altivec_lvebx: 10279 VT = MVT::i8; 10280 break; 10281 case Intrinsic::ppc_altivec_lvehx: 10282 VT = MVT::i16; 10283 break; 10284 case Intrinsic::ppc_altivec_lvewx: 10285 VT = MVT::i32; 10286 break; 10287 } 10288 10289 return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG); 10290 } 10291 10292 if (N->getOpcode() == ISD::INTRINSIC_VOID) { 10293 EVT VT; 10294 switch 
(cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    default: return false;
    case Intrinsic::ppc_qpx_qvstfd:
    case Intrinsic::ppc_qpx_qvstfda:
      VT = MVT::v4f64;
      break;
    case Intrinsic::ppc_qpx_qvstfs:
    case Intrinsic::ppc_qpx_qvstfsa:
      VT = MVT::v4f32;
      break;
    case Intrinsic::ppc_qpx_qvstfcd:
    case Intrinsic::ppc_qpx_qvstfcda:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvstfcs:
    case Intrinsic::ppc_qpx_qvstfcsa:
      VT = MVT::v2f32;
      break;
    case Intrinsic::ppc_qpx_qvstfiw:
    case Intrinsic::ppc_qpx_qvstfiwa:
    case Intrinsic::ppc_altivec_stvx:
    case Intrinsic::ppc_altivec_stvxl:
    case Intrinsic::ppc_vsx_stxvw4x:
      VT = MVT::v4i32;
      break;
    case Intrinsic::ppc_vsx_stxvd2x:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_vsx_stxvw4x_be:
      VT = MVT::v4i32;
      break;
    case Intrinsic::ppc_vsx_stxvd2x_be:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_altivec_stvebx:
      VT = MVT::i8;
      break;
    case Intrinsic::ppc_altivec_stvehx:
      VT = MVT::i16;
      break;
    case Intrinsic::ppc_altivec_stvewx:
      VT = MVT::i32;
      break;
    }

    return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);
  }

  return false;
}

// Return true if there is a nearby consecutive load to the one provided
// (regardless of alignment). We search up and down the chain, looking through
// token factors and other loads (but nothing else). As a result, a true result
// indicates that it is safe to create a new consecutive load adjacent to the
// load provided.
static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
  SDValue Chain = LD->getChain();
  EVT VT = LD->getMemoryVT();

  SmallSet<SDNode *, 16> LoadRoots;
  SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
  SmallSet<SDNode *, 16> Visited;

  // First, search up the chain, branching to follow all token-factor operands.
  // If we find a consecutive load, then we're done, otherwise, record all
  // nodes just above the top-level loads and token factors.
  while (!Queue.empty()) {
    SDNode *ChainNext = Queue.pop_back_val();
    if (!Visited.insert(ChainNext).second)
      continue;

    if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
      if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
        return true;

      if (!Visited.count(ChainLD->getChain().getNode()))
        Queue.push_back(ChainLD->getChain().getNode());
    } else if (ChainNext->getOpcode() == ISD::TokenFactor) {
      for (const SDUse &O : ChainNext->ops())
        if (!Visited.count(O.getNode()))
          Queue.push_back(O.getNode());
    } else
      LoadRoots.insert(ChainNext);
  }

  // Second, search down the chain, starting from the top-level nodes recorded
  // in the first phase. These top-level nodes are the nodes just above all
  // loads and token factors. Starting with their uses, recursively look through
  // all loads (just the chain uses) and token factors to find a consecutive
  // load.
10385 Visited.clear(); 10386 Queue.clear(); 10387 10388 for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(), 10389 IE = LoadRoots.end(); I != IE; ++I) { 10390 Queue.push_back(*I); 10391 10392 while (!Queue.empty()) { 10393 SDNode *LoadRoot = Queue.pop_back_val(); 10394 if (!Visited.insert(LoadRoot).second) 10395 continue; 10396 10397 if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot)) 10398 if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) 10399 return true; 10400 10401 for (SDNode::use_iterator UI = LoadRoot->use_begin(), 10402 UE = LoadRoot->use_end(); UI != UE; ++UI) 10403 if (((isa<MemSDNode>(*UI) && 10404 cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) || 10405 UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI)) 10406 Queue.push_back(*UI); 10407 } 10408 } 10409 10410 return false; 10411 } 10412 10413 /// This function is called when we have proved that a SETCC node can be replaced 10414 /// by subtraction (and other supporting instructions) so that the result of 10415 /// comparison is kept in a GPR instead of CR. This function is purely for 10416 /// codegen purposes and has some flags to guide the codegen process. 10417 static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement, 10418 bool Swap, SDLoc &DL, SelectionDAG &DAG) { 10419 assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); 10420 10421 // Zero extend the operands to the largest legal integer. Originally, they 10422 // must be of a strictly smaller size. 10423 auto Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0), 10424 DAG.getConstant(Size, DL, MVT::i32)); 10425 auto Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1), 10426 DAG.getConstant(Size, DL, MVT::i32)); 10427 10428 // Swap if needed. Depends on the condition code. 10429 if (Swap) 10430 std::swap(Op0, Op1); 10431 10432 // Subtract extended integers. 10433 auto SubNode = DAG.getNode(ISD::SUB, DL, MVT::i64, Op0, Op1); 10434 10435 // Move the sign bit to the least significant position and zero out the rest. 10436 // Now the least significant bit carries the result of original comparison. 10437 auto Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, SubNode, 10438 DAG.getConstant(Size - 1, DL, MVT::i32)); 10439 auto Final = Shifted; 10440 10441 // Complement the result if needed. Based on the condition code. 10442 if (Complement) 10443 Final = DAG.getNode(ISD::XOR, DL, MVT::i64, Shifted, 10444 DAG.getConstant(1, DL, MVT::i64)); 10445 10446 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final); 10447 } 10448 10449 SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N, 10450 DAGCombinerInfo &DCI) const { 10451 assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); 10452 10453 SelectionDAG &DAG = DCI.DAG; 10454 SDLoc DL(N); 10455 10456 // Size of integers being compared has a critical role in the following 10457 // analysis, so we prefer to do this when all types are legal. 
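// Illustrative sketch (assuming i64 is the largest legal integer type, so
// Size == 64): a (setult i32 %a, %b) whose only users are zero-extends
// becomes, roughly,
//   %az  = zero_extend %a to i64
//   %bz  = zero_extend %b to i64
//   %sub = sub i64 %az, %bz
//   %bit = srl i64 %sub, 63      ; the sign bit is the unsigned borrow
//   %res = truncate %bit to i1
// with the operands swapped and/or the low bit XORed with 1 for the other
// unsigned predicates (see generateEquivalentSub above).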
10458 if (!DCI.isAfterLegalizeVectorOps()) 10459 return SDValue(); 10460 10461 // If all users of SETCC extend its value to a legal integer type 10462 // then we replace SETCC with a subtraction 10463 for (SDNode::use_iterator UI = N->use_begin(), 10464 UE = N->use_end(); UI != UE; ++UI) { 10465 if (UI->getOpcode() != ISD::ZERO_EXTEND) 10466 return SDValue(); 10467 } 10468 10469 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 10470 auto OpSize = N->getOperand(0).getValueSizeInBits(); 10471 10472 unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits(); 10473 10474 if (OpSize < Size) { 10475 switch (CC) { 10476 default: break; 10477 case ISD::SETULT: 10478 return generateEquivalentSub(N, Size, false, false, DL, DAG); 10479 case ISD::SETULE: 10480 return generateEquivalentSub(N, Size, true, true, DL, DAG); 10481 case ISD::SETUGT: 10482 return generateEquivalentSub(N, Size, false, true, DL, DAG); 10483 case ISD::SETUGE: 10484 return generateEquivalentSub(N, Size, true, false, DL, DAG); 10485 } 10486 } 10487 10488 return SDValue(); 10489 } 10490 10491 SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, 10492 DAGCombinerInfo &DCI) const { 10493 SelectionDAG &DAG = DCI.DAG; 10494 SDLoc dl(N); 10495 10496 assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits"); 10497 // If we're tracking CR bits, we need to be careful that we don't have: 10498 // trunc(binary-ops(zext(x), zext(y))) 10499 // or 10500 // trunc(binary-ops(binary-ops(zext(x), zext(y)), ...) 10501 // such that we're unnecessarily moving things into GPRs when it would be 10502 // better to keep them in CR bits. 10503 10504 // Note that trunc here can be an actual i1 trunc, or can be the effective 10505 // truncation that comes from a setcc or select_cc. 10506 if (N->getOpcode() == ISD::TRUNCATE && 10507 N->getValueType(0) != MVT::i1) 10508 return SDValue(); 10509 10510 if (N->getOperand(0).getValueType() != MVT::i32 && 10511 N->getOperand(0).getValueType() != MVT::i64) 10512 return SDValue(); 10513 10514 if (N->getOpcode() == ISD::SETCC || 10515 N->getOpcode() == ISD::SELECT_CC) { 10516 // If we're looking at a comparison, then we need to make sure that the 10517 // high bits (all except for the first) don't matter the result. 10518 ISD::CondCode CC = 10519 cast<CondCodeSDNode>(N->getOperand( 10520 N->getOpcode() == ISD::SETCC ? 2 : 4))->get(); 10521 unsigned OpBits = N->getOperand(0).getValueSizeInBits(); 10522 10523 if (ISD::isSignedIntSetCC(CC)) { 10524 if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits || 10525 DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits) 10526 return SDValue(); 10527 } else if (ISD::isUnsignedIntSetCC(CC)) { 10528 if (!DAG.MaskedValueIsZero(N->getOperand(0), 10529 APInt::getHighBitsSet(OpBits, OpBits-1)) || 10530 !DAG.MaskedValueIsZero(N->getOperand(1), 10531 APInt::getHighBitsSet(OpBits, OpBits-1))) 10532 return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI) 10533 : SDValue()); 10534 } else { 10535 // This is neither a signed nor an unsigned comparison, just make sure 10536 // that the high bits are equal. 10537 KnownBits Op1Known, Op2Known; 10538 DAG.computeKnownBits(N->getOperand(0), Op1Known); 10539 DAG.computeKnownBits(N->getOperand(1), Op2Known); 10540 10541 // We don't really care about what is known about the first bit (if 10542 // anything), so clear it in all masks prior to comparing them. 
10543 Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0); 10544 Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0); 10545 10546 if (Op1Known.Zero != Op2Known.Zero || Op1Known.One != Op2Known.One) 10547 return SDValue(); 10548 } 10549 } 10550 10551 // We now know that the higher-order bits are irrelevant, we just need to 10552 // make sure that all of the intermediate operations are bit operations, and 10553 // all inputs are extensions. 10554 if (N->getOperand(0).getOpcode() != ISD::AND && 10555 N->getOperand(0).getOpcode() != ISD::OR && 10556 N->getOperand(0).getOpcode() != ISD::XOR && 10557 N->getOperand(0).getOpcode() != ISD::SELECT && 10558 N->getOperand(0).getOpcode() != ISD::SELECT_CC && 10559 N->getOperand(0).getOpcode() != ISD::TRUNCATE && 10560 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND && 10561 N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && 10562 N->getOperand(0).getOpcode() != ISD::ANY_EXTEND) 10563 return SDValue(); 10564 10565 if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) && 10566 N->getOperand(1).getOpcode() != ISD::AND && 10567 N->getOperand(1).getOpcode() != ISD::OR && 10568 N->getOperand(1).getOpcode() != ISD::XOR && 10569 N->getOperand(1).getOpcode() != ISD::SELECT && 10570 N->getOperand(1).getOpcode() != ISD::SELECT_CC && 10571 N->getOperand(1).getOpcode() != ISD::TRUNCATE && 10572 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND && 10573 N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND && 10574 N->getOperand(1).getOpcode() != ISD::ANY_EXTEND) 10575 return SDValue(); 10576 10577 SmallVector<SDValue, 4> Inputs; 10578 SmallVector<SDValue, 8> BinOps, PromOps; 10579 SmallPtrSet<SDNode *, 16> Visited; 10580 10581 for (unsigned i = 0; i < 2; ++i) { 10582 if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND || 10583 N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND || 10584 N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) && 10585 N->getOperand(i).getOperand(0).getValueType() == MVT::i1) || 10586 isa<ConstantSDNode>(N->getOperand(i))) 10587 Inputs.push_back(N->getOperand(i)); 10588 else 10589 BinOps.push_back(N->getOperand(i)); 10590 10591 if (N->getOpcode() == ISD::TRUNCATE) 10592 break; 10593 } 10594 10595 // Visit all inputs, collect all binary operations (and, or, xor and 10596 // select) that are all fed by extensions. 10597 while (!BinOps.empty()) { 10598 SDValue BinOp = BinOps.back(); 10599 BinOps.pop_back(); 10600 10601 if (!Visited.insert(BinOp.getNode()).second) 10602 continue; 10603 10604 PromOps.push_back(BinOp); 10605 10606 for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { 10607 // The condition of the select is not promoted. 
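// (Operand 0 of SELECT and operands 0 and 1 of SELECT_CC keep their
// original types; only the value operands are considered for promotion.)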
10608 if (BinOp.getOpcode() == ISD::SELECT && i == 0) 10609 continue; 10610 if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) 10611 continue; 10612 10613 if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || 10614 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || 10615 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) && 10616 BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) || 10617 isa<ConstantSDNode>(BinOp.getOperand(i))) { 10618 Inputs.push_back(BinOp.getOperand(i)); 10619 } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || 10620 BinOp.getOperand(i).getOpcode() == ISD::OR || 10621 BinOp.getOperand(i).getOpcode() == ISD::XOR || 10622 BinOp.getOperand(i).getOpcode() == ISD::SELECT || 10623 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC || 10624 BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || 10625 BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || 10626 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || 10627 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) { 10628 BinOps.push_back(BinOp.getOperand(i)); 10629 } else { 10630 // We have an input that is not an extension or another binary 10631 // operation; we'll abort this transformation. 10632 return SDValue(); 10633 } 10634 } 10635 } 10636 10637 // Make sure that this is a self-contained cluster of operations (which 10638 // is not quite the same thing as saying that everything has only one 10639 // use). 10640 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 10641 if (isa<ConstantSDNode>(Inputs[i])) 10642 continue; 10643 10644 for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), 10645 UE = Inputs[i].getNode()->use_end(); 10646 UI != UE; ++UI) { 10647 SDNode *User = *UI; 10648 if (User != N && !Visited.count(User)) 10649 return SDValue(); 10650 10651 // Make sure that we're not going to promote the non-output-value 10652 // operand(s) or SELECT or SELECT_CC. 10653 // FIXME: Although we could sometimes handle this, and it does occur in 10654 // practice that one of the condition inputs to the select is also one of 10655 // the outputs, we currently can't deal with this. 10656 if (User->getOpcode() == ISD::SELECT) { 10657 if (User->getOperand(0) == Inputs[i]) 10658 return SDValue(); 10659 } else if (User->getOpcode() == ISD::SELECT_CC) { 10660 if (User->getOperand(0) == Inputs[i] || 10661 User->getOperand(1) == Inputs[i]) 10662 return SDValue(); 10663 } 10664 } 10665 } 10666 10667 for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { 10668 for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), 10669 UE = PromOps[i].getNode()->use_end(); 10670 UI != UE; ++UI) { 10671 SDNode *User = *UI; 10672 if (User != N && !Visited.count(User)) 10673 return SDValue(); 10674 10675 // Make sure that we're not going to promote the non-output-value 10676 // operand(s) or SELECT or SELECT_CC. 10677 // FIXME: Although we could sometimes handle this, and it does occur in 10678 // practice that one of the condition inputs to the select is also one of 10679 // the outputs, we currently can't deal with this. 10680 if (User->getOpcode() == ISD::SELECT) { 10681 if (User->getOperand(0) == PromOps[i]) 10682 return SDValue(); 10683 } else if (User->getOpcode() == ISD::SELECT_CC) { 10684 if (User->getOperand(0) == PromOps[i] || 10685 User->getOperand(1) == PromOps[i]) 10686 return SDValue(); 10687 } 10688 } 10689 } 10690 10691 // Replace all inputs with the extension operand. 
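// For example, an input (zero_extend i1 %x to i32) is replaced by %x itself.
// Constant inputs are skipped here; they are truncated to i1 later, when the
// surrounding operations are rebuilt.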
10692 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 10693 // Constants may have users outside the cluster of to-be-promoted nodes, 10694 // and so we need to replace those as we do the promotions. 10695 if (isa<ConstantSDNode>(Inputs[i])) 10696 continue; 10697 else 10698 DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); 10699 } 10700 10701 std::list<HandleSDNode> PromOpHandles; 10702 for (auto &PromOp : PromOps) 10703 PromOpHandles.emplace_back(PromOp); 10704 10705 // Replace all operations (these are all the same, but have a different 10706 // (i1) return type). DAG.getNode will validate that the types of 10707 // a binary operator match, so go through the list in reverse so that 10708 // we've likely promoted both operands first. Any intermediate truncations or 10709 // extensions disappear. 10710 while (!PromOpHandles.empty()) { 10711 SDValue PromOp = PromOpHandles.back().getValue(); 10712 PromOpHandles.pop_back(); 10713 10714 if (PromOp.getOpcode() == ISD::TRUNCATE || 10715 PromOp.getOpcode() == ISD::SIGN_EXTEND || 10716 PromOp.getOpcode() == ISD::ZERO_EXTEND || 10717 PromOp.getOpcode() == ISD::ANY_EXTEND) { 10718 if (!isa<ConstantSDNode>(PromOp.getOperand(0)) && 10719 PromOp.getOperand(0).getValueType() != MVT::i1) { 10720 // The operand is not yet ready (see comment below). 10721 PromOpHandles.emplace_front(PromOp); 10722 continue; 10723 } 10724 10725 SDValue RepValue = PromOp.getOperand(0); 10726 if (isa<ConstantSDNode>(RepValue)) 10727 RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue); 10728 10729 DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue); 10730 continue; 10731 } 10732 10733 unsigned C; 10734 switch (PromOp.getOpcode()) { 10735 default: C = 0; break; 10736 case ISD::SELECT: C = 1; break; 10737 case ISD::SELECT_CC: C = 2; break; 10738 } 10739 10740 if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) && 10741 PromOp.getOperand(C).getValueType() != MVT::i1) || 10742 (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) && 10743 PromOp.getOperand(C+1).getValueType() != MVT::i1)) { 10744 // The to-be-promoted operands of this node have not yet been 10745 // promoted (this should be rare because we're going through the 10746 // list backward, but if one of the operands has several users in 10747 // this cluster of to-be-promoted nodes, it is possible). 10748 PromOpHandles.emplace_front(PromOp); 10749 continue; 10750 } 10751 10752 SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(), 10753 PromOp.getNode()->op_end()); 10754 10755 // If there are any constant inputs, make sure they're replaced now. 10756 for (unsigned i = 0; i < 2; ++i) 10757 if (isa<ConstantSDNode>(Ops[C+i])) 10758 Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]); 10759 10760 DAG.ReplaceAllUsesOfValueWith(PromOp, 10761 DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops)); 10762 } 10763 10764 // Now we're left with the initial truncation itself. 10765 if (N->getOpcode() == ISD::TRUNCATE) 10766 return N->getOperand(0); 10767 10768 // Otherwise, this is a comparison. The operands to be compared have just 10769 // changed type (to i1), but everything else is the same. 10770 return SDValue(N, 0); 10771 } 10772 10773 SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, 10774 DAGCombinerInfo &DCI) const { 10775 SelectionDAG &DAG = DCI.DAG; 10776 SDLoc dl(N); 10777 10778 // If we're tracking CR bits, we need to be careful that we don't have: 10779 // zext(binary-ops(trunc(x), trunc(y))) 10780 // or 10781 // zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) 
10782 // such that we're unnecessarily moving things into CR bits that can more 10783 // efficiently stay in GPRs. Note that if we're not certain that the high 10784 // bits are set as required by the final extension, we still may need to do 10785 // some masking to get the proper behavior. 10786 10787 // This same functionality is important on PPC64 when dealing with 10788 // 32-to-64-bit extensions; these occur often when 32-bit values are used as 10789 // the return values of functions. Because it is so similar, it is handled 10790 // here as well. 10791 10792 if (N->getValueType(0) != MVT::i32 && 10793 N->getValueType(0) != MVT::i64) 10794 return SDValue(); 10795 10796 if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) || 10797 (N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64()))) 10798 return SDValue(); 10799 10800 if (N->getOperand(0).getOpcode() != ISD::AND && 10801 N->getOperand(0).getOpcode() != ISD::OR && 10802 N->getOperand(0).getOpcode() != ISD::XOR && 10803 N->getOperand(0).getOpcode() != ISD::SELECT && 10804 N->getOperand(0).getOpcode() != ISD::SELECT_CC) 10805 return SDValue(); 10806 10807 SmallVector<SDValue, 4> Inputs; 10808 SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps; 10809 SmallPtrSet<SDNode *, 16> Visited; 10810 10811 // Visit all inputs, collect all binary operations (and, or, xor and 10812 // select) that are all fed by truncations. 10813 while (!BinOps.empty()) { 10814 SDValue BinOp = BinOps.back(); 10815 BinOps.pop_back(); 10816 10817 if (!Visited.insert(BinOp.getNode()).second) 10818 continue; 10819 10820 PromOps.push_back(BinOp); 10821 10822 for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { 10823 // The condition of the select is not promoted. 10824 if (BinOp.getOpcode() == ISD::SELECT && i == 0) 10825 continue; 10826 if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) 10827 continue; 10828 10829 if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || 10830 isa<ConstantSDNode>(BinOp.getOperand(i))) { 10831 Inputs.push_back(BinOp.getOperand(i)); 10832 } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || 10833 BinOp.getOperand(i).getOpcode() == ISD::OR || 10834 BinOp.getOperand(i).getOpcode() == ISD::XOR || 10835 BinOp.getOperand(i).getOpcode() == ISD::SELECT || 10836 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) { 10837 BinOps.push_back(BinOp.getOperand(i)); 10838 } else { 10839 // We have an input that is not a truncation or another binary 10840 // operation; we'll abort this transformation. 10841 return SDValue(); 10842 } 10843 } 10844 } 10845 10846 // The operands of a select that must be truncated when the select is 10847 // promoted because the operand is actually part of the to-be-promoted set. 10848 DenseMap<SDNode *, EVT> SelectTruncOp[2]; 10849 10850 // Make sure that this is a self-contained cluster of operations (which 10851 // is not quite the same thing as saying that everything has only one 10852 // use). 10853 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 10854 if (isa<ConstantSDNode>(Inputs[i])) 10855 continue; 10856 10857 for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), 10858 UE = Inputs[i].getNode()->use_end(); 10859 UI != UE; ++UI) { 10860 SDNode *User = *UI; 10861 if (User != N && !Visited.count(User)) 10862 return SDValue(); 10863 10864 // If we're going to promote the non-output-value operand(s) or SELECT or 10865 // SELECT_CC, record them for truncation. 
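// (Unlike DAGCombineTruncBoolExt above, which simply gives up in this
// situation, here we can record the operand and truncate it back once the
// select has been promoted.)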
10866 if (User->getOpcode() == ISD::SELECT) { 10867 if (User->getOperand(0) == Inputs[i]) 10868 SelectTruncOp[0].insert(std::make_pair(User, 10869 User->getOperand(0).getValueType())); 10870 } else if (User->getOpcode() == ISD::SELECT_CC) { 10871 if (User->getOperand(0) == Inputs[i]) 10872 SelectTruncOp[0].insert(std::make_pair(User, 10873 User->getOperand(0).getValueType())); 10874 if (User->getOperand(1) == Inputs[i]) 10875 SelectTruncOp[1].insert(std::make_pair(User, 10876 User->getOperand(1).getValueType())); 10877 } 10878 } 10879 } 10880 10881 for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { 10882 for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), 10883 UE = PromOps[i].getNode()->use_end(); 10884 UI != UE; ++UI) { 10885 SDNode *User = *UI; 10886 if (User != N && !Visited.count(User)) 10887 return SDValue(); 10888 10889 // If we're going to promote the non-output-value operand(s) or SELECT or 10890 // SELECT_CC, record them for truncation. 10891 if (User->getOpcode() == ISD::SELECT) { 10892 if (User->getOperand(0) == PromOps[i]) 10893 SelectTruncOp[0].insert(std::make_pair(User, 10894 User->getOperand(0).getValueType())); 10895 } else if (User->getOpcode() == ISD::SELECT_CC) { 10896 if (User->getOperand(0) == PromOps[i]) 10897 SelectTruncOp[0].insert(std::make_pair(User, 10898 User->getOperand(0).getValueType())); 10899 if (User->getOperand(1) == PromOps[i]) 10900 SelectTruncOp[1].insert(std::make_pair(User, 10901 User->getOperand(1).getValueType())); 10902 } 10903 } 10904 } 10905 10906 unsigned PromBits = N->getOperand(0).getValueSizeInBits(); 10907 bool ReallyNeedsExt = false; 10908 if (N->getOpcode() != ISD::ANY_EXTEND) { 10909 // If all of the inputs are not already sign/zero extended, then 10910 // we'll still need to do that at the end. 10911 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 10912 if (isa<ConstantSDNode>(Inputs[i])) 10913 continue; 10914 10915 unsigned OpBits = 10916 Inputs[i].getOperand(0).getValueSizeInBits(); 10917 assert(PromBits < OpBits && "Truncation not to a smaller bit count?"); 10918 10919 if ((N->getOpcode() == ISD::ZERO_EXTEND && 10920 !DAG.MaskedValueIsZero(Inputs[i].getOperand(0), 10921 APInt::getHighBitsSet(OpBits, 10922 OpBits-PromBits))) || 10923 (N->getOpcode() == ISD::SIGN_EXTEND && 10924 DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) < 10925 (OpBits-(PromBits-1)))) { 10926 ReallyNeedsExt = true; 10927 break; 10928 } 10929 } 10930 } 10931 10932 // Replace all inputs, either with the truncation operand, or a 10933 // truncation or extension to the final output type. 10934 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 10935 // Constant inputs need to be replaced with the to-be-promoted nodes that 10936 // use them because they might have users outside of the cluster of 10937 // promoted nodes. 
10938 if (isa<ConstantSDNode>(Inputs[i])) 10939 continue; 10940 10941 SDValue InSrc = Inputs[i].getOperand(0); 10942 if (Inputs[i].getValueType() == N->getValueType(0)) 10943 DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc); 10944 else if (N->getOpcode() == ISD::SIGN_EXTEND) 10945 DAG.ReplaceAllUsesOfValueWith(Inputs[i], 10946 DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0))); 10947 else if (N->getOpcode() == ISD::ZERO_EXTEND) 10948 DAG.ReplaceAllUsesOfValueWith(Inputs[i], 10949 DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0))); 10950 else 10951 DAG.ReplaceAllUsesOfValueWith(Inputs[i], 10952 DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0))); 10953 } 10954 10955 std::list<HandleSDNode> PromOpHandles; 10956 for (auto &PromOp : PromOps) 10957 PromOpHandles.emplace_back(PromOp); 10958 10959 // Replace all operations (these are all the same, but have a different 10960 // (promoted) return type). DAG.getNode will validate that the types of 10961 // a binary operator match, so go through the list in reverse so that 10962 // we've likely promoted both operands first. 10963 while (!PromOpHandles.empty()) { 10964 SDValue PromOp = PromOpHandles.back().getValue(); 10965 PromOpHandles.pop_back(); 10966 10967 unsigned C; 10968 switch (PromOp.getOpcode()) { 10969 default: C = 0; break; 10970 case ISD::SELECT: C = 1; break; 10971 case ISD::SELECT_CC: C = 2; break; 10972 } 10973 10974 if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) && 10975 PromOp.getOperand(C).getValueType() != N->getValueType(0)) || 10976 (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) && 10977 PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) { 10978 // The to-be-promoted operands of this node have not yet been 10979 // promoted (this should be rare because we're going through the 10980 // list backward, but if one of the operands has several users in 10981 // this cluster of to-be-promoted nodes, it is possible). 10982 PromOpHandles.emplace_front(PromOp); 10983 continue; 10984 } 10985 10986 // For SELECT and SELECT_CC nodes, we do a similar check for any 10987 // to-be-promoted comparison inputs. 10988 if (PromOp.getOpcode() == ISD::SELECT || 10989 PromOp.getOpcode() == ISD::SELECT_CC) { 10990 if ((SelectTruncOp[0].count(PromOp.getNode()) && 10991 PromOp.getOperand(0).getValueType() != N->getValueType(0)) || 10992 (SelectTruncOp[1].count(PromOp.getNode()) && 10993 PromOp.getOperand(1).getValueType() != N->getValueType(0))) { 10994 PromOpHandles.emplace_front(PromOp); 10995 continue; 10996 } 10997 } 10998 10999 SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(), 11000 PromOp.getNode()->op_end()); 11001 11002 // If this node has constant inputs, then they'll need to be promoted here. 11003 for (unsigned i = 0; i < 2; ++i) { 11004 if (!isa<ConstantSDNode>(Ops[C+i])) 11005 continue; 11006 if (Ops[C+i].getValueType() == N->getValueType(0)) 11007 continue; 11008 11009 if (N->getOpcode() == ISD::SIGN_EXTEND) 11010 Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); 11011 else if (N->getOpcode() == ISD::ZERO_EXTEND) 11012 Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); 11013 else 11014 Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); 11015 } 11016 11017 // If we've promoted the comparison inputs of a SELECT or SELECT_CC, 11018 // truncate them again to the original value type. 
11019 if (PromOp.getOpcode() == ISD::SELECT || 11020 PromOp.getOpcode() == ISD::SELECT_CC) { 11021 auto SI0 = SelectTruncOp[0].find(PromOp.getNode()); 11022 if (SI0 != SelectTruncOp[0].end()) 11023 Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]); 11024 auto SI1 = SelectTruncOp[1].find(PromOp.getNode()); 11025 if (SI1 != SelectTruncOp[1].end()) 11026 Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]); 11027 } 11028 11029 DAG.ReplaceAllUsesOfValueWith(PromOp, 11030 DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops)); 11031 } 11032 11033 // Now we're left with the initial extension itself. 11034 if (!ReallyNeedsExt) 11035 return N->getOperand(0); 11036 11037 // To zero extend, just mask off everything except for the first bit (in the 11038 // i1 case). 11039 if (N->getOpcode() == ISD::ZERO_EXTEND) 11040 return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0), 11041 DAG.getConstant(APInt::getLowBitsSet( 11042 N->getValueSizeInBits(0), PromBits), 11043 dl, N->getValueType(0))); 11044 11045 assert(N->getOpcode() == ISD::SIGN_EXTEND && 11046 "Invalid extension type"); 11047 EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout()); 11048 SDValue ShiftCst = 11049 DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy); 11050 return DAG.getNode( 11051 ISD::SRA, dl, N->getValueType(0), 11052 DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst), 11053 ShiftCst); 11054 } 11055 11056 /// \brief Reduces the number of fp-to-int conversion when building a vector. 11057 /// 11058 /// If this vector is built out of floating to integer conversions, 11059 /// transform it to a vector built out of floating point values followed by a 11060 /// single floating to integer conversion of the vector. 11061 /// Namely (build_vector (fptosi $A), (fptosi $B), ...) 11062 /// becomes (fptosi (build_vector ($A, $B, ...))) 11063 SDValue PPCTargetLowering:: 11064 combineElementTruncationToVectorTruncation(SDNode *N, 11065 DAGCombinerInfo &DCI) const { 11066 assert(N->getOpcode() == ISD::BUILD_VECTOR && 11067 "Should be called with a BUILD_VECTOR node"); 11068 11069 SelectionDAG &DAG = DCI.DAG; 11070 SDLoc dl(N); 11071 11072 SDValue FirstInput = N->getOperand(0); 11073 assert(FirstInput.getOpcode() == PPCISD::MFVSR && 11074 "The input operand must be an fp-to-int conversion."); 11075 11076 // This combine happens after legalization so the fp_to_[su]i nodes are 11077 // already converted to PPCSISD nodes. 11078 unsigned FirstConversion = FirstInput.getOperand(0).getOpcode(); 11079 if (FirstConversion == PPCISD::FCTIDZ || 11080 FirstConversion == PPCISD::FCTIDUZ || 11081 FirstConversion == PPCISD::FCTIWZ || 11082 FirstConversion == PPCISD::FCTIWUZ) { 11083 bool IsSplat = true; 11084 bool Is32Bit = FirstConversion == PPCISD::FCTIWZ || 11085 FirstConversion == PPCISD::FCTIWUZ; 11086 EVT SrcVT = FirstInput.getOperand(0).getValueType(); 11087 SmallVector<SDValue, 4> Ops; 11088 EVT TargetVT = N->getValueType(0); 11089 for (int i = 0, e = N->getNumOperands(); i < e; ++i) { 11090 if (N->getOperand(i).getOpcode() != PPCISD::MFVSR) 11091 return SDValue(); 11092 unsigned NextConversion = N->getOperand(i).getOperand(0).getOpcode(); 11093 if (NextConversion != FirstConversion) 11094 return SDValue(); 11095 if (N->getOperand(i) != FirstInput) 11096 IsSplat = false; 11097 } 11098 11099 // If this is a splat, we leave it as-is since there will be only a single 11100 // fp-to-int conversion followed by a splat of the integer. 
This is better 11101 // for 32-bit and smaller ints and neutral for 64-bit ints. 11102 if (IsSplat) 11103 return SDValue(); 11104 11105 // Now that we know we have the right type of node, get its operands 11106 for (int i = 0, e = N->getNumOperands(); i < e; ++i) { 11107 SDValue In = N->getOperand(i).getOperand(0); 11108 // For 32-bit values, we need to add an FP_ROUND node. 11109 if (Is32Bit) { 11110 if (In.isUndef()) 11111 Ops.push_back(DAG.getUNDEF(SrcVT)); 11112 else { 11113 SDValue Trunc = DAG.getNode(ISD::FP_ROUND, dl, 11114 MVT::f32, In.getOperand(0), 11115 DAG.getIntPtrConstant(1, dl)); 11116 Ops.push_back(Trunc); 11117 } 11118 } else 11119 Ops.push_back(In.isUndef() ? DAG.getUNDEF(SrcVT) : In.getOperand(0)); 11120 } 11121 11122 unsigned Opcode; 11123 if (FirstConversion == PPCISD::FCTIDZ || 11124 FirstConversion == PPCISD::FCTIWZ) 11125 Opcode = ISD::FP_TO_SINT; 11126 else 11127 Opcode = ISD::FP_TO_UINT; 11128 11129 EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32; 11130 SDValue BV = DAG.getBuildVector(NewVT, dl, Ops); 11131 return DAG.getNode(Opcode, dl, TargetVT, BV); 11132 } 11133 return SDValue(); 11134 } 11135 11136 /// \brief Reduce the number of loads when building a vector. 11137 /// 11138 /// Building a vector out of multiple loads can be converted to a load 11139 /// of the vector type if the loads are consecutive. If the loads are 11140 /// consecutive but in descending order, a shuffle is added at the end 11141 /// to reorder the vector. 11142 static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) { 11143 assert(N->getOpcode() == ISD::BUILD_VECTOR && 11144 "Should be called with a BUILD_VECTOR node"); 11145 11146 SDLoc dl(N); 11147 bool InputsAreConsecutiveLoads = true; 11148 bool InputsAreReverseConsecutive = true; 11149 unsigned ElemSize = N->getValueType(0).getScalarSizeInBits() / 8; 11150 SDValue FirstInput = N->getOperand(0); 11151 bool IsRoundOfExtLoad = false; 11152 11153 if (FirstInput.getOpcode() == ISD::FP_ROUND && 11154 FirstInput.getOperand(0).getOpcode() == ISD::LOAD) { 11155 LoadSDNode *LD = dyn_cast<LoadSDNode>(FirstInput.getOperand(0)); 11156 IsRoundOfExtLoad = LD->getExtensionType() == ISD::EXTLOAD; 11157 } 11158 // Not a build vector of (possibly fp_rounded) loads. 11159 if (!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) 11160 return SDValue(); 11161 11162 for (int i = 1, e = N->getNumOperands(); i < e; ++i) { 11163 // If any inputs are fp_round(extload), they all must be. 11164 if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND) 11165 return SDValue(); 11166 11167 SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(i).getOperand(0) : 11168 N->getOperand(i); 11169 if (NextInput.getOpcode() != ISD::LOAD) 11170 return SDValue(); 11171 11172 SDValue PreviousInput = 11173 IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1); 11174 LoadSDNode *LD1 = dyn_cast<LoadSDNode>(PreviousInput); 11175 LoadSDNode *LD2 = dyn_cast<LoadSDNode>(NextInput); 11176 11177 // If any inputs are fp_round(extload), they all must be. 11178 if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD) 11179 return SDValue(); 11180 11181 if (!isConsecutiveLS(LD2, LD1, ElemSize, 1, DAG)) 11182 InputsAreConsecutiveLoads = false; 11183 if (!isConsecutiveLS(LD1, LD2, ElemSize, 1, DAG)) 11184 InputsAreReverseConsecutive = false; 11185 11186 // Exit early if the loads are neither consecutive nor reverse consecutive. 
11187 if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive) 11188 return SDValue(); 11189 } 11190 11191 assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) && 11192 "The loads cannot be both consecutive and reverse consecutive."); 11193 11194 SDValue FirstLoadOp = 11195 IsRoundOfExtLoad ? FirstInput.getOperand(0) : FirstInput; 11196 SDValue LastLoadOp = 11197 IsRoundOfExtLoad ? N->getOperand(N->getNumOperands()-1).getOperand(0) : 11198 N->getOperand(N->getNumOperands()-1); 11199 11200 LoadSDNode *LD1 = dyn_cast<LoadSDNode>(FirstLoadOp); 11201 LoadSDNode *LDL = dyn_cast<LoadSDNode>(LastLoadOp); 11202 if (InputsAreConsecutiveLoads) { 11203 assert(LD1 && "Input needs to be a LoadSDNode."); 11204 return DAG.getLoad(N->getValueType(0), dl, LD1->getChain(), 11205 LD1->getBasePtr(), LD1->getPointerInfo(), 11206 LD1->getAlignment()); 11207 } 11208 if (InputsAreReverseConsecutive) { 11209 assert(LDL && "Input needs to be a LoadSDNode."); 11210 SDValue Load = DAG.getLoad(N->getValueType(0), dl, LDL->getChain(), 11211 LDL->getBasePtr(), LDL->getPointerInfo(), 11212 LDL->getAlignment()); 11213 SmallVector<int, 16> Ops; 11214 for (int i = N->getNumOperands() - 1; i >= 0; i--) 11215 Ops.push_back(i); 11216 11217 return DAG.getVectorShuffle(N->getValueType(0), dl, Load, 11218 DAG.getUNDEF(N->getValueType(0)), Ops); 11219 } 11220 return SDValue(); 11221 } 11222 11223 SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, 11224 DAGCombinerInfo &DCI) const { 11225 assert(N->getOpcode() == ISD::BUILD_VECTOR && 11226 "Should be called with a BUILD_VECTOR node"); 11227 11228 SelectionDAG &DAG = DCI.DAG; 11229 SDLoc dl(N); 11230 11231 if (!Subtarget.hasVSX()) 11232 return SDValue(); 11233 11234 // The target independent DAG combiner will leave a build_vector of 11235 // float-to-int conversions intact. We can generate MUCH better code for 11236 // a float-to-int conversion of a vector of floats. 11237 SDValue FirstInput = N->getOperand(0); 11238 if (FirstInput.getOpcode() == PPCISD::MFVSR) { 11239 SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI); 11240 if (Reduced) 11241 return Reduced; 11242 } 11243 11244 // If we're building a vector out of consecutive loads, just load that 11245 // vector type. 
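// For example, (build_vector (load [X]), (load [X+4]), (load [X+8]),
// (load [X+12])) can become a single v4i32 load from [X]; if the loads are
// in descending address order, a reversing shuffle is added as well.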
11246 SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG); 11247 if (Reduced) 11248 return Reduced; 11249 11250 if (N->getValueType(0) != MVT::v2f64) 11251 return SDValue(); 11252 11253 // Looking for: 11254 // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1)) 11255 if (FirstInput.getOpcode() != ISD::SINT_TO_FP && 11256 FirstInput.getOpcode() != ISD::UINT_TO_FP) 11257 return SDValue(); 11258 if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP && 11259 N->getOperand(1).getOpcode() != ISD::UINT_TO_FP) 11260 return SDValue(); 11261 if (FirstInput.getOpcode() != N->getOperand(1).getOpcode()) 11262 return SDValue(); 11263 11264 SDValue Ext1 = FirstInput.getOperand(0); 11265 SDValue Ext2 = N->getOperand(1).getOperand(0); 11266 if(Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 11267 Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 11268 return SDValue(); 11269 11270 ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1)); 11271 ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1)); 11272 if (!Ext1Op || !Ext2Op) 11273 return SDValue(); 11274 if (Ext1.getValueType() != MVT::i32 || 11275 Ext2.getValueType() != MVT::i32) 11276 if (Ext1.getOperand(0) != Ext2.getOperand(0)) 11277 return SDValue(); 11278 11279 int FirstElem = Ext1Op->getZExtValue(); 11280 int SecondElem = Ext2Op->getZExtValue(); 11281 int SubvecIdx; 11282 if (FirstElem == 0 && SecondElem == 1) 11283 SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0; 11284 else if (FirstElem == 2 && SecondElem == 3) 11285 SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1; 11286 else 11287 return SDValue(); 11288 11289 SDValue SrcVec = Ext1.getOperand(0); 11290 auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ? 11291 PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP; 11292 return DAG.getNode(NodeType, dl, MVT::v2f64, 11293 SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl)); 11294 } 11295 11296 SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, 11297 DAGCombinerInfo &DCI) const { 11298 assert((N->getOpcode() == ISD::SINT_TO_FP || 11299 N->getOpcode() == ISD::UINT_TO_FP) && 11300 "Need an int -> FP conversion node here"); 11301 11302 if (useSoftFloat() || !Subtarget.has64BitSupport()) 11303 return SDValue(); 11304 11305 SelectionDAG &DAG = DCI.DAG; 11306 SDLoc dl(N); 11307 SDValue Op(N, 0); 11308 11309 SDValue FirstOperand(Op.getOperand(0)); 11310 bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD && 11311 (FirstOperand.getValueType() == MVT::i8 || 11312 FirstOperand.getValueType() == MVT::i16); 11313 if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) { 11314 bool Signed = N->getOpcode() == ISD::SINT_TO_FP; 11315 bool DstDouble = Op.getValueType() == MVT::f64; 11316 unsigned ConvOp = Signed ? 11317 (DstDouble ? PPCISD::FCFID : PPCISD::FCFIDS) : 11318 (DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS); 11319 SDValue WidthConst = 11320 DAG.getIntPtrConstant(FirstOperand.getValueType() == MVT::i8 ? 1 : 2, 11321 dl, false); 11322 LoadSDNode *LDN = cast<LoadSDNode>(FirstOperand.getNode()); 11323 SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst }; 11324 SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl, 11325 DAG.getVTList(MVT::f64, MVT::Other), 11326 Ops, MVT::i8, LDN->getMemOperand()); 11327 11328 // For signed conversion, we need to sign-extend the value in the VSR 11329 if (Signed) { 11330 SDValue ExtOps[] = { Ld, WidthConst }; 11331 SDValue Ext = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps); 11332 return DAG.getNode(ConvOp, dl, DstDouble ? 
MVT::f64 : MVT::f32, Ext); 11333 } else 11334 return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld); 11335 } 11336 11337 // Don't handle ppc_fp128 here or i1 conversions. 11338 if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) 11339 return SDValue(); 11340 if (Op.getOperand(0).getValueType() == MVT::i1) 11341 return SDValue(); 11342 11343 // For i32 intermediate values, unfortunately, the conversion functions 11344 // leave the upper 32 bits of the value are undefined. Within the set of 11345 // scalar instructions, we have no method for zero- or sign-extending the 11346 // value. Thus, we cannot handle i32 intermediate values here. 11347 if (Op.getOperand(0).getValueType() == MVT::i32) 11348 return SDValue(); 11349 11350 assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && 11351 "UINT_TO_FP is supported only with FPCVT"); 11352 11353 // If we have FCFIDS, then use it when converting to single-precision. 11354 // Otherwise, convert to double-precision and then round. 11355 unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 11356 ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS 11357 : PPCISD::FCFIDS) 11358 : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU 11359 : PPCISD::FCFID); 11360 MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 11361 ? MVT::f32 11362 : MVT::f64; 11363 11364 // If we're converting from a float, to an int, and back to a float again, 11365 // then we don't need the store/load pair at all. 11366 if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT && 11367 Subtarget.hasFPCVT()) || 11368 (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) { 11369 SDValue Src = Op.getOperand(0).getOperand(0); 11370 if (Src.getValueType() == MVT::f32) { 11371 Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); 11372 DCI.AddToWorklist(Src.getNode()); 11373 } else if (Src.getValueType() != MVT::f64) { 11374 // Make sure that we don't pick up a ppc_fp128 source value. 11375 return SDValue(); 11376 } 11377 11378 unsigned FCTOp = 11379 Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ : 11380 PPCISD::FCTIDUZ; 11381 11382 SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src); 11383 SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp); 11384 11385 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) { 11386 FP = DAG.getNode(ISD::FP_ROUND, dl, 11387 MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); 11388 DCI.AddToWorklist(FP.getNode()); 11389 } 11390 11391 return FP; 11392 } 11393 11394 return SDValue(); 11395 } 11396 11397 // expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for 11398 // builtins) into loads with swaps. 11399 SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N, 11400 DAGCombinerInfo &DCI) const { 11401 SelectionDAG &DAG = DCI.DAG; 11402 SDLoc dl(N); 11403 SDValue Chain; 11404 SDValue Base; 11405 MachineMemOperand *MMO; 11406 11407 switch (N->getOpcode()) { 11408 default: 11409 llvm_unreachable("Unexpected opcode for little endian VSX load"); 11410 case ISD::LOAD: { 11411 LoadSDNode *LD = cast<LoadSDNode>(N); 11412 Chain = LD->getChain(); 11413 Base = LD->getBasePtr(); 11414 MMO = LD->getMemOperand(); 11415 // If the MMO suggests this isn't a load of a full vector, leave 11416 // things alone. For a built-in, we have to make the change for 11417 // correctness, so if there is a size problem that will be a bug. 
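// (A full VSX vector register is 16 bytes, hence the size check below.)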
11418 if (MMO->getSize() < 16) 11419 return SDValue(); 11420 break; 11421 } 11422 case ISD::INTRINSIC_W_CHAIN: { 11423 MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N); 11424 Chain = Intrin->getChain(); 11425 // Similarly to the store case below, Intrin->getBasePtr() doesn't get 11426 // us what we want. Get operand 2 instead. 11427 Base = Intrin->getOperand(2); 11428 MMO = Intrin->getMemOperand(); 11429 break; 11430 } 11431 } 11432 11433 MVT VecTy = N->getValueType(0).getSimpleVT(); 11434 11435 // Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is 11436 // aligned and the type is a vector with elements up to 4 bytes 11437 if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16) 11438 && VecTy.getScalarSizeInBits() <= 32 ) { 11439 return SDValue(); 11440 } 11441 11442 SDValue LoadOps[] = { Chain, Base }; 11443 SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl, 11444 DAG.getVTList(MVT::v2f64, MVT::Other), 11445 LoadOps, MVT::v2f64, MMO); 11446 11447 DCI.AddToWorklist(Load.getNode()); 11448 Chain = Load.getValue(1); 11449 SDValue Swap = DAG.getNode( 11450 PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load); 11451 DCI.AddToWorklist(Swap.getNode()); 11452 11453 // Add a bitcast if the resulting load type doesn't match v2f64. 11454 if (VecTy != MVT::v2f64) { 11455 SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap); 11456 DCI.AddToWorklist(N.getNode()); 11457 // Package {bitcast value, swap's chain} to match Load's shape. 11458 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other), 11459 N, Swap.getValue(1)); 11460 } 11461 11462 return Swap; 11463 } 11464 11465 // expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for 11466 // builtins) into stores with swaps. 11467 SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N, 11468 DAGCombinerInfo &DCI) const { 11469 SelectionDAG &DAG = DCI.DAG; 11470 SDLoc dl(N); 11471 SDValue Chain; 11472 SDValue Base; 11473 unsigned SrcOpnd; 11474 MachineMemOperand *MMO; 11475 11476 switch (N->getOpcode()) { 11477 default: 11478 llvm_unreachable("Unexpected opcode for little endian VSX store"); 11479 case ISD::STORE: { 11480 StoreSDNode *ST = cast<StoreSDNode>(N); 11481 Chain = ST->getChain(); 11482 Base = ST->getBasePtr(); 11483 MMO = ST->getMemOperand(); 11484 SrcOpnd = 1; 11485 // If the MMO suggests this isn't a store of a full vector, leave 11486 // things alone. For a built-in, we have to make the change for 11487 // correctness, so if there is a size problem that will be a bug. 11488 if (MMO->getSize() < 16) 11489 return SDValue(); 11490 break; 11491 } 11492 case ISD::INTRINSIC_VOID: { 11493 MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N); 11494 Chain = Intrin->getChain(); 11495 // Intrin->getBasePtr() oddly does not get what we want. 11496 Base = Intrin->getOperand(3); 11497 MMO = Intrin->getMemOperand(); 11498 SrcOpnd = 2; 11499 break; 11500 } 11501 } 11502 11503 SDValue Src = N->getOperand(SrcOpnd); 11504 MVT VecTy = Src.getValueType().getSimpleVT(); 11505 11506 // Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the load is 11507 // aligned and the type is a vector with elements up to 4 bytes 11508 if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16) 11509 && VecTy.getScalarSizeInBits() <= 32 ) { 11510 return SDValue(); 11511 } 11512 11513 // All stores are done as v2f64 and possible bit cast. 
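// Roughly, a v4i32 store ends up as:
//   (PPCISD::STXVD2X chain, (PPCISD::XXSWAPD (bitcast v2f64 Src)), Base)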
11514 if (VecTy != MVT::v2f64) { 11515 Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src); 11516 DCI.AddToWorklist(Src.getNode()); 11517 } 11518 11519 SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl, 11520 DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src); 11521 DCI.AddToWorklist(Swap.getNode()); 11522 Chain = Swap.getValue(1); 11523 SDValue StoreOps[] = { Chain, Swap, Base }; 11524 SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl, 11525 DAG.getVTList(MVT::Other), 11526 StoreOps, VecTy, MMO); 11527 DCI.AddToWorklist(Store.getNode()); 11528 return Store; 11529 } 11530 11531 SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, 11532 DAGCombinerInfo &DCI) const { 11533 SelectionDAG &DAG = DCI.DAG; 11534 SDLoc dl(N); 11535 switch (N->getOpcode()) { 11536 default: break; 11537 case ISD::SHL: 11538 return combineSHL(N, DCI); 11539 case ISD::SRA: 11540 return combineSRA(N, DCI); 11541 case ISD::SRL: 11542 return combineSRL(N, DCI); 11543 case PPCISD::SHL: 11544 if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. 11545 return N->getOperand(0); 11546 break; 11547 case PPCISD::SRL: 11548 if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0. 11549 return N->getOperand(0); 11550 break; 11551 case PPCISD::SRA: 11552 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 11553 if (C->isNullValue() || // 0 >>s V -> 0. 11554 C->isAllOnesValue()) // -1 >>s V -> -1. 11555 return N->getOperand(0); 11556 } 11557 break; 11558 case ISD::SIGN_EXTEND: 11559 case ISD::ZERO_EXTEND: 11560 case ISD::ANY_EXTEND: 11561 return DAGCombineExtBoolTrunc(N, DCI); 11562 case ISD::TRUNCATE: 11563 case ISD::SETCC: 11564 case ISD::SELECT_CC: 11565 return DAGCombineTruncBoolExt(N, DCI); 11566 case ISD::SINT_TO_FP: 11567 case ISD::UINT_TO_FP: 11568 return combineFPToIntToFP(N, DCI); 11569 case ISD::STORE: { 11570 EVT Op1VT = N->getOperand(1).getValueType(); 11571 bool ValidTypeForStoreFltAsInt = (Op1VT == MVT::i32) || 11572 (Subtarget.hasP9Vector() && (Op1VT == MVT::i8 || Op1VT == MVT::i16)); 11573 11574 // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)). 11575 if (Subtarget.hasSTFIWX() && !cast<StoreSDNode>(N)->isTruncatingStore() && 11576 N->getOperand(1).getOpcode() == ISD::FP_TO_SINT && 11577 ValidTypeForStoreFltAsInt && 11578 N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) { 11579 SDValue Val = N->getOperand(1).getOperand(0); 11580 if (Val.getValueType() == MVT::f32) { 11581 Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val); 11582 DCI.AddToWorklist(Val.getNode()); 11583 } 11584 Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val); 11585 DCI.AddToWorklist(Val.getNode()); 11586 11587 if (Op1VT == MVT::i32) { 11588 SDValue Ops[] = { 11589 N->getOperand(0), Val, N->getOperand(2), 11590 DAG.getValueType(N->getOperand(1).getValueType()) 11591 }; 11592 11593 Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, 11594 DAG.getVTList(MVT::Other), Ops, 11595 cast<StoreSDNode>(N)->getMemoryVT(), 11596 cast<StoreSDNode>(N)->getMemOperand()); 11597 } else { 11598 unsigned WidthInBytes = 11599 N->getOperand(1).getValueType() == MVT::i8 ? 
1 : 2; 11600 SDValue WidthConst = DAG.getIntPtrConstant(WidthInBytes, dl, false); 11601 11602 SDValue Ops[] = { 11603 N->getOperand(0), Val, N->getOperand(2), WidthConst, 11604 DAG.getValueType(N->getOperand(1).getValueType()) 11605 }; 11606 Val = DAG.getMemIntrinsicNode(PPCISD::STXSIX, dl, 11607 DAG.getVTList(MVT::Other), Ops, 11608 cast<StoreSDNode>(N)->getMemoryVT(), 11609 cast<StoreSDNode>(N)->getMemOperand()); 11610 } 11611 11612 DCI.AddToWorklist(Val.getNode()); 11613 return Val; 11614 } 11615 11616 // Turn STORE (BSWAP) -> sthbrx/stwbrx. 11617 if (cast<StoreSDNode>(N)->isUnindexed() && 11618 N->getOperand(1).getOpcode() == ISD::BSWAP && 11619 N->getOperand(1).getNode()->hasOneUse() && 11620 (N->getOperand(1).getValueType() == MVT::i32 || 11621 N->getOperand(1).getValueType() == MVT::i16 || 11622 (Subtarget.hasLDBRX() && Subtarget.isPPC64() && 11623 N->getOperand(1).getValueType() == MVT::i64))) { 11624 SDValue BSwapOp = N->getOperand(1).getOperand(0); 11625 // Do an any-extend to 32-bits if this is a half-word input. 11626 if (BSwapOp.getValueType() == MVT::i16) 11627 BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp); 11628 11629 // If the type of the BSWAP operand is wider than the stored memory width, 11630 // it needs to be shifted to the right before the STBRX. 11631 EVT mVT = cast<StoreSDNode>(N)->getMemoryVT(); 11632 if (Op1VT.bitsGT(mVT)) { 11633 int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits(); 11634 BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp, 11635 DAG.getConstant(Shift, dl, MVT::i32)); 11636 // Need to truncate if this is a bswap of i64 stored as i32/i16. 11637 if (Op1VT == MVT::i64) 11638 BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp); 11639 } 11640 11641 SDValue Ops[] = { 11642 N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT) 11643 }; 11644 return 11645 DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other), 11646 Ops, cast<StoreSDNode>(N)->getMemoryVT(), 11647 cast<StoreSDNode>(N)->getMemOperand()); 11648 } 11649 11650 // For little endian, VSX stores require generating xxswapd/stxvd2x. 11651 // Not needed on ISA 3.0 based CPUs since we have a non-permuting store. 11652 EVT VT = N->getOperand(1).getValueType(); 11653 if (VT.isSimple()) { 11654 MVT StoreVT = VT.getSimpleVT(); 11655 if (Subtarget.needsSwapsForVSXMemOps() && 11656 (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 || 11657 StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32)) 11658 return expandVSXStoreForLE(N, DCI); 11659 } 11660 break; 11661 } 11662 case ISD::LOAD: { 11663 LoadSDNode *LD = cast<LoadSDNode>(N); 11664 EVT VT = LD->getValueType(0); 11665 11666 // For little endian, VSX loads require generating lxvd2x/xxswapd. 11667 // Not needed on ISA 3.0 based CPUs since we have a non-permuting load. 11668 if (VT.isSimple()) { 11669 MVT LoadVT = VT.getSimpleVT(); 11670 if (Subtarget.needsSwapsForVSXMemOps() && 11671 (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 || 11672 LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32)) 11673 return expandVSXLoadForLE(N, DCI); 11674 } 11675 11676 // We sometimes end up with a 64-bit integer load, from which we extract 11677 // two single-precision floating-point numbers. This happens with 11678 // std::complex<float>, and other similar structures, because of the way we 11679 // canonicalize structure copies. However, if we lack direct moves, 11680 // then the final bitcasts from the extracted integer values to the 11681 // floating-point numbers turn into store/load pairs.
Even with direct moves, 11682 // just loading the two floating-point numbers is likely better. 11683 auto ReplaceTwoFloatLoad = [&]() { 11684 if (VT != MVT::i64) 11685 return false; 11686 11687 if (LD->getExtensionType() != ISD::NON_EXTLOAD || 11688 LD->isVolatile()) 11689 return false; 11690 11691 // We're looking for a sequence like this: 11692 // t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64 11693 // t16: i64 = srl t13, Constant:i32<32> 11694 // t17: i32 = truncate t16 11695 // t18: f32 = bitcast t17 11696 // t19: i32 = truncate t13 11697 // t20: f32 = bitcast t19 11698 11699 if (!LD->hasNUsesOfValue(2, 0)) 11700 return false; 11701 11702 auto UI = LD->use_begin(); 11703 while (UI.getUse().getResNo() != 0) ++UI; 11704 SDNode *Trunc = *UI++; 11705 while (UI.getUse().getResNo() != 0) ++UI; 11706 SDNode *RightShift = *UI; 11707 if (Trunc->getOpcode() != ISD::TRUNCATE) 11708 std::swap(Trunc, RightShift); 11709 11710 if (Trunc->getOpcode() != ISD::TRUNCATE || 11711 Trunc->getValueType(0) != MVT::i32 || 11712 !Trunc->hasOneUse()) 11713 return false; 11714 if (RightShift->getOpcode() != ISD::SRL || 11715 !isa<ConstantSDNode>(RightShift->getOperand(1)) || 11716 RightShift->getConstantOperandVal(1) != 32 || 11717 !RightShift->hasOneUse()) 11718 return false; 11719 11720 SDNode *Trunc2 = *RightShift->use_begin(); 11721 if (Trunc2->getOpcode() != ISD::TRUNCATE || 11722 Trunc2->getValueType(0) != MVT::i32 || 11723 !Trunc2->hasOneUse()) 11724 return false; 11725 11726 SDNode *Bitcast = *Trunc->use_begin(); 11727 SDNode *Bitcast2 = *Trunc2->use_begin(); 11728 11729 if (Bitcast->getOpcode() != ISD::BITCAST || 11730 Bitcast->getValueType(0) != MVT::f32) 11731 return false; 11732 if (Bitcast2->getOpcode() != ISD::BITCAST || 11733 Bitcast2->getValueType(0) != MVT::f32) 11734 return false; 11735 11736 if (Subtarget.isLittleEndian()) 11737 std::swap(Bitcast, Bitcast2); 11738 11739 // Bitcast has the second float (in memory-layout order) and Bitcast2 11740 // has the first one. 11741 11742 SDValue BasePtr = LD->getBasePtr(); 11743 if (LD->isIndexed()) { 11744 assert(LD->getAddressingMode() == ISD::PRE_INC && 11745 "Non-pre-inc AM on PPC?"); 11746 BasePtr = 11747 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, 11748 LD->getOffset()); 11749 } 11750 11751 auto MMOFlags = 11752 LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile; 11753 SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr, 11754 LD->getPointerInfo(), LD->getAlignment(), 11755 MMOFlags, LD->getAAInfo()); 11756 SDValue AddPtr = 11757 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), 11758 BasePtr, DAG.getIntPtrConstant(4, dl)); 11759 SDValue FloatLoad2 = DAG.getLoad( 11760 MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr, 11761 LD->getPointerInfo().getWithOffset(4), 11762 MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo()); 11763 11764 if (LD->isIndexed()) { 11765 // Note that DAGCombine should re-form any pre-increment load(s) from 11766 // what is produced here if that makes sense. 11767 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr); 11768 } 11769 11770 DCI.CombineTo(Bitcast2, FloatLoad); 11771 DCI.CombineTo(Bitcast, FloatLoad2); 11772 11773 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 
2 : 1), 11774 SDValue(FloatLoad2.getNode(), 1)); 11775 return true; 11776 }; 11777 11778 if (ReplaceTwoFloatLoad()) 11779 return SDValue(N, 0); 11780 11781 EVT MemVT = LD->getMemoryVT(); 11782 Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); 11783 unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); 11784 Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext()); 11785 unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy); 11786 if (LD->isUnindexed() && VT.isVector() && 11787 ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) && 11788 // P8 and later hardware should just use LOAD. 11789 !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 || 11790 VT == MVT::v4i32 || VT == MVT::v4f32)) || 11791 (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) && 11792 LD->getAlignment() >= ScalarABIAlignment)) && 11793 LD->getAlignment() < ABIAlignment) { 11794 // This is a type-legal unaligned Altivec or QPX load. 11795 SDValue Chain = LD->getChain(); 11796 SDValue Ptr = LD->getBasePtr(); 11797 bool isLittleEndian = Subtarget.isLittleEndian(); 11798 11799 // This implements the loading of unaligned vectors as described in 11800 // the venerable Apple Velocity Engine overview. Specifically: 11801 // https://developer.apple.com/hardwaredrivers/ve/alignment.html 11802 // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html 11803 // 11804 // The general idea is to expand a sequence of one or more unaligned 11805 // loads into an alignment-based permutation-control instruction (lvsl 11806 // or lvsr), a series of regular vector loads (which always truncate 11807 // their input address to an aligned address), and a series of 11808 // permutations. The results of these permutations are the requested 11809 // loaded values. The trick is that the last "extra" load is not taken 11810 // from the address you might suspect (sizeof(vector) bytes after the 11811 // last requested load), but rather sizeof(vector) - 1 bytes after the 11812 // last requested vector. The point of this is to avoid a page fault if 11813 // the base address happened to be aligned. This works because if the 11814 // base address is aligned, then adding less than a full vector length 11815 // will cause the last vector in the sequence to be (re)loaded. 11816 // Otherwise, the next vector will be fetched as you might suspect was 11817 // necessary. 11818 11819 // We might be able to reuse the permutation generation from 11820 // a different base address offset from this one by an aligned amount. 11821 // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this 11822 // optimization later. 11823 Intrinsic::ID Intr, IntrLD, IntrPerm; 11824 MVT PermCntlTy, PermTy, LDTy; 11825 if (Subtarget.hasAltivec()) { 11826 Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr : 11827 Intrinsic::ppc_altivec_lvsl; 11828 IntrLD = Intrinsic::ppc_altivec_lvx; 11829 IntrPerm = Intrinsic::ppc_altivec_vperm; 11830 PermCntlTy = MVT::v16i8; 11831 PermTy = MVT::v4i32; 11832 LDTy = MVT::v4i32; 11833 } else { 11834 Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld : 11835 Intrinsic::ppc_qpx_qvlpcls; 11836 IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd : 11837 Intrinsic::ppc_qpx_qvlfs; 11838 IntrPerm = Intrinsic::ppc_qpx_qvfperm; 11839 PermCntlTy = MVT::v4f64; 11840 PermTy = MVT::v4f64; 11841 LDTy = MemVT.getSimpleVT(); 11842 } 11843 11844 SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy); 11845 11846 // Create the new MMO for the new base load. 
It is like the original MMO, 11847 // but represents an area in memory almost twice the vector size centered 11848 // on the original address. If the address is unaligned, we might start 11849 // reading up to (sizeof(vector)-1) bytes below the address of the 11850 // original unaligned load. 11851 MachineFunction &MF = DAG.getMachineFunction(); 11852 MachineMemOperand *BaseMMO = 11853 MF.getMachineMemOperand(LD->getMemOperand(), 11854 -(long)MemVT.getStoreSize()+1, 11855 2*MemVT.getStoreSize()-1); 11856 11857 // Create the new base load. 11858 SDValue LDXIntID = 11859 DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout())); 11860 SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr }; 11861 SDValue BaseLoad = 11862 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, 11863 DAG.getVTList(PermTy, MVT::Other), 11864 BaseLoadOps, LDTy, BaseMMO); 11865 11866 // Note that the value of IncOffset (which is provided to the next 11867 // load's pointer info offset value, and thus used to calculate the 11868 // alignment), and the value of IncValue (which is actually used to 11869 // increment the pointer value) are different! This is because we 11870 // require the next load to appear to be aligned, even though it 11871 // is actually offset from the base pointer by a lesser amount. 11872 int IncOffset = VT.getSizeInBits() / 8; 11873 int IncValue = IncOffset; 11874 11875 // Walk (both up and down) the chain looking for another load at the real 11876 // (aligned) offset (the alignment of the other load does not matter in 11877 // this case). If found, then do not use the offset reduction trick, as 11878 // that will prevent the loads from being later combined (as they would 11879 // otherwise be duplicates). 11880 if (!findConsecutiveLoad(LD, DAG)) 11881 --IncValue; 11882 11883 SDValue Increment = 11884 DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout())); 11885 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 11886 11887 MachineMemOperand *ExtraMMO = 11888 MF.getMachineMemOperand(LD->getMemOperand(), 11889 1, 2*MemVT.getStoreSize()-1); 11890 SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr }; 11891 SDValue ExtraLoad = 11892 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, 11893 DAG.getVTList(PermTy, MVT::Other), 11894 ExtraLoadOps, LDTy, ExtraMMO); 11895 11896 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 11897 BaseLoad.getValue(1), ExtraLoad.getValue(1)); 11898 11899 // Because vperm has a big-endian bias, we must reverse the order 11900 // of the input vectors and complement the permute control vector 11901 // when generating little endian code. We have already handled the 11902 // latter by using lvsr instead of lvsl, so just reverse BaseLoad 11903 // and ExtraLoad here. 11904 SDValue Perm; 11905 if (isLittleEndian) 11906 Perm = BuildIntrinsicOp(IntrPerm, 11907 ExtraLoad, BaseLoad, PermCntl, DAG, dl); 11908 else 11909 Perm = BuildIntrinsicOp(IntrPerm, 11910 BaseLoad, ExtraLoad, PermCntl, DAG, dl); 11911 11912 if (VT != PermTy) 11913 Perm = Subtarget.hasAltivec() ? 11914 DAG.getNode(ISD::BITCAST, dl, VT, Perm) : 11915 DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX 11916 DAG.getTargetConstant(1, dl, MVT::i64)); 11917 // second argument is 1 because this rounding 11918 // is always exact. 11919 11920 // The output of the permutation is our loaded result, the TokenFactor is 11921 // our new chain. 
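  // For illustration (big-endian Altivec case; register names are made up),
  // an unaligned v4i32 load from r3 roughly becomes:
  //   lvsl  v2, 0, r3        ; permute control from the low address bits
  //   lvx   v3, 0, r3        ; aligned load covering the start of the data
  //   lvx   v4, rInc, r3     ; second aligned load; rInc is IncValue above
  //   vperm v5, v3, v4, v2   ; select the 16 requested bytes
  // On little-endian subtargets lvsr is used instead and the vperm inputs
  // are swapped, exactly as done here.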
11922 DCI.CombineTo(N, Perm, TF); 11923 return SDValue(N, 0); 11924 } 11925 } 11926 break; 11927 case ISD::INTRINSIC_WO_CHAIN: { 11928 bool isLittleEndian = Subtarget.isLittleEndian(); 11929 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 11930 Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr 11931 : Intrinsic::ppc_altivec_lvsl); 11932 if ((IID == Intr || 11933 IID == Intrinsic::ppc_qpx_qvlpcld || 11934 IID == Intrinsic::ppc_qpx_qvlpcls) && 11935 N->getOperand(1)->getOpcode() == ISD::ADD) { 11936 SDValue Add = N->getOperand(1); 11937 11938 int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ? 11939 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */; 11940 11941 if (DAG.MaskedValueIsZero(Add->getOperand(1), 11942 APInt::getAllOnesValue(Bits /* alignment */) 11943 .zext(Add.getScalarValueSizeInBits()))) { 11944 SDNode *BasePtr = Add->getOperand(0).getNode(); 11945 for (SDNode::use_iterator UI = BasePtr->use_begin(), 11946 UE = BasePtr->use_end(); 11947 UI != UE; ++UI) { 11948 if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && 11949 cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) { 11950 // We've found another LVSL/LVSR, and this address is an aligned 11951 // multiple of that one. The results will be the same, so use the 11952 // one we've just found instead. 11953 11954 return SDValue(*UI, 0); 11955 } 11956 } 11957 } 11958 11959 if (isa<ConstantSDNode>(Add->getOperand(1))) { 11960 SDNode *BasePtr = Add->getOperand(0).getNode(); 11961 for (SDNode::use_iterator UI = BasePtr->use_begin(), 11962 UE = BasePtr->use_end(); UI != UE; ++UI) { 11963 if (UI->getOpcode() == ISD::ADD && 11964 isa<ConstantSDNode>(UI->getOperand(1)) && 11965 (cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() - 11966 cast<ConstantSDNode>(UI->getOperand(1))->getZExtValue()) % 11967 (1ULL << Bits) == 0) { 11968 SDNode *OtherAdd = *UI; 11969 for (SDNode::use_iterator VI = OtherAdd->use_begin(), 11970 VE = OtherAdd->use_end(); VI != VE; ++VI) { 11971 if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && 11972 cast<ConstantSDNode>(VI->getOperand(0))->getZExtValue() == IID) { 11973 return SDValue(*VI, 0); 11974 } 11975 } 11976 } 11977 } 11978 } 11979 } 11980 } 11981 11982 break; 11983 case ISD::INTRINSIC_W_CHAIN: 11984 // For little endian, VSX loads require generating lxvd2x/xxswapd. 11985 // Not needed on ISA 3.0 based CPUs since we have a non-permuting load. 11986 if (Subtarget.needsSwapsForVSXMemOps()) { 11987 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 11988 default: 11989 break; 11990 case Intrinsic::ppc_vsx_lxvw4x: 11991 case Intrinsic::ppc_vsx_lxvd2x: 11992 return expandVSXLoadForLE(N, DCI); 11993 } 11994 } 11995 break; 11996 case ISD::INTRINSIC_VOID: 11997 // For little endian, VSX stores require generating xxswapd/stxvd2x. 11998 // Not needed on ISA 3.0 based CPUs since we have a non-permuting store. 11999 if (Subtarget.needsSwapsForVSXMemOps()) { 12000 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 12001 default: 12002 break; 12003 case Intrinsic::ppc_vsx_stxvw4x: 12004 case Intrinsic::ppc_vsx_stxvd2x: 12005 return expandVSXStoreForLE(N, DCI); 12006 } 12007 } 12008 break; 12009 case ISD::BSWAP: 12010 // Turn BSWAP (LOAD) -> lhbrx/lwbrx. 
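  // A sketch of the rewrite (node numbering illustrative):
  //   t1: i32,ch = load t0, t2, undef
  //   t3: i32 = bswap t1
  // becomes
  //   t4: i32,ch = PPCISD::LBRX t0, t2, ValueType:i32
  // which selects to lwbrx (lhbrx for i16, with the truncate re-inserted
  // below; ldbrx for i64 when available).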
12011 if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) && 12012 N->getOperand(0).hasOneUse() && 12013 (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 || 12014 (Subtarget.hasLDBRX() && Subtarget.isPPC64() && 12015 N->getValueType(0) == MVT::i64))) { 12016 SDValue Load = N->getOperand(0); 12017 LoadSDNode *LD = cast<LoadSDNode>(Load); 12018 // Create the byte-swapping load. 12019 SDValue Ops[] = { 12020 LD->getChain(), // Chain 12021 LD->getBasePtr(), // Ptr 12022 DAG.getValueType(N->getValueType(0)) // VT 12023 }; 12024 SDValue BSLoad = 12025 DAG.getMemIntrinsicNode(PPCISD::LBRX, dl, 12026 DAG.getVTList(N->getValueType(0) == MVT::i64 ? 12027 MVT::i64 : MVT::i32, MVT::Other), 12028 Ops, LD->getMemoryVT(), LD->getMemOperand()); 12029 12030 // If this is an i16 load, insert the truncate. 12031 SDValue ResVal = BSLoad; 12032 if (N->getValueType(0) == MVT::i16) 12033 ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad); 12034 12035 // First, combine the bswap away. This makes the value produced by the 12036 // load dead. 12037 DCI.CombineTo(N, ResVal); 12038 12039 // Next, combine the load away, we give it a bogus result value but a real 12040 // chain result. The result value is dead because the bswap is dead. 12041 DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1)); 12042 12043 // Return N so it doesn't get rechecked! 12044 return SDValue(N, 0); 12045 } 12046 break; 12047 case PPCISD::VCMP: 12048 // If a VCMPo node already exists with exactly the same operands as this 12049 // node, use its result instead of this node (VCMPo computes both a CR6 and 12050 // a normal output). 12051 // 12052 if (!N->getOperand(0).hasOneUse() && 12053 !N->getOperand(1).hasOneUse() && 12054 !N->getOperand(2).hasOneUse()) { 12055 12056 // Scan all of the users of the LHS, looking for VCMPo's that match. 12057 SDNode *VCMPoNode = nullptr; 12058 12059 SDNode *LHSN = N->getOperand(0).getNode(); 12060 for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end(); 12061 UI != E; ++UI) 12062 if (UI->getOpcode() == PPCISD::VCMPo && 12063 UI->getOperand(1) == N->getOperand(1) && 12064 UI->getOperand(2) == N->getOperand(2) && 12065 UI->getOperand(0) == N->getOperand(0)) { 12066 VCMPoNode = *UI; 12067 break; 12068 } 12069 12070 // If there is no VCMPo node, or if the flag value has a single use, don't 12071 // transform this. 12072 if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1)) 12073 break; 12074 12075 // Look at the (necessarily single) use of the flag value. If it has a 12076 // chain, this transformation is more complex. Note that multiple things 12077 // could use the value result, which we should ignore. 12078 SDNode *FlagUser = nullptr; 12079 for (SDNode::use_iterator UI = VCMPoNode->use_begin(); 12080 FlagUser == nullptr; ++UI) { 12081 assert(UI != VCMPoNode->use_end() && "Didn't find user!"); 12082 SDNode *User = *UI; 12083 for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) { 12084 if (User->getOperand(i) == SDValue(VCMPoNode, 1)) { 12085 FlagUser = User; 12086 break; 12087 } 12088 } 12089 } 12090 12091 // If the user is a MFOCRF instruction, we know this is safe. 12092 // Otherwise we give up for right now. 
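  // (VCMPo corresponds to the record ("dot") forms such as vcmpequw., which
  // produce the vector result and set CR6 in a single instruction, so one
  // such node can satisfy both the value use and the CR6 use.)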
12093 if (FlagUser->getOpcode() == PPCISD::MFOCRF) 12094 return SDValue(VCMPoNode, 0); 12095 } 12096 break; 12097 case ISD::BRCOND: { 12098 SDValue Cond = N->getOperand(1); 12099 SDValue Target = N->getOperand(2); 12100 12101 if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN && 12102 cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() == 12103 Intrinsic::ppc_is_decremented_ctr_nonzero) { 12104 12105 // We now need to make the intrinsic dead (it cannot be instruction 12106 // selected). 12107 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0)); 12108 assert(Cond.getNode()->hasOneUse() && 12109 "Counter decrement has more than one use"); 12110 12111 return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other, 12112 N->getOperand(0), Target); 12113 } 12114 } 12115 break; 12116 case ISD::BR_CC: { 12117 // If this is a branch on an altivec predicate comparison, lower this so 12118 // that we don't have to do a MFOCRF: instead, branch directly on CR6. This 12119 // lowering is done pre-legalize, because the legalizer lowers the predicate 12120 // compare down to code that is difficult to reassemble. 12121 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); 12122 SDValue LHS = N->getOperand(2), RHS = N->getOperand(3); 12123 12124 // Sometimes the promoted value of the intrinsic is ANDed by some non-zero 12125 // value. If so, pass-through the AND to get to the intrinsic. 12126 if (LHS.getOpcode() == ISD::AND && 12127 LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN && 12128 cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() == 12129 Intrinsic::ppc_is_decremented_ctr_nonzero && 12130 isa<ConstantSDNode>(LHS.getOperand(1)) && 12131 !isNullConstant(LHS.getOperand(1))) 12132 LHS = LHS.getOperand(0); 12133 12134 if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && 12135 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() == 12136 Intrinsic::ppc_is_decremented_ctr_nonzero && 12137 isa<ConstantSDNode>(RHS)) { 12138 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && 12139 "Counter decrement comparison is not EQ or NE"); 12140 12141 unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue(); 12142 bool isBDNZ = (CC == ISD::SETEQ && Val) || 12143 (CC == ISD::SETNE && !Val); 12144 12145 // We now need to make the intrinsic dead (it cannot be instruction 12146 // selected). 12147 DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0)); 12148 assert(LHS.getNode()->hasOneUse() && 12149 "Counter decrement has more than one use"); 12150 12151 return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other, 12152 N->getOperand(0), N->getOperand(4)); 12153 } 12154 12155 int CompareOpc; 12156 bool isDot; 12157 12158 if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN && 12159 isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) && 12160 getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { 12161 assert(isDot && "Can't compare against a vector result!"); 12162 12163 // If this is a comparison against something other than 0/1, then we know 12164 // that the condition is never/always true. 12165 unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue(); 12166 if (Val != 0 && Val != 1) { 12167 if (CC == ISD::SETEQ) // Cond never true, remove branch. 12168 return N->getOperand(0); 12169 // Always !=, turn it into an unconditional branch. 
12170 return DAG.getNode(ISD::BR, dl, MVT::Other, 12171 N->getOperand(0), N->getOperand(4)); 12172 } 12173 12174 bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0); 12175 12176 // Create the PPCISD altivec 'dot' comparison node. 12177 SDValue Ops[] = { 12178 LHS.getOperand(2), // LHS of compare 12179 LHS.getOperand(3), // RHS of compare 12180 DAG.getConstant(CompareOpc, dl, MVT::i32) 12181 }; 12182 EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue }; 12183 SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); 12184 12185 // Unpack the result based on how the target uses it. 12186 PPC::Predicate CompOpc; 12187 switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) { 12188 default: // Can't happen, don't crash on invalid number though. 12189 case 0: // Branch on the value of the EQ bit of CR6. 12190 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE; 12191 break; 12192 case 1: // Branch on the inverted value of the EQ bit of CR6. 12193 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ; 12194 break; 12195 case 2: // Branch on the value of the LT bit of CR6. 12196 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE; 12197 break; 12198 case 3: // Branch on the inverted value of the LT bit of CR6. 12199 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT; 12200 break; 12201 } 12202 12203 return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0), 12204 DAG.getConstant(CompOpc, dl, MVT::i32), 12205 DAG.getRegister(PPC::CR6, MVT::i32), 12206 N->getOperand(4), CompNode.getValue(1)); 12207 } 12208 break; 12209 } 12210 case ISD::BUILD_VECTOR: 12211 return DAGCombineBuildVector(N, DCI); 12212 } 12213 12214 return SDValue(); 12215 } 12216 12217 SDValue 12218 PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 12219 SelectionDAG &DAG, 12220 std::vector<SDNode *> *Created) const { 12221 // fold (sdiv X, pow2) 12222 EVT VT = N->getValueType(0); 12223 if (VT == MVT::i64 && !Subtarget.isPPC64()) 12224 return SDValue(); 12225 if ((VT != MVT::i32 && VT != MVT::i64) || 12226 !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) 12227 return SDValue(); 12228 12229 SDLoc DL(N); 12230 SDValue N0 = N->getOperand(0); 12231 12232 bool IsNegPow2 = (-Divisor).isPowerOf2(); 12233 unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros(); 12234 SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT); 12235 12236 SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt); 12237 if (Created) 12238 Created->push_back(Op.getNode()); 12239 12240 if (IsNegPow2) { 12241 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op); 12242 if (Created) 12243 Created->push_back(Op.getNode()); 12244 } 12245 12246 return Op; 12247 } 12248 12249 //===----------------------------------------------------------------------===// 12250 // Inline Assembly Support 12251 //===----------------------------------------------------------------------===// 12252 12253 void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 12254 KnownBits &Known, 12255 const APInt &DemandedElts, 12256 const SelectionDAG &DAG, 12257 unsigned Depth) const { 12258 Known.resetAll(); 12259 switch (Op.getOpcode()) { 12260 default: break; 12261 case PPCISD::LBRX: { 12262 // lhbrx is known to have the top bits cleared out. 
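    // For example, a byte-reversed halfword load yields a value in the range
    // [0, 0xFFFF], so bits 16-31 of the i32 result are known to be zero.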
12263 if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16) 12264 Known.Zero = 0xFFFF0000; 12265 break; 12266 } 12267 case ISD::INTRINSIC_WO_CHAIN: { 12268 switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { 12269 default: break; 12270 case Intrinsic::ppc_altivec_vcmpbfp_p: 12271 case Intrinsic::ppc_altivec_vcmpeqfp_p: 12272 case Intrinsic::ppc_altivec_vcmpequb_p: 12273 case Intrinsic::ppc_altivec_vcmpequh_p: 12274 case Intrinsic::ppc_altivec_vcmpequw_p: 12275 case Intrinsic::ppc_altivec_vcmpequd_p: 12276 case Intrinsic::ppc_altivec_vcmpgefp_p: 12277 case Intrinsic::ppc_altivec_vcmpgtfp_p: 12278 case Intrinsic::ppc_altivec_vcmpgtsb_p: 12279 case Intrinsic::ppc_altivec_vcmpgtsh_p: 12280 case Intrinsic::ppc_altivec_vcmpgtsw_p: 12281 case Intrinsic::ppc_altivec_vcmpgtsd_p: 12282 case Intrinsic::ppc_altivec_vcmpgtub_p: 12283 case Intrinsic::ppc_altivec_vcmpgtuh_p: 12284 case Intrinsic::ppc_altivec_vcmpgtuw_p: 12285 case Intrinsic::ppc_altivec_vcmpgtud_p: 12286 Known.Zero = ~1U; // All bits but the low one are known to be zero. 12287 break; 12288 } 12289 } 12290 } 12291 } 12292 12293 unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { 12294 switch (Subtarget.getDarwinDirective()) { 12295 default: break; 12296 case PPC::DIR_970: 12297 case PPC::DIR_PWR4: 12298 case PPC::DIR_PWR5: 12299 case PPC::DIR_PWR5X: 12300 case PPC::DIR_PWR6: 12301 case PPC::DIR_PWR6X: 12302 case PPC::DIR_PWR7: 12303 case PPC::DIR_PWR8: 12304 case PPC::DIR_PWR9: { 12305 if (!ML) 12306 break; 12307 12308 const PPCInstrInfo *TII = Subtarget.getInstrInfo(); 12309 12310 // For small loops (between 5 and 8 instructions), align to a 32-byte 12311 // boundary so that the entire loop fits in one instruction-cache line. 12312 uint64_t LoopSize = 0; 12313 for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I) 12314 for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) { 12315 LoopSize += TII->getInstSizeInBytes(*J); 12316 if (LoopSize > 32) 12317 break; 12318 } 12319 12320 if (LoopSize > 16 && LoopSize <= 32) 12321 return 5; 12322 12323 break; 12324 } 12325 } 12326 12327 return TargetLowering::getPrefLoopAlignment(ML); 12328 } 12329 12330 /// getConstraintType - Given a constraint, return the type of 12331 /// constraint it is for this target. 12332 PPCTargetLowering::ConstraintType 12333 PPCTargetLowering::getConstraintType(StringRef Constraint) const { 12334 if (Constraint.size() == 1) { 12335 switch (Constraint[0]) { 12336 default: break; 12337 case 'b': 12338 case 'r': 12339 case 'f': 12340 case 'd': 12341 case 'v': 12342 case 'y': 12343 return C_RegisterClass; 12344 case 'Z': 12345 // FIXME: While Z does indicate a memory constraint, it specifically 12346 // indicates an r+r address (used in conjunction with the 'y' modifier 12347 // in the replacement string). Currently, we're forcing the base 12348 // register to be r0 in the asm printer (which is interpreted as zero) 12349 // and forming the complete address in the second register. This is 12350 // suboptimal. 12351 return C_Memory; 12352 } 12353 } else if (Constraint == "wc") { // individual CR bits. 12354 return C_RegisterClass; 12355 } else if (Constraint == "wa" || Constraint == "wd" || 12356 Constraint == "wf" || Constraint == "ws") { 12357 return C_RegisterClass; // VSX registers. 12358 } 12359 return TargetLowering::getConstraintType(Constraint); 12360 } 12361 12362 /// Examine constraint type and operand type and determine a weight value. 
12363 /// This object must already have been set up with the operand type 12364 /// and the current alternative constraint selected. 12365 TargetLowering::ConstraintWeight 12366 PPCTargetLowering::getSingleConstraintMatchWeight( 12367 AsmOperandInfo &info, const char *constraint) const { 12368 ConstraintWeight weight = CW_Invalid; 12369 Value *CallOperandVal = info.CallOperandVal; 12370 // If we don't have a value, we can't do a match, 12371 // but allow it at the lowest weight. 12372 if (!CallOperandVal) 12373 return CW_Default; 12374 Type *type = CallOperandVal->getType(); 12375 12376 // Look at the constraint type. 12377 if (StringRef(constraint) == "wc" && type->isIntegerTy(1)) 12378 return CW_Register; // an individual CR bit. 12379 else if ((StringRef(constraint) == "wa" || 12380 StringRef(constraint) == "wd" || 12381 StringRef(constraint) == "wf") && 12382 type->isVectorTy()) 12383 return CW_Register; 12384 else if (StringRef(constraint) == "ws" && type->isDoubleTy()) 12385 return CW_Register; 12386 12387 switch (*constraint) { 12388 default: 12389 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 12390 break; 12391 case 'b': 12392 if (type->isIntegerTy()) 12393 weight = CW_Register; 12394 break; 12395 case 'f': 12396 if (type->isFloatTy()) 12397 weight = CW_Register; 12398 break; 12399 case 'd': 12400 if (type->isDoubleTy()) 12401 weight = CW_Register; 12402 break; 12403 case 'v': 12404 if (type->isVectorTy()) 12405 weight = CW_Register; 12406 break; 12407 case 'y': 12408 weight = CW_Register; 12409 break; 12410 case 'Z': 12411 weight = CW_Memory; 12412 break; 12413 } 12414 return weight; 12415 } 12416 12417 std::pair<unsigned, const TargetRegisterClass *> 12418 PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 12419 StringRef Constraint, 12420 MVT VT) const { 12421 if (Constraint.size() == 1) { 12422 // GCC RS6000 Constraint Letters 12423 switch (Constraint[0]) { 12424 case 'b': // R1-R31 12425 if (VT == MVT::i64 && Subtarget.isPPC64()) 12426 return std::make_pair(0U, &PPC::G8RC_NOX0RegClass); 12427 return std::make_pair(0U, &PPC::GPRC_NOR0RegClass); 12428 case 'r': // R0-R31 12429 if (VT == MVT::i64 && Subtarget.isPPC64()) 12430 return std::make_pair(0U, &PPC::G8RCRegClass); 12431 return std::make_pair(0U, &PPC::GPRCRegClass); 12432 // 'd' and 'f' constraints are both defined to be "the floating point 12433 // registers", where one is for 32-bit and the other for 64-bit. We don't 12434 // really care overly much here so just give them all the same reg classes. 12435 case 'd': 12436 case 'f': 12437 if (VT == MVT::f32 || VT == MVT::i32) 12438 return std::make_pair(0U, &PPC::F4RCRegClass); 12439 if (VT == MVT::f64 || VT == MVT::i64) 12440 return std::make_pair(0U, &PPC::F8RCRegClass); 12441 if (VT == MVT::v4f64 && Subtarget.hasQPX()) 12442 return std::make_pair(0U, &PPC::QFRCRegClass); 12443 if (VT == MVT::v4f32 && Subtarget.hasQPX()) 12444 return std::make_pair(0U, &PPC::QSRCRegClass); 12445 break; 12446 case 'v': 12447 if (VT == MVT::v4f64 && Subtarget.hasQPX()) 12448 return std::make_pair(0U, &PPC::QFRCRegClass); 12449 if (VT == MVT::v4f32 && Subtarget.hasQPX()) 12450 return std::make_pair(0U, &PPC::QSRCRegClass); 12451 if (Subtarget.hasAltivec()) 12452 return std::make_pair(0U, &PPC::VRRCRegClass); 12453 case 'y': // crrc 12454 return std::make_pair(0U, &PPC::CRRCRegClass); 12455 } 12456 } else if (Constraint == "wc" && Subtarget.useCRBits()) { 12457 // An individual CR bit. 
12458 return std::make_pair(0U, &PPC::CRBITRCRegClass); 12459 } else if ((Constraint == "wa" || Constraint == "wd" || 12460 Constraint == "wf") && Subtarget.hasVSX()) { 12461 return std::make_pair(0U, &PPC::VSRCRegClass); 12462 } else if (Constraint == "ws" && Subtarget.hasVSX()) { 12463 if (VT == MVT::f32 && Subtarget.hasP8Vector()) 12464 return std::make_pair(0U, &PPC::VSSRCRegClass); 12465 else 12466 return std::make_pair(0U, &PPC::VSFRCRegClass); 12467 } 12468 12469 std::pair<unsigned, const TargetRegisterClass *> R = 12470 TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 12471 12472 // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers 12473 // (which we call X[0-9]+). If a 64-bit value has been requested, and a 12474 // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent 12475 // register. 12476 // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use 12477 // the AsmName field from *RegisterInfo.td, then this would not be necessary. 12478 if (R.first && VT == MVT::i64 && Subtarget.isPPC64() && 12479 PPC::GPRCRegClass.contains(R.first)) 12480 return std::make_pair(TRI->getMatchingSuperReg(R.first, 12481 PPC::sub_32, &PPC::G8RCRegClass), 12482 &PPC::G8RCRegClass); 12483 12484 // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same. 12485 if (!R.second && StringRef("{cc}").equals_lower(Constraint)) { 12486 R.first = PPC::CR0; 12487 R.second = &PPC::CRRCRegClass; 12488 } 12489 12490 return R; 12491 } 12492 12493 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 12494 /// vector. If it is invalid, don't add anything to Ops. 12495 void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 12496 std::string &Constraint, 12497 std::vector<SDValue>&Ops, 12498 SelectionDAG &DAG) const { 12499 SDValue Result; 12500 12501 // Only support length 1 constraints. 12502 if (Constraint.length() > 1) return; 12503 12504 char Letter = Constraint[0]; 12505 switch (Letter) { 12506 default: break; 12507 case 'I': 12508 case 'J': 12509 case 'K': 12510 case 'L': 12511 case 'M': 12512 case 'N': 12513 case 'O': 12514 case 'P': { 12515 ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op); 12516 if (!CST) return; // Must be an immediate to match. 12517 SDLoc dl(Op); 12518 int64_t Value = CST->getSExtValue(); 12519 EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative 12520 // numbers are printed as such. 12521 switch (Letter) { 12522 default: llvm_unreachable("Unknown constraint letter!"); 12523 case 'I': // "I" is a signed 16-bit constant. 12524 if (isInt<16>(Value)) 12525 Result = DAG.getTargetConstant(Value, dl, TCVT); 12526 break; 12527 case 'J': // "J" is a constant with only the high-order 16 bits nonzero. 12528 if (isShiftedUInt<16, 16>(Value)) 12529 Result = DAG.getTargetConstant(Value, dl, TCVT); 12530 break; 12531 case 'L': // "L" is a signed 16-bit constant shifted left 16 bits. 12532 if (isShiftedInt<16, 16>(Value)) 12533 Result = DAG.getTargetConstant(Value, dl, TCVT); 12534 break; 12535 case 'K': // "K" is a constant with only the low-order 16 bits nonzero. 12536 if (isUInt<16>(Value)) 12537 Result = DAG.getTargetConstant(Value, dl, TCVT); 12538 break; 12539 case 'M': // "M" is a constant that is greater than 31. 12540 if (Value > 31) 12541 Result = DAG.getTargetConstant(Value, dl, TCVT); 12542 break; 12543 case 'N': // "N" is a positive constant that is an exact power of two. 
12544 if (Value > 0 && isPowerOf2_64(Value)) 12545 Result = DAG.getTargetConstant(Value, dl, TCVT); 12546 break; 12547 case 'O': // "O" is the constant zero. 12548 if (Value == 0) 12549 Result = DAG.getTargetConstant(Value, dl, TCVT); 12550 break; 12551 case 'P': // "P" is a constant whose negation is a signed 16-bit constant. 12552 if (isInt<16>(-Value)) 12553 Result = DAG.getTargetConstant(Value, dl, TCVT); 12554 break; 12555 } 12556 break; 12557 } 12558 } 12559 12560 if (Result.getNode()) { 12561 Ops.push_back(Result); 12562 return; 12563 } 12564 12565 // Handle standard constraint letters. 12566 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 12567 } 12568 12569 // isLegalAddressingMode - Return true if the addressing mode represented 12570 // by AM is legal for this target, for a load/store of the specified type. 12571 bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL, 12572 const AddrMode &AM, Type *Ty, 12573 unsigned AS) const { 12574 // PPC does not allow r+i addressing modes for vectors! 12575 if (Ty->isVectorTy() && AM.BaseOffs != 0) 12576 return false; 12577 12578 // PPC allows a sign-extended 16-bit immediate field. 12579 if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1) 12580 return false; 12581 12582 // No global is ever allowed as a base. 12583 if (AM.BaseGV) 12584 return false; 12585 12586 // PPC only support r+r, 12587 switch (AM.Scale) { 12588 case 0: // "r+i" or just "i", depending on HasBaseReg. 12589 break; 12590 case 1: 12591 if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed. 12592 return false; 12593 // Otherwise we have r+r or r+i. 12594 break; 12595 case 2: 12596 if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed. 12597 return false; 12598 // Allow 2*r as r+r. 12599 break; 12600 default: 12601 // No other scales are supported. 12602 return false; 12603 } 12604 12605 return true; 12606 } 12607 12608 SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, 12609 SelectionDAG &DAG) const { 12610 MachineFunction &MF = DAG.getMachineFunction(); 12611 MachineFrameInfo &MFI = MF.getFrameInfo(); 12612 MFI.setReturnAddressIsTaken(true); 12613 12614 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 12615 return SDValue(); 12616 12617 SDLoc dl(Op); 12618 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 12619 12620 // Make sure the function does not optimize away the store of the RA to 12621 // the stack. 12622 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 12623 FuncInfo->setLRStoreRequired(); 12624 bool isPPC64 = Subtarget.isPPC64(); 12625 auto PtrVT = getPointerTy(MF.getDataLayout()); 12626 12627 if (Depth > 0) { 12628 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 12629 SDValue Offset = 12630 DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl, 12631 isPPC64 ? MVT::i64 : MVT::i32); 12632 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 12633 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), 12634 MachinePointerInfo()); 12635 } 12636 12637 // Just load the return address off the stack. 
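  // For illustration, a function such as
  //   void *f(void) { return __builtin_return_address(0); }
  // must keep the link-register spill in its prologue (hence the
  // setLRStoreRequired() call above) so the saved return address can simply
  // be reloaded here.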
12638 SDValue RetAddrFI = getReturnAddrFrameIndex(DAG); 12639 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, 12640 MachinePointerInfo()); 12641 } 12642 12643 SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, 12644 SelectionDAG &DAG) const { 12645 SDLoc dl(Op); 12646 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 12647 12648 MachineFunction &MF = DAG.getMachineFunction(); 12649 MachineFrameInfo &MFI = MF.getFrameInfo(); 12650 MFI.setFrameAddressIsTaken(true); 12651 12652 EVT PtrVT = getPointerTy(MF.getDataLayout()); 12653 bool isPPC64 = PtrVT == MVT::i64; 12654 12655 // Naked functions never have a frame pointer, and so we use r1. For all 12656 // other functions, this decision must be delayed until during PEI. 12657 unsigned FrameReg; 12658 if (MF.getFunction()->hasFnAttribute(Attribute::Naked)) 12659 FrameReg = isPPC64 ? PPC::X1 : PPC::R1; 12660 else 12661 FrameReg = isPPC64 ? PPC::FP8 : PPC::FP; 12662 12663 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, 12664 PtrVT); 12665 while (Depth--) 12666 FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(), 12667 FrameAddr, MachinePointerInfo()); 12668 return FrameAddr; 12669 } 12670 12671 // FIXME? Maybe this could be a TableGen attribute on some registers and 12672 // this table could be generated automatically from RegInfo. 12673 unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, 12674 SelectionDAG &DAG) const { 12675 bool isPPC64 = Subtarget.isPPC64(); 12676 bool isDarwinABI = Subtarget.isDarwinABI(); 12677 12678 if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) || 12679 (!isPPC64 && VT != MVT::i32)) 12680 report_fatal_error("Invalid register global variable type"); 12681 12682 bool is64Bit = isPPC64 && VT == MVT::i64; 12683 unsigned Reg = StringSwitch<unsigned>(RegName) 12684 .Case("r1", is64Bit ? PPC::X1 : PPC::R1) 12685 .Case("r2", (isDarwinABI || isPPC64) ? 0 : PPC::R2) 12686 .Case("r13", (!isPPC64 && isDarwinABI) ? 0 : 12687 (is64Bit ? PPC::X13 : PPC::R13)) 12688 .Default(0); 12689 12690 if (Reg) 12691 return Reg; 12692 report_fatal_error("Invalid register name global variable"); 12693 } 12694 12695 bool 12696 PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 12697 // The PowerPC target isn't yet aware of offsets. 
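  // That is, (add (GlobalAddress @g), 8) is kept as an explicit add rather
  // than being folded into a single @g+8 global-address node.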
12698 return false; 12699 } 12700 12701 bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 12702 const CallInst &I, 12703 unsigned Intrinsic) const { 12704 switch (Intrinsic) { 12705 case Intrinsic::ppc_qpx_qvlfd: 12706 case Intrinsic::ppc_qpx_qvlfs: 12707 case Intrinsic::ppc_qpx_qvlfcd: 12708 case Intrinsic::ppc_qpx_qvlfcs: 12709 case Intrinsic::ppc_qpx_qvlfiwa: 12710 case Intrinsic::ppc_qpx_qvlfiwz: 12711 case Intrinsic::ppc_altivec_lvx: 12712 case Intrinsic::ppc_altivec_lvxl: 12713 case Intrinsic::ppc_altivec_lvebx: 12714 case Intrinsic::ppc_altivec_lvehx: 12715 case Intrinsic::ppc_altivec_lvewx: 12716 case Intrinsic::ppc_vsx_lxvd2x: 12717 case Intrinsic::ppc_vsx_lxvw4x: { 12718 EVT VT; 12719 switch (Intrinsic) { 12720 case Intrinsic::ppc_altivec_lvebx: 12721 VT = MVT::i8; 12722 break; 12723 case Intrinsic::ppc_altivec_lvehx: 12724 VT = MVT::i16; 12725 break; 12726 case Intrinsic::ppc_altivec_lvewx: 12727 VT = MVT::i32; 12728 break; 12729 case Intrinsic::ppc_vsx_lxvd2x: 12730 VT = MVT::v2f64; 12731 break; 12732 case Intrinsic::ppc_qpx_qvlfd: 12733 VT = MVT::v4f64; 12734 break; 12735 case Intrinsic::ppc_qpx_qvlfs: 12736 VT = MVT::v4f32; 12737 break; 12738 case Intrinsic::ppc_qpx_qvlfcd: 12739 VT = MVT::v2f64; 12740 break; 12741 case Intrinsic::ppc_qpx_qvlfcs: 12742 VT = MVT::v2f32; 12743 break; 12744 default: 12745 VT = MVT::v4i32; 12746 break; 12747 } 12748 12749 Info.opc = ISD::INTRINSIC_W_CHAIN; 12750 Info.memVT = VT; 12751 Info.ptrVal = I.getArgOperand(0); 12752 Info.offset = -VT.getStoreSize()+1; 12753 Info.size = 2*VT.getStoreSize()-1; 12754 Info.align = 1; 12755 Info.vol = false; 12756 Info.readMem = true; 12757 Info.writeMem = false; 12758 return true; 12759 } 12760 case Intrinsic::ppc_qpx_qvlfda: 12761 case Intrinsic::ppc_qpx_qvlfsa: 12762 case Intrinsic::ppc_qpx_qvlfcda: 12763 case Intrinsic::ppc_qpx_qvlfcsa: 12764 case Intrinsic::ppc_qpx_qvlfiwaa: 12765 case Intrinsic::ppc_qpx_qvlfiwza: { 12766 EVT VT; 12767 switch (Intrinsic) { 12768 case Intrinsic::ppc_qpx_qvlfda: 12769 VT = MVT::v4f64; 12770 break; 12771 case Intrinsic::ppc_qpx_qvlfsa: 12772 VT = MVT::v4f32; 12773 break; 12774 case Intrinsic::ppc_qpx_qvlfcda: 12775 VT = MVT::v2f64; 12776 break; 12777 case Intrinsic::ppc_qpx_qvlfcsa: 12778 VT = MVT::v2f32; 12779 break; 12780 default: 12781 VT = MVT::v4i32; 12782 break; 12783 } 12784 12785 Info.opc = ISD::INTRINSIC_W_CHAIN; 12786 Info.memVT = VT; 12787 Info.ptrVal = I.getArgOperand(0); 12788 Info.offset = 0; 12789 Info.size = VT.getStoreSize(); 12790 Info.align = 1; 12791 Info.vol = false; 12792 Info.readMem = true; 12793 Info.writeMem = false; 12794 return true; 12795 } 12796 case Intrinsic::ppc_qpx_qvstfd: 12797 case Intrinsic::ppc_qpx_qvstfs: 12798 case Intrinsic::ppc_qpx_qvstfcd: 12799 case Intrinsic::ppc_qpx_qvstfcs: 12800 case Intrinsic::ppc_qpx_qvstfiw: 12801 case Intrinsic::ppc_altivec_stvx: 12802 case Intrinsic::ppc_altivec_stvxl: 12803 case Intrinsic::ppc_altivec_stvebx: 12804 case Intrinsic::ppc_altivec_stvehx: 12805 case Intrinsic::ppc_altivec_stvewx: 12806 case Intrinsic::ppc_vsx_stxvd2x: 12807 case Intrinsic::ppc_vsx_stxvw4x: { 12808 EVT VT; 12809 switch (Intrinsic) { 12810 case Intrinsic::ppc_altivec_stvebx: 12811 VT = MVT::i8; 12812 break; 12813 case Intrinsic::ppc_altivec_stvehx: 12814 VT = MVT::i16; 12815 break; 12816 case Intrinsic::ppc_altivec_stvewx: 12817 VT = MVT::i32; 12818 break; 12819 case Intrinsic::ppc_vsx_stxvd2x: 12820 VT = MVT::v2f64; 12821 break; 12822 case Intrinsic::ppc_qpx_qvstfd: 12823 VT = MVT::v4f64; 12824 break; 12825 case 
Intrinsic::ppc_qpx_qvstfs: 12826 VT = MVT::v4f32; 12827 break; 12828 case Intrinsic::ppc_qpx_qvstfcd: 12829 VT = MVT::v2f64; 12830 break; 12831 case Intrinsic::ppc_qpx_qvstfcs: 12832 VT = MVT::v2f32; 12833 break; 12834 default: 12835 VT = MVT::v4i32; 12836 break; 12837 } 12838 12839 Info.opc = ISD::INTRINSIC_VOID; 12840 Info.memVT = VT; 12841 Info.ptrVal = I.getArgOperand(1); 12842 Info.offset = -VT.getStoreSize()+1; 12843 Info.size = 2*VT.getStoreSize()-1; 12844 Info.align = 1; 12845 Info.vol = false; 12846 Info.readMem = false; 12847 Info.writeMem = true; 12848 return true; 12849 } 12850 case Intrinsic::ppc_qpx_qvstfda: 12851 case Intrinsic::ppc_qpx_qvstfsa: 12852 case Intrinsic::ppc_qpx_qvstfcda: 12853 case Intrinsic::ppc_qpx_qvstfcsa: 12854 case Intrinsic::ppc_qpx_qvstfiwa: { 12855 EVT VT; 12856 switch (Intrinsic) { 12857 case Intrinsic::ppc_qpx_qvstfda: 12858 VT = MVT::v4f64; 12859 break; 12860 case Intrinsic::ppc_qpx_qvstfsa: 12861 VT = MVT::v4f32; 12862 break; 12863 case Intrinsic::ppc_qpx_qvstfcda: 12864 VT = MVT::v2f64; 12865 break; 12866 case Intrinsic::ppc_qpx_qvstfcsa: 12867 VT = MVT::v2f32; 12868 break; 12869 default: 12870 VT = MVT::v4i32; 12871 break; 12872 } 12873 12874 Info.opc = ISD::INTRINSIC_VOID; 12875 Info.memVT = VT; 12876 Info.ptrVal = I.getArgOperand(1); 12877 Info.offset = 0; 12878 Info.size = VT.getStoreSize(); 12879 Info.align = 1; 12880 Info.vol = false; 12881 Info.readMem = false; 12882 Info.writeMem = true; 12883 return true; 12884 } 12885 default: 12886 break; 12887 } 12888 12889 return false; 12890 } 12891 12892 /// getOptimalMemOpType - Returns the target specific optimal type for load 12893 /// and store operations as a result of memset, memcpy, and memmove 12894 /// lowering. If DstAlign is zero that means it's safe to destination 12895 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it 12896 /// means there isn't a need to check it against alignment requirement, 12897 /// probably because the source does not need to be loaded. If 'IsMemset' is 12898 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that 12899 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy 12900 /// source is constant so it does not need to be loaded. 12901 /// It returns EVT::Other if the type should be determined using generic 12902 /// target-independent logic. 12903 EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, 12904 unsigned DstAlign, unsigned SrcAlign, 12905 bool IsMemset, bool ZeroMemset, 12906 bool MemcpyStrSrc, 12907 MachineFunction &MF) const { 12908 if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { 12909 const Function *F = MF.getFunction(); 12910 // When expanding a memset, require at least two QPX instructions to cover 12911 // the cost of loading the value to be stored from the constant pool. 12912 if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) && 12913 (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) && 12914 !F->hasFnAttribute(Attribute::NoImplicitFloat)) { 12915 return MVT::v4f64; 12916 } 12917 12918 // We should use Altivec/VSX loads and stores when available. For unaligned 12919 // addresses, unaligned VSX loads are only fast starting with the P8. 
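    // For example, a 32-byte memcpy whose source and destination are both
    // 16-byte aligned is expanded as two v4i32 load/store pairs rather than
    // four i64 (or eight i32) operations.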
12920 if (Subtarget.hasAltivec() && Size >= 16 && 12921 (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) || 12922 ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector()))) 12923 return MVT::v4i32; 12924 } 12925 12926 if (Subtarget.isPPC64()) { 12927 return MVT::i64; 12928 } 12929 12930 return MVT::i32; 12931 } 12932 12933 /// \brief Returns true if it is beneficial to convert a load of a constant 12934 /// to just the constant itself. 12935 bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 12936 Type *Ty) const { 12937 assert(Ty->isIntegerTy()); 12938 12939 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 12940 return !(BitSize == 0 || BitSize > 64); 12941 } 12942 12943 bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 12944 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 12945 return false; 12946 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 12947 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 12948 return NumBits1 == 64 && NumBits2 == 32; 12949 } 12950 12951 bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 12952 if (!VT1.isInteger() || !VT2.isInteger()) 12953 return false; 12954 unsigned NumBits1 = VT1.getSizeInBits(); 12955 unsigned NumBits2 = VT2.getSizeInBits(); 12956 return NumBits1 == 64 && NumBits2 == 32; 12957 } 12958 12959 bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 12960 // Generally speaking, zexts are not free, but they are free when they can be 12961 // folded with other operations. 12962 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) { 12963 EVT MemVT = LD->getMemoryVT(); 12964 if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 || 12965 (Subtarget.isPPC64() && MemVT == MVT::i32)) && 12966 (LD->getExtensionType() == ISD::NON_EXTLOAD || 12967 LD->getExtensionType() == ISD::ZEXTLOAD)) 12968 return true; 12969 } 12970 12971 // FIXME: Add other cases... 12972 // - 32-bit shifts with a zext to i64 12973 // - zext after ctlz, bswap, etc. 12974 // - zext after and by a constant mask 12975 12976 return TargetLowering::isZExtFree(Val, VT2); 12977 } 12978 12979 bool PPCTargetLowering::isFPExtFree(EVT VT) const { 12980 assert(VT.isFloatingPoint()); 12981 return true; 12982 } 12983 12984 bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const { 12985 return isInt<16>(Imm) || isUInt<16>(Imm); 12986 } 12987 12988 bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const { 12989 return isInt<16>(Imm) || isUInt<16>(Imm); 12990 } 12991 12992 bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 12993 unsigned, 12994 unsigned, 12995 bool *Fast) const { 12996 if (DisablePPCUnaligned) 12997 return false; 12998 12999 // PowerPC supports unaligned memory access for simple non-vector types. 13000 // Although accessing unaligned addresses is not as efficient as accessing 13001 // aligned addresses, it is generally more efficient than manual expansion, 13002 // and generally only traps for software emulation when crossing page 13003 // boundaries. 
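  // For example, an unaligned i32 load is emitted as a single lwz/lwzx here
  // rather than being expanded into byte loads and shifts; vector types are
  // only accepted below when VSX can handle them.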
13004 13005 if (!VT.isSimple()) 13006 return false; 13007 13008 if (VT.getSimpleVT().isVector()) { 13009 if (Subtarget.hasVSX()) { 13010 if (VT != MVT::v2f64 && VT != MVT::v2i64 && 13011 VT != MVT::v4f32 && VT != MVT::v4i32) 13012 return false; 13013 } else { 13014 return false; 13015 } 13016 } 13017 13018 if (VT == MVT::ppcf128) 13019 return false; 13020 13021 if (Fast) 13022 *Fast = true; 13023 13024 return true; 13025 } 13026 13027 bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 13028 VT = VT.getScalarType(); 13029 13030 if (!VT.isSimple()) 13031 return false; 13032 13033 switch (VT.getSimpleVT().SimpleTy) { 13034 case MVT::f32: 13035 case MVT::f64: 13036 return true; 13037 default: 13038 break; 13039 } 13040 13041 return false; 13042 } 13043 13044 const MCPhysReg * 13045 PPCTargetLowering::getScratchRegisters(CallingConv::ID) const { 13046 // LR is a callee-save register, but we must treat it as clobbered by any call 13047 // site. Hence we include LR in the scratch registers, which are in turn added 13048 // as implicit-defs for stackmaps and patchpoints. The same reasoning applies 13049 // to CTR, which is used by any indirect call. 13050 static const MCPhysReg ScratchRegs[] = { 13051 PPC::X12, PPC::LR8, PPC::CTR8, 0 13052 }; 13053 13054 return ScratchRegs; 13055 } 13056 13057 unsigned PPCTargetLowering::getExceptionPointerRegister( 13058 const Constant *PersonalityFn) const { 13059 return Subtarget.isPPC64() ? PPC::X3 : PPC::R3; 13060 } 13061 13062 unsigned PPCTargetLowering::getExceptionSelectorRegister( 13063 const Constant *PersonalityFn) const { 13064 return Subtarget.isPPC64() ? PPC::X4 : PPC::R4; 13065 } 13066 13067 bool 13068 PPCTargetLowering::shouldExpandBuildVectorWithShuffles( 13069 EVT VT , unsigned DefinedValues) const { 13070 if (VT == MVT::v2i64) 13071 return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves 13072 13073 if (Subtarget.hasVSX() || Subtarget.hasQPX()) 13074 return true; 13075 13076 return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); 13077 } 13078 13079 Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const { 13080 if (DisableILPPref || Subtarget.enableMachineScheduler()) 13081 return TargetLowering::getSchedulingPreference(N); 13082 13083 return Sched::ILP; 13084 } 13085 13086 // Create a fast isel object. 
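// (Fast-isel is the simple per-basic-block selector used mainly at -O0; any
// operation it cannot handle falls back to the normal SelectionDAG path.)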
13087 FastISel * 13088 PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo, 13089 const TargetLibraryInfo *LibInfo) const { 13090 return PPC::createFastISel(FuncInfo, LibInfo); 13091 } 13092 13093 void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { 13094 if (Subtarget.isDarwinABI()) return; 13095 if (!Subtarget.isPPC64()) return; 13096 13097 // Update IsSplitCSR in PPCFunctionInfo 13098 PPCFunctionInfo *PFI = Entry->getParent()->getInfo<PPCFunctionInfo>(); 13099 PFI->setIsSplitCSR(true); 13100 } 13101 13102 void PPCTargetLowering::insertCopiesSplitCSR( 13103 MachineBasicBlock *Entry, 13104 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 13105 const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); 13106 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 13107 if (!IStart) 13108 return; 13109 13110 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 13111 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 13112 MachineBasicBlock::iterator MBBI = Entry->begin(); 13113 for (const MCPhysReg *I = IStart; *I; ++I) { 13114 const TargetRegisterClass *RC = nullptr; 13115 if (PPC::G8RCRegClass.contains(*I)) 13116 RC = &PPC::G8RCRegClass; 13117 else if (PPC::F8RCRegClass.contains(*I)) 13118 RC = &PPC::F8RCRegClass; 13119 else if (PPC::CRRCRegClass.contains(*I)) 13120 RC = &PPC::CRRCRegClass; 13121 else if (PPC::VRRCRegClass.contains(*I)) 13122 RC = &PPC::VRRCRegClass; 13123 else 13124 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 13125 13126 unsigned NewVR = MRI->createVirtualRegister(RC); 13127 // Create copy from CSR to a virtual register. 13128 // FIXME: this currently does not emit CFI pseudo-instructions, it works 13129 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be 13130 // nounwind. If we want to generalize this later, we may need to emit 13131 // CFI pseudo-instructions. 13132 assert(Entry->getParent()->getFunction()->hasFnAttribute( 13133 Attribute::NoUnwind) && 13134 "Function should be nounwind in insertCopiesSplitCSR!"); 13135 Entry->addLiveIn(*I); 13136 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 13137 .addReg(*I); 13138 13139 // Insert the copy-back instructions right before the terminator 13140 for (auto *Exit : Exits) 13141 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 13142 TII->get(TargetOpcode::COPY), *I) 13143 .addReg(NewVR); 13144 } 13145 } 13146 13147 // Override to enable LOAD_STACK_GUARD lowering on Linux. 13148 bool PPCTargetLowering::useLoadStackGuardNode() const { 13149 if (!Subtarget.isTargetLinux()) 13150 return TargetLowering::useLoadStackGuardNode(); 13151 return true; 13152 } 13153 13154 // Override to disable global variable loading on Linux. 13155 void PPCTargetLowering::insertSSPDeclarations(Module &M) const { 13156 if (!Subtarget.isTargetLinux()) 13157 return TargetLowering::insertSSPDeclarations(M); 13158 } 13159 13160 bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 13161 if (!VT.isSimple() || !Subtarget.hasVSX()) 13162 return false; 13163 13164 switch(VT.getSimpleVT().SimpleTy) { 13165 default: 13166 // For FP types that are currently not supported by PPC backend, return 13167 // false. Examples: f16, f80. 
13168 return false; 13169 case MVT::f32: 13170 case MVT::f64: 13171 case MVT::ppcf128: 13172 return Imm.isPosZero(); 13173 } 13174 } 13175 13176 // For vector shift operation op, fold 13177 // (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y) 13178 static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N, 13179 SelectionDAG &DAG) { 13180 SDValue N0 = N->getOperand(0); 13181 SDValue N1 = N->getOperand(1); 13182 EVT VT = N0.getValueType(); 13183 unsigned OpSizeInBits = VT.getScalarSizeInBits(); 13184 unsigned Opcode = N->getOpcode(); 13185 unsigned TargetOpcode; 13186 13187 switch (Opcode) { 13188 default: 13189 llvm_unreachable("Unexpected shift operation"); 13190 case ISD::SHL: 13191 TargetOpcode = PPCISD::SHL; 13192 break; 13193 case ISD::SRL: 13194 TargetOpcode = PPCISD::SRL; 13195 break; 13196 case ISD::SRA: 13197 TargetOpcode = PPCISD::SRA; 13198 break; 13199 } 13200 13201 if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) && 13202 N1->getOpcode() == ISD::AND) 13203 if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) 13204 if (Mask->getZExtValue() == OpSizeInBits - 1) 13205 return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0)); 13206 13207 return SDValue(); 13208 } 13209 13210 SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const { 13211 if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) 13212 return Value; 13213 13214 return SDValue(); 13215 } 13216 13217 SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const { 13218 if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) 13219 return Value; 13220 13221 return SDValue(); 13222 } 13223 13224 SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const { 13225 if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) 13226 return Value; 13227 13228 return SDValue(); 13229 } 13230
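// For illustration (node numbering made up), with a legal v4i32 shift:
//   t2: v4i32 = and t1, splat<31>
//   t3: v4i32 = shl t0, t2
// is combined by the routines above into
//   t3: v4i32 = PPCISD::SHL t0, t1
// because the underlying vector shift instructions (e.g. vslw) already use
// only the low log2(element-size) bits of each shift amount.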