//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the PPCISelLowering class.
//
//===----------------------------------------------------------------------===//

#include "PPCISelLowering.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
#include "PPCCCState.h"
#include "PPCCallingConv.h"
#include "PPCFrameLowering.h"
#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
#include "PPCRegisterInfo.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <list>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "ppc-lowering"

static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
    cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);

static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
    cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);

static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
    cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);

static cl::opt<bool> DisableSCO("disable-ppc-sco",
    cl::desc("disable sibling call optimization on ppc"), cl::Hidden);

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");

// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;

PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
                                     const PPCSubtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  // Use _setjmp/_longjmp instead of setjmp/longjmp.
  setUseUnderscoreSetJmp(true);
  setUseUnderscoreLongJmp(true);

  // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
  // arguments are at least 4/8 bytes aligned.
  bool isPPC64 = Subtarget.isPPC64();
  setMinStackArgumentAlignment(isPPC64 ? 8:4);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
  if (!useSoftFloat()) {
    addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
    addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
  }

  // Match BITREVERSE to a customized fast code sequence in the td file.
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
  }

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PowerPC has pre-inc loads and stores.
  setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);

  if (Subtarget.useCRBits()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

    if (isPPC64 || Subtarget.hasFPCVT()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,
                         isPPC64 ? MVT::i64 : MVT::i32);
      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType(ISD::UINT_TO_FP, MVT::i1,
                        isPPC64 ? MVT::i64 : MVT::i32);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
    }

    // PowerPC does not support direct load/store of condition registers.
    setOperationAction(ISD::LOAD, MVT::i1, Custom);
    setOperationAction(ISD::STORE, MVT::i1, Custom);

    // FIXME: Remove this once the ANDI glue bug is fixed:
    if (ANDIGlueBug)
      setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);

    for (MVT VT : MVT::integer_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
      setTruncStoreAction(VT, MVT::i1, Expand);
    }

    addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
  }

  // This is used in the ppcf128->int sequence. Note it has different semantics
  // from FP_ROUND: that rounds to nearest, this rounds to zero.
  setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);

  // We do not currently implement these libm ops for PowerPC.
  setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
  setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand);
  setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
  setOperationAction(ISD::FRINT, MVT::ppcf128, Expand);
  setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
  setOperationAction(ISD::FREM, MVT::ppcf128, Expand);

  // PowerPC has no SREM/UREM instructions unless we are on P9.
  // On P9 we may use a hardware instruction to compute the remainder.
  // The instructions are not legalized directly because in the cases where the
  // result of both the remainder and the division is required, it is more
  // efficient to compute the remainder from the result of the division rather
  // than use the remainder instruction.
  if (Subtarget.isISA3_0()) {
    setOperationAction(ISD::SREM, MVT::i32, Custom);
    setOperationAction(ISD::UREM, MVT::i32, Custom);
    setOperationAction(ISD::SREM, MVT::i64, Custom);
    setOperationAction(ISD::UREM, MVT::i64, Custom);
  } else {
    setOperationAction(ISD::SREM, MVT::i32, Expand);
    setOperationAction(ISD::UREM, MVT::i32, Expand);
    setOperationAction(ISD::SREM, MVT::i64, Expand);
    setOperationAction(ISD::UREM, MVT::i64, Expand);
  }

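  // Illustrative note (an added example, not from the original source): when
  // both the quotient and the remainder of the same operands are needed, the
  // remainder can be recovered from the quotient as a - (a / b) * b; e.g. for
  // a = 7, b = 3 this gives 7 - 2 * 3 = 1. That is why SREM/UREM are custom
  // lowered on ISA 3.0 above rather than simply marked Legal.
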
  // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);

  // We don't support sin/cos/sqrt/fmod/pow.
  setOperationAction(ISD::FSIN , MVT::f64, Expand);
  setOperationAction(ISD::FCOS , MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  setOperationAction(ISD::FREM , MVT::f64, Expand);
  setOperationAction(ISD::FPOW , MVT::f64, Expand);
  setOperationAction(ISD::FMA , MVT::f64, Legal);
  setOperationAction(ISD::FSIN , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f32, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FREM , MVT::f32, Expand);
  setOperationAction(ISD::FPOW , MVT::f32, Expand);
  setOperationAction(ISD::FMA , MVT::f32, Legal);

  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // If we're enabling GP optimizations, use hardware square root.
  if (!Subtarget.hasFSQRT() &&
      !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&
        Subtarget.hasFRE()))
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);

  if (!Subtarget.hasFSQRT() &&
      !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&
        Subtarget.hasFRES()))
    setOperationAction(ISD::FSQRT, MVT::f32, Expand);

  if (Subtarget.hasFCPSGN()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
  } else {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  }

  if (Subtarget.hasFPRND()) {
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FROUND, MVT::f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FROUND, MVT::f32, Legal);
  }

  // PowerPC does not have BSWAP.
  // CTPOP and CTTZ were introduced in P8 and P9, respectively.
  setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
  setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
  if (Subtarget.isISA3_0()) {
    setOperationAction(ISD::CTTZ , MVT::i32 , Legal);
    setOperationAction(ISD::CTTZ , MVT::i64 , Legal);
  } else {
    setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
    setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
  }

  if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
    setOperationAction(ISD::CTPOP, MVT::i32 , Legal);
    setOperationAction(ISD::CTPOP, MVT::i64 , Legal);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i32 , Expand);
    setOperationAction(ISD::CTPOP, MVT::i64 , Expand);
  }

  // PowerPC does not have ROTR.
  setOperationAction(ISD::ROTR, MVT::i32 , Expand);
  setOperationAction(ISD::ROTR, MVT::i64 , Expand);

  if (!Subtarget.useCRBits()) {
    // PowerPC does not have Select.
    setOperationAction(ISD::SELECT, MVT::i32, Expand);
    setOperationAction(ISD::SELECT, MVT::i64, Expand);
    setOperationAction(ISD::SELECT, MVT::f32, Expand);
    setOperationAction(ISD::SELECT, MVT::f64, Expand);
  }

  // PowerPC wants to turn select_cc of FP into fsel when possible.
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);

  // PowerPC wants to optimize integer setcc a bit.
  if (!Subtarget.useCRBits())
    setOperationAction(ISD::SETCC, MVT::i32, Custom);

  // PowerPC does not have BRCOND, which requires SetCC.
  if (!Subtarget.useCRBits())
    setOperationAction(ISD::BRCOND, MVT::Other, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);

  // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);

  // PowerPC does not have [U|S]INT_TO_FP.
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);

  if (Subtarget.hasDirectMove() && isPPC64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Legal);
    setOperationAction(ISD::BITCAST, MVT::i32, Legal);
    setOperationAction(ISD::BITCAST, MVT::i64, Legal);
    setOperationAction(ISD::BITCAST, MVT::f64, Legal);
  } else {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    setOperationAction(ISD::BITCAST, MVT::f64, Expand);
  }

  // We cannot sextinreg(i1). Expand to shifts.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuation, user-level threading, etc. As a result, no other
  // SjLj exception interfaces are implemented, and please don't build your
  // own exception handling based on them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

  // We want to legalize GlobalAddress and ConstantPool nodes into the
  // appropriate instructions to materialize the address.
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  // TRAP is legal.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // TRAMPOLINE is custom lowered.
  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART , MVT::Other, Custom);

  if (Subtarget.isSVR4ABI()) {
    if (isPPC64) {
      // VAARG always uses double-word chunks, so promote anything smaller.
      setOperationAction(ISD::VAARG, MVT::i1, Promote);
      AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::i8, Promote);
      AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::i16, Promote);
      AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::i32, Promote);
      AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::Other, Expand);
    } else {
      // VAARG is custom lowered with the 32-bit SVR4 ABI.
      setOperationAction(ISD::VAARG, MVT::Other, Custom);
      setOperationAction(ISD::VAARG, MVT::i64, Custom);
    }
  } else
    setOperationAction(ISD::VAARG, MVT::Other, Expand);

  if (Subtarget.isSVR4ABI() && !isPPC64)
    // VACOPY is custom lowered with the 32-bit SVR4 ABI.
    setOperationAction(ISD::VACOPY , MVT::Other, Custom);
  else
    setOperationAction(ISD::VACOPY , MVT::Other, Expand);

  // Use the default implementation.
  setOperationAction(ISD::VAEND , MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom);
  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);
  setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
  setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // To handle counter-based loop conditions.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);

  // Comparisons that require checking two conditions.
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);

  if (Subtarget.has64BitSupport()) {
    // They also have instructions for converting between i64 and fp.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
    // This is just the low 32 bits of a (signed) fp->i64 conversion.
    // We cannot do this with Promote because i64 is not a legal type.
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    if (Subtarget.hasLFIWAX() || Subtarget.isPPC64())
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  } else {
    // PowerPC does not have FP_TO_UINT on 32-bit implementations.
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
  }

  // With the instructions enabled under FPCVT, we can do everything.
  if (Subtarget.hasFPCVT()) {
    if (Subtarget.has64BitSupport()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  if (Subtarget.use64BitRegs()) {
    // 64-bit PowerPC implementations can support i64 types directly.
    addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
    // BUILD_PAIR can't be handled natively, and should be expanded to shl/or.
    setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
    // 64-bit PowerPC wants to expand i128 shifts itself.
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  } else {
    // 32-bit PowerPC wants to expand i64 shifts itself.
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  }

  if (Subtarget.hasAltivec()) {
    // First set operation action for all vector types to expand. Then we
    // will selectively turn on ones that can be effectively codegen'd.
    for (MVT VT : MVT::vector_valuetypes()) {
      // add/sub are legal for all supported vector VT's.
      setOperationAction(ISD::ADD, VT, Legal);
      setOperationAction(ISD::SUB, VT, Legal);

      // Vector instructions introduced in P8.
      if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
        setOperationAction(ISD::CTPOP, VT, Legal);
        setOperationAction(ISD::CTLZ, VT, Legal);
      } else {
        setOperationAction(ISD::CTPOP, VT, Expand);
        setOperationAction(ISD::CTLZ, VT, Expand);
      }

      // Vector instructions introduced in P9.
      if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))
        setOperationAction(ISD::CTTZ, VT, Legal);
      else
        setOperationAction(ISD::CTTZ, VT, Expand);

      // We promote all shuffles to v16i8.
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
      AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);

      // We promote all non-typed operations to v4i32.
      setOperationAction(ISD::AND , VT, Promote);
      AddPromotedToType (ISD::AND , VT, MVT::v4i32);
      setOperationAction(ISD::OR , VT, Promote);
      AddPromotedToType (ISD::OR , VT, MVT::v4i32);
      setOperationAction(ISD::XOR , VT, Promote);
      AddPromotedToType (ISD::XOR , VT, MVT::v4i32);
      setOperationAction(ISD::LOAD , VT, Promote);
      AddPromotedToType (ISD::LOAD , VT, MVT::v4i32);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
      setOperationAction(ISD::SELECT_CC, VT, Promote);
      AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32);
      setOperationAction(ISD::STORE, VT, Promote);
      AddPromotedToType (ISD::STORE, VT, MVT::v4i32);

      // No other operations are legal.
      setOperationAction(ISD::MUL , VT, Expand);
      setOperationAction(ISD::SDIV, VT, Expand);
      setOperationAction(ISD::SREM, VT, Expand);
      setOperationAction(ISD::UDIV, VT, Expand);
      setOperationAction(ISD::UREM, VT, Expand);
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FNEG, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FABS, VT, Expand);
      setOperationAction(ISD::FFLOOR, VT, Expand);
      setOperationAction(ISD::FCEIL, VT, Expand);
      setOperationAction(ISD::FTRUNC, VT, Expand);
      setOperationAction(ISD::FRINT, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
      setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::UDIVREM, VT, Expand);
      setOperationAction(ISD::SDIVREM, VT, Expand);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::BSWAP, VT, Expand);
      setOperationAction(ISD::VSELECT, VT, Expand);
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
      setOperationAction(ISD::ROTL, VT, Expand);
      setOperationAction(ISD::ROTR, VT, Expand);

      for (MVT InnerVT : MVT::vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
    // with merges, splats, etc.
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);

    setOperationAction(ISD::AND , MVT::v4i32, Legal);
    setOperationAction(ISD::OR , MVT::v4i32, Legal);
    setOperationAction(ISD::XOR , MVT::v4i32, Legal);
    setOperationAction(ISD::LOAD , MVT::v4i32, Legal);
    setOperationAction(ISD::SELECT, MVT::v4i32,
                       Subtarget.useCRBits() ? Legal : Expand);
    setOperationAction(ISD::STORE , MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);

    addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);

    setOperationAction(ISD::MUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FMA, MVT::v4f32, Legal);

    if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) {
      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    }

    if (Subtarget.hasP8Altivec())
      setOperationAction(ISD::MUL, MVT::v4i32, Legal);
    else
      setOperationAction(ISD::MUL, MVT::v4i32, Custom);

    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v16i8, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);

    // Altivec does not contain unordered floating-point compare instructions.
    setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);

    if (Subtarget.hasVSX()) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
      if (Subtarget.hasP8Vector()) {
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
      }
      if (Subtarget.hasDirectMove() && isPPC64) {
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
      }
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);

      setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
      setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
      setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
      setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
      setOperationAction(ISD::FROUND, MVT::v2f64, Legal);

      setOperationAction(ISD::FROUND, MVT::v4f32, Legal);

      setOperationAction(ISD::MUL, MVT::v2f64, Legal);
      setOperationAction(ISD::FMA, MVT::v2f64, Legal);

      setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
      setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);

      setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
      setOperationAction(ISD::VSELECT, MVT::v8i16, Legal);
      setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
      setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
      setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);

      // Share the Altivec comparison restrictions.
      setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
      setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
      setCondCodeAction(ISD::SETO, MVT::v2f64, Expand);
      setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);

      setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
      setOperationAction(ISD::STORE, MVT::v2f64, Legal);

      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal);

      if (Subtarget.hasP8Vector())
        addRegisterClass(MVT::f32, &PPC::VSSRCRegClass);

      addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);

      addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass);
      addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
      addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);

      if (Subtarget.hasP8Altivec()) {
        setOperationAction(ISD::SHL, MVT::v2i64, Legal);
        setOperationAction(ISD::SRA, MVT::v2i64, Legal);
        setOperationAction(ISD::SRL, MVT::v2i64, Legal);

        // 128-bit shifts can be accomplished via 3 instructions for SHL and
        // SRL, but not for SRA because of the instructions available:
        // VS{RL} and VS{RL}O. However, due to direct move costs, it's not
        // worth doing.
        setOperationAction(ISD::SHL, MVT::v1i128, Expand);
        setOperationAction(ISD::SRL, MVT::v1i128, Expand);
        setOperationAction(ISD::SRA, MVT::v1i128, Expand);

        setOperationAction(ISD::SETCC, MVT::v2i64, Legal);
      } else {
        setOperationAction(ISD::SHL, MVT::v2i64, Expand);
        setOperationAction(ISD::SRA, MVT::v2i64, Expand);
        setOperationAction(ISD::SRL, MVT::v2i64, Expand);

        setOperationAction(ISD::SETCC, MVT::v2i64, Custom);

        // VSX v2i64 only supports non-arithmetic operations.
        setOperationAction(ISD::ADD, MVT::v2i64, Expand);
        setOperationAction(ISD::SUB, MVT::v2i64, Expand);
      }

      setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
      AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);
      setOperationAction(ISD::STORE, MVT::v2i64, Promote);
      AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64);

      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);

      setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);

      // Vector operation legalization checks the result type of
      // SIGN_EXTEND_INREG, overall legalization checks the inner type.
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);

      setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
      setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
      setOperationAction(ISD::FABS, MVT::v4f32, Legal);
      setOperationAction(ISD::FABS, MVT::v2f64, Legal);

      if (Subtarget.hasDirectMove())
        setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);

      addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
    }

    if (Subtarget.hasP8Altivec()) {
      addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);
      addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass);
    }

    if (Subtarget.hasP9Vector()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

      // 128-bit shifts can be accomplished via 3 instructions for SHL and
      // SRL, but not for SRA because of the instructions available:
      // VS{RL} and VS{RL}O.
      setOperationAction(ISD::SHL, MVT::v1i128, Legal);
      setOperationAction(ISD::SRL, MVT::v1i128, Legal);
      setOperationAction(ISD::SRA, MVT::v1i128, Expand);
    }
  }

  if (Subtarget.hasQPX()) {
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FREM, MVT::v4f64, Expand);

    setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal);
    setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand);

    setOperationAction(ISD::LOAD , MVT::v4f64, Custom);
    setOperationAction(ISD::STORE , MVT::v4f64, Custom);

    setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
    setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom);

    if (!Subtarget.useCRBits())
      setOperationAction(ISD::SELECT, MVT::v4f64, Expand);
    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal);
    setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand);
    setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);

    setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal);
    setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand);

    setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal);
    setOperationAction(ISD::FP_ROUND_INREG , MVT::v4f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);

    setOperationAction(ISD::FNEG , MVT::v4f64, Legal);
    setOperationAction(ISD::FABS , MVT::v4f64, Legal);
    setOperationAction(ISD::FSIN , MVT::v4f64, Expand);
    setOperationAction(ISD::FCOS , MVT::v4f64, Expand);
    setOperationAction(ISD::FPOW , MVT::v4f64, Expand);
    setOperationAction(ISD::FLOG , MVT::v4f64, Expand);
    setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand);
    setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand);
    setOperationAction(ISD::FEXP , MVT::v4f64, Expand);
    setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand);

    setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal);

    setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal);

    addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FREM, MVT::v4f32, Expand);

    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
    setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand);

    setOperationAction(ISD::LOAD , MVT::v4f32, Custom);
    setOperationAction(ISD::STORE , MVT::v4f32, Custom);

    if (!Subtarget.useCRBits())
      setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal);
    setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand);
    setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);

    setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal);
    setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand);

    setOperationAction(ISD::FNEG , MVT::v4f32, Legal);
    setOperationAction(ISD::FABS , MVT::v4f32, Legal);
    setOperationAction(ISD::FSIN , MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS , MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW , MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG , MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP , MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand);

    setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);

    setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal);

    addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass);

    setOperationAction(ISD::AND , MVT::v4i1, Legal);
    setOperationAction(ISD::OR , MVT::v4i1, Legal);
    setOperationAction(ISD::XOR , MVT::v4i1, Legal);

    if (!Subtarget.useCRBits())
      setOperationAction(ISD::SELECT, MVT::v4i1, Expand);
    setOperationAction(ISD::VSELECT, MVT::v4i1, Legal);

    setOperationAction(ISD::LOAD , MVT::v4i1, Custom);
    setOperationAction(ISD::STORE , MVT::v4i1, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand);
    setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);

    addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass);

    setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
    setOperationAction(ISD::FROUND, MVT::v4f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);

    // These need to set FE_INEXACT, and so cannot be vectorized here.
    setOperationAction(ISD::FRINT, MVT::v4f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);

    if (TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
      setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);

      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    } else {
      setOperationAction(ISD::FDIV, MVT::v4f64, Expand);
      setOperationAction(ISD::FSQRT, MVT::v4f64, Expand);

      setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
      setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    }
  }

  if (Subtarget.has64BitSupport())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);

  if (!isPPC64) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
    setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
  }

  setBooleanContents(ZeroOrOneBooleanContent);

  if (Subtarget.hasAltivec()) {
    // Altivec instructions set fields to all zeros or all ones.
    setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  }

  if (!isPPC64) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, nullptr);
    setLibcallName(RTLIB::SRL_I128, nullptr);
    setLibcallName(RTLIB::SRA_I128, nullptr);
  }

  setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  if (Subtarget.hasFPCVT())
    setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::BR_CC);
  if (Subtarget.useCRBits())
    setTargetDAGCombine(ISD::BRCOND);
  setTargetDAGCombine(ISD::BSWAP);
  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
  setTargetDAGCombine(ISD::INTRINSIC_VOID);

  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);

  if (Subtarget.useCRBits()) {
    setTargetDAGCombine(ISD::TRUNCATE);
    setTargetDAGCombine(ISD::SETCC);
    setTargetDAGCombine(ISD::SELECT_CC);
  }

  // Use reciprocal estimates.
  if (TM.Options.UnsafeFPMath) {
    setTargetDAGCombine(ISD::FDIV);
    setTargetDAGCombine(ISD::FSQRT);
  }

  // Darwin long double math library functions have $LDBL128 appended.
  if (Subtarget.isDarwin()) {
    setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
    setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
    setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
    setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
    setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
    setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
    setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
    setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
    setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
    setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
  }

  // With 32 condition bits, we don't need to sink (and duplicate) compares
  // aggressively in CodeGenPrep.
  if (Subtarget.useCRBits()) {
    setHasMultipleConditionRegisters();
    setJumpIsExpensive();
  }

  setMinFunctionAlignment(2);
  if (Subtarget.isDarwin())
    setPrefFunctionAlignment(4);

  switch (Subtarget.getDarwinDirective()) {
  default: break;
  case PPC::DIR_970:
  case PPC::DIR_A2:
  case PPC::DIR_E500mc:
  case PPC::DIR_E5500:
  case PPC::DIR_PWR4:
  case PPC::DIR_PWR5:
  case PPC::DIR_PWR5X:
  case PPC::DIR_PWR6:
  case PPC::DIR_PWR6X:
  case PPC::DIR_PWR7:
  case PPC::DIR_PWR8:
  case PPC::DIR_PWR9:
    setPrefFunctionAlignment(4);
    setPrefLoopAlignment(4);
    break;
  }

  if (Subtarget.enableMachineScheduler())
    setSchedulingPreference(Sched::Source);
  else
    setSchedulingPreference(Sched::Hybrid);

  computeRegisterProperties(STI.getRegisterInfo());

  // The Freescale cores do better with aggressive inlining of memcpy and
  // friends. GCC uses the same threshold of 128 bytes (= 32 word stores).
  if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc ||
      Subtarget.getDarwinDirective() == PPC::DIR_E5500) {
    MaxStoresPerMemset = 32;
    MaxStoresPerMemsetOptSize = 16;
    MaxStoresPerMemcpy = 32;
    MaxStoresPerMemcpyOptSize = 8;
    MaxStoresPerMemmove = 32;
    MaxStoresPerMemmoveOptSize = 8;
  } else if (Subtarget.getDarwinDirective() == PPC::DIR_A2) {
    // The A2 also benefits from (very) aggressive inlining of memcpy and
    // friends. The overhead of the function call, even when warm, can be
    // over one hundred cycles.
    MaxStoresPerMemset = 128;
    MaxStoresPerMemcpy = 128;
    MaxStoresPerMemmove = 128;
    MaxLoadsPerMemcmp = 128;
  } else {
    MaxLoadsPerMemcmp = 8;
    MaxLoadsPerMemcmpOptSize = 4;
  }
}

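// Illustrative example (an added note, not from the original source): with
// Altivec available, a by-value aggregate such as
//   struct S { int X; __vector int V; };
// contains a 128-bit vector member, so getMaxByValAlign() below raises the
// reported ByVal alignment to 16 bytes, while scalar-only aggregates keep the
// default of 8 bytes on PPC64 and 4 bytes on PPC32 (and Darwin always uses 4).
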
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
                             unsigned MaxMaxAlign) {
  if (MaxAlign == MaxMaxAlign)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
      MaxAlign = 32;
    else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      unsigned EltAlign = 0;
      getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == MaxMaxAlign)
        break;
    }
  }
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.
unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  // Darwin passes everything on a 4-byte boundary.
  if (Subtarget.isDarwin())
    return 4;

  // 16-byte and wider vectors are passed on a 16-byte boundary.
  // The rest are aligned to 8 bytes on PPC64 and 4 bytes on PPC32.
  unsigned Align = Subtarget.isPPC64() ? 8 : 4;
  if (Subtarget.hasAltivec() || Subtarget.hasQPX())
    getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16);
  return Align;
}

bool PPCTargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}

const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((PPCISD::NodeType)Opcode) {
  case PPCISD::FIRST_NUMBER: break;
  case PPCISD::FSEL: return "PPCISD::FSEL";
  case PPCISD::FCFID: return "PPCISD::FCFID";
  case PPCISD::FCFIDU: return "PPCISD::FCFIDU";
  case PPCISD::FCFIDS: return "PPCISD::FCFIDS";
  case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS";
  case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ";
  case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
  case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ";
  case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ";
  case PPCISD::FRE: return "PPCISD::FRE";
  case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
  case PPCISD::STFIWX: return "PPCISD::STFIWX";
  case PPCISD::VMADDFP: return "PPCISD::VMADDFP";
  case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
  case PPCISD::VPERM: return "PPCISD::VPERM";
  case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
  case PPCISD::XXINSERT: return "PPCISD::XXINSERT";
  case PPCISD::XXREVERSE: return "PPCISD::XXREVERSE";
  case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
  case PPCISD::VECSHL: return "PPCISD::VECSHL";
  case PPCISD::CMPB: return "PPCISD::CMPB";
  case PPCISD::Hi: return "PPCISD::Hi";
  case PPCISD::Lo: return "PPCISD::Lo";
  case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
  case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
  case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
  case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
  case PPCISD::SRL: return "PPCISD::SRL";
  case PPCISD::SRA: return "PPCISD::SRA";
  case PPCISD::SHL: return "PPCISD::SHL";
  case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE";
  case PPCISD::CALL: return "PPCISD::CALL";
  case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
  case PPCISD::MTCTR: return "PPCISD::MTCTR";
  case PPCISD::BCTRL: return "PPCISD::BCTRL";
  case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC";
  case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG";
  case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE";
  case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";
  case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
  case PPCISD::MFOCRF: return "PPCISD::MFOCRF";
  case PPCISD::MFVSR: return "PPCISD::MFVSR";
  case PPCISD::MTVSRA: return "PPCISD::MTVSRA";
  case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ";
  case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP";
  case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP";
  case PPCISD::ANDIo_1_EQ_BIT: return "PPCISD::ANDIo_1_EQ_BIT";
  case PPCISD::ANDIo_1_GT_BIT: return "PPCISD::ANDIo_1_GT_BIT";
  case PPCISD::VCMP: return "PPCISD::VCMP";
  case PPCISD::VCMPo: return "PPCISD::VCMPo";
  case PPCISD::LBRX: return "PPCISD::LBRX";
  case PPCISD::STBRX: return "PPCISD::STBRX";
  case PPCISD::LFIWAX: return "PPCISD::LFIWAX";
  case PPCISD::LFIWZX: return "PPCISD::LFIWZX";
  case PPCISD::LXSIZX: return "PPCISD::LXSIZX";
  case PPCISD::STXSIX: return "PPCISD::STXSIX";
  case PPCISD::VEXTS: return "PPCISD::VEXTS";
  case PPCISD::SExtVElems: return "PPCISD::SExtVElems";
  case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
  case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
  case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
  case PPCISD::BDNZ: return "PPCISD::BDNZ";
  case PPCISD::BDZ: return "PPCISD::BDZ";
  case PPCISD::MFFS: return "PPCISD::MFFS";
  case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
  case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
  case PPCISD::CR6SET: return "PPCISD::CR6SET";
  case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET";
  case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT";
  case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT";
  case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
  case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L";
  case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS";
  case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA";
  case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";
  case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR";
  case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR";
  case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA";
  case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L";
  case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR";
  case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR";
  case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
  case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";
  case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";
  case PPCISD::SC: return "PPCISD::SC";
  case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB";
  case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE";
  case PPCISD::RFEBB: return "PPCISD::RFEBB";
  case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";
  case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";
  case PPCISD::QVFPERM: return "PPCISD::QVFPERM";
  case PPCISD::QVGPCI: return "PPCISD::QVGPCI";
  case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI";
  case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI";
  case PPCISD::QBFLT: return "PPCISD::QBFLT";
  case PPCISD::QVLFSb: return "PPCISD::QVLFSb";
  }
  return nullptr;
}

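// Illustrative note (an added example, not from the original source): with CR
// bits enabled, a scalar comparison such as (setcc i32 %a, %b, seteq) gets an
// i1 result type from getSetCCResultType() below, while a v4i32 comparison
// yields v4i32 (or v4i1 when QPX is available).
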
EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
                                          EVT VT) const {
  if (!VT.isVector())
    return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;

  if (Subtarget.hasQPX())
    return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());

  return VT.changeVectorElementTypeToInteger();
}

bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
  return true;
}

//===----------------------------------------------------------------------===//
// Node matching predicates, for use by the tblgen matching code.
//===----------------------------------------------------------------------===//

/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
static bool isFloatingPointZero(SDValue Op) {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
    return CFP->getValueAPF().isZero();
  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
    // Maybe this has already been legalized into the constant pool?
    if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
      if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
        return CFP->getValueAPF().isZero();
  }
  return false;
}

/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode. Return
/// true if Op is undef or if it matches the specified value.
static bool isConstantOrUndef(int Op, int Val) {
  return Op < 0 || Op == Val;
}

/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUHUM instruction.
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                               SelectionDAG &DAG) {
  bool IsLE = DAG.getDataLayout().isLittleEndian();
  if (ShuffleKind == 0) {
    if (IsLE)
      return false;
    for (unsigned i = 0; i != 16; ++i)
      if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
        return false;
  } else if (ShuffleKind == 2) {
    if (!IsLE)
      return false;
    for (unsigned i = 0; i != 16; ++i)
      if (!isConstantOrUndef(N->getMaskElt(i), i*2))
        return false;
  } else if (ShuffleKind == 1) {
    unsigned j = IsLE ? 0 : 1;
    for (unsigned i = 0; i != 8; ++i)
      if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) ||
          !isConstantOrUndef(N->getMaskElt(i+8), i*2+j))
        return false;
  }
  return true;
}

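// Illustrative example (an added note, not from the original source): for
// ShuffleKind == 0 (big-endian, two different inputs), the check above accepts
// the byte mask <1,3,5,...,31>, i.e. the odd-numbered bytes of the two
// concatenated inputs, which is exactly what vpkuhum produces.
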
/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUWUM instruction.
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                               SelectionDAG &DAG) {
  bool IsLE = DAG.getDataLayout().isLittleEndian();
  if (ShuffleKind == 0) {
    if (IsLE)
      return false;
    for (unsigned i = 0; i != 16; i += 2)
      if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) ||
          !isConstantOrUndef(N->getMaskElt(i+1), i*2+3))
        return false;
  } else if (ShuffleKind == 2) {
    if (!IsLE)
      return false;
    for (unsigned i = 0; i != 16; i += 2)
      if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||
          !isConstantOrUndef(N->getMaskElt(i+1), i*2+1))
        return false;
  } else if (ShuffleKind == 1) {
    unsigned j = IsLE ? 0 : 2;
    for (unsigned i = 0; i != 8; i += 2)
      if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
          !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
          !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
          !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1))
        return false;
  }
  return true;
}

/// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUDUM instruction, AND the VPKUDUM instruction exists for the
/// current subtarget.
///
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                               SelectionDAG &DAG) {
  const PPCSubtarget& Subtarget =
      static_cast<const PPCSubtarget&>(DAG.getSubtarget());
  if (!Subtarget.hasP8Vector())
    return false;

  bool IsLE = DAG.getDataLayout().isLittleEndian();
  if (ShuffleKind == 0) {
    if (IsLE)
      return false;
    for (unsigned i = 0; i != 16; i += 4)
      if (!isConstantOrUndef(N->getMaskElt(i ), i*2+4) ||
          !isConstantOrUndef(N->getMaskElt(i+1), i*2+5) ||
          !isConstantOrUndef(N->getMaskElt(i+2), i*2+6) ||
          !isConstantOrUndef(N->getMaskElt(i+3), i*2+7))
        return false;
  } else if (ShuffleKind == 2) {
    if (!IsLE)
      return false;
    for (unsigned i = 0; i != 16; i += 4)
      if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||
          !isConstantOrUndef(N->getMaskElt(i+1), i*2+1) ||
          !isConstantOrUndef(N->getMaskElt(i+2), i*2+2) ||
          !isConstantOrUndef(N->getMaskElt(i+3), i*2+3))
        return false;
  } else if (ShuffleKind == 1) {
    unsigned j = IsLE ? 0 : 4;
    for (unsigned i = 0; i != 8; i += 4)
      if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
          !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
          !isConstantOrUndef(N->getMaskElt(i+2), i*2+j+2) ||
          !isConstantOrUndef(N->getMaskElt(i+3), i*2+j+3) ||
          !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
          !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1) ||
          !isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) ||
          !isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3))
        return false;
  }
  return true;
}

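// Illustrative example (an added note, not from the original source): for
// ShuffleKind == 0 (big-endian, two different inputs), isVPKUWUMShuffleMask
// above accepts the byte mask <2,3, 6,7, 10,11, ..., 30,31>, i.e. the
// low-order halfword of each 32-bit word of the two concatenated inputs.
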
///
static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
                     unsigned LHSStart, unsigned RHSStart) {
  if (N->getValueType(0) != MVT::v16i8)
    return false;
  assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
         "Unsupported merge size!");

  for (unsigned i = 0; i != 8/UnitSize; ++i)     // Step over units
    for (unsigned j = 0; j != UnitSize; ++j) {   // Step over bytes within unit
      if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
                             LHSStart+j+i*UnitSize) ||
          !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
                             RHSStart+j+i*UnitSize))
        return false;
    }
  return true;
}

/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
/// The ShuffleKind distinguishes between big-endian merges with two
/// different inputs (0), either-endian merges with two identical inputs (1),
/// and little-endian merges with two different inputs (2). For the latter,
/// the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                             unsigned ShuffleKind, SelectionDAG &DAG) {
  if (DAG.getDataLayout().isLittleEndian()) {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 0, 0);
    else if (ShuffleKind == 2) // swapped
      return isVMerge(N, UnitSize, 0, 16);
    else
      return false;
  } else {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 8, 8);
    else if (ShuffleKind == 0) // normal
      return isVMerge(N, UnitSize, 8, 24);
    else
      return false;
  }
}

/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
/// The ShuffleKind distinguishes between big-endian merges with two
/// different inputs (0), either-endian merges with two identical inputs (1),
/// and little-endian merges with two different inputs (2). For the latter,
/// the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                             unsigned ShuffleKind, SelectionDAG &DAG) {
  if (DAG.getDataLayout().isLittleEndian()) {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 8, 8);
    else if (ShuffleKind == 2) // swapped
      return isVMerge(N, UnitSize, 8, 24);
    else
      return false;
  } else {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 0, 0);
    else if (ShuffleKind == 0) // normal
      return isVMerge(N, UnitSize, 0, 16);
    else
      return false;
  }
}

/**
 * \brief Common function used to match vmrgew and vmrgow shuffles
 *
 * The indexOffset determines whether to look for even or odd words in
 * the shuffle mask. This is based on the endianness of the target
 * machine.
 *   - Little Endian:
 *     - Use offset of 0 to check for odd elements
 *     - Use offset of 4 to check for even elements
 *   - Big Endian:
 *     - Use offset of 0 to check for even elements
 *     - Use offset of 4 to check for odd elements
 * A detailed description of the vector element ordering for little endian and
 * big endian can be found at
 * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html
 * Targeting your applications - what little endian and big endian IBM XL C/C++
 * compiler differences mean to you
 *
 * The mask to the shuffle vector instruction specifies the indices of the
 * elements from the two input vectors to place in the result. The elements are
 * numbered in array-access order, starting with the first vector. These vectors
 * are always of type v16i8, thus each vector will contain 16 elements of 8 bits
 * each. More info on the shuffle vector can be found in the
 * http://llvm.org/docs/LangRef.html#shufflevector-instruction
 * Language Reference.
 *
 * The RHSStartValue indicates whether the same input vectors are used (unary)
 * or two different input vectors are used, based on the following:
 *   - If the instruction uses the same vector for both inputs, the range of the
 *     indices will be 0 to 15. In this case, the RHSStart value passed should
 *     be 0.
 *   - If the instruction has two different vectors then the range of the
 *     indices will be 0 to 31. In this case, the RHSStart value passed should
 *     be 16 (indices 0-15 specify elements in the first vector while indices 16
 *     to 31 specify elements in the second vector).
 *
 * \param[in] N The shuffle vector SD Node to analyze
 * \param[in] IndexOffset Specifies whether to look for even or odd elements
 * \param[in] RHSStartValue Specifies the starting index for the righthand input
 *            vector to the shuffle_vector instruction
 * \return true iff this shuffle vector represents an even or odd word merge
 */
static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
                     unsigned RHSStartValue) {
  if (N->getValueType(0) != MVT::v16i8)
    return false;

  for (unsigned i = 0; i < 2; ++i)
    for (unsigned j = 0; j < 4; ++j)
      if (!isConstantOrUndef(N->getMaskElt(i*4+j),
                             i*RHSStartValue+j+IndexOffset) ||
          !isConstantOrUndef(N->getMaskElt(i*4+j+8),
                             i*RHSStartValue+j+IndexOffset+8))
        return false;
  return true;
}

/**
 * \brief Determine if the specified shuffle mask is suitable for the vmrgew or
 * vmrgow instructions.
 *
 * \param[in] N The shuffle vector SD Node to analyze
 * \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
 * \param[in] ShuffleKind Identify the type of merge:
 *   - 0 = big-endian merge with two different inputs;
 *   - 1 = either-endian merge with two identical inputs;
 *   - 2 = little-endian merge with two different inputs (inputs are swapped for
 *     little-endian merges).
 * \param[in] DAG The current SelectionDAG
 * \return true iff this shuffle mask is suitable for the vmrgew or vmrgow
 *         instruction
 */
bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
                              unsigned ShuffleKind, SelectionDAG &DAG) {
  if (DAG.getDataLayout().isLittleEndian()) {
    unsigned indexOffset = CheckEven ?
4 : 0; 1518 if (ShuffleKind == 1) // Unary 1519 return isVMerge(N, indexOffset, 0); 1520 else if (ShuffleKind == 2) // swapped 1521 return isVMerge(N, indexOffset, 16); 1522 else 1523 return false; 1524 } 1525 else { 1526 unsigned indexOffset = CheckEven ? 0 : 4; 1527 if (ShuffleKind == 1) // Unary 1528 return isVMerge(N, indexOffset, 0); 1529 else if (ShuffleKind == 0) // Normal 1530 return isVMerge(N, indexOffset, 16); 1531 else 1532 return false; 1533 } 1534 return false; 1535 } 1536 1537 /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift 1538 /// amount, otherwise return -1. 1539 /// The ShuffleKind distinguishes between big-endian operations with two 1540 /// different inputs (0), either-endian operations with two identical inputs 1541 /// (1), and little-endian operations with two different inputs (2). For the 1542 /// latter, the input operands are swapped (see PPCInstrAltivec.td). 1543 int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, 1544 SelectionDAG &DAG) { 1545 if (N->getValueType(0) != MVT::v16i8) 1546 return -1; 1547 1548 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 1549 1550 // Find the first non-undef value in the shuffle mask. 1551 unsigned i; 1552 for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i) 1553 /*search*/; 1554 1555 if (i == 16) return -1; // all undef. 1556 1557 // Otherwise, check to see if the rest of the elements are consecutively 1558 // numbered from this value. 1559 unsigned ShiftAmt = SVOp->getMaskElt(i); 1560 if (ShiftAmt < i) return -1; 1561 1562 ShiftAmt -= i; 1563 bool isLE = DAG.getDataLayout().isLittleEndian(); 1564 1565 if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) { 1566 // Check the rest of the elements to see if they are consecutive. 1567 for (++i; i != 16; ++i) 1568 if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) 1569 return -1; 1570 } else if (ShuffleKind == 1) { 1571 // Check the rest of the elements to see if they are consecutive. 1572 for (++i; i != 16; ++i) 1573 if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) 1574 return -1; 1575 } else 1576 return -1; 1577 1578 if (isLE) 1579 ShiftAmt = 16 - ShiftAmt; 1580 1581 return ShiftAmt; 1582 } 1583 1584 /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand 1585 /// specifies a splat of a single element that is suitable for input to 1586 /// VSPLTB/VSPLTH/VSPLTW. 1587 bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { 1588 assert(N->getValueType(0) == MVT::v16i8 && 1589 (EltSize == 1 || EltSize == 2 || EltSize == 4)); 1590 1591 // The consecutive indices need to specify an element, not part of two 1592 // different elements. So abandon ship early if this isn't the case. 1593 if (N->getMaskElt(0) % EltSize != 0) 1594 return false; 1595 1596 // This is a splat operation if each element of the permute is the same, and 1597 // if the value doesn't reference the second vector. 1598 unsigned ElementBase = N->getMaskElt(0); 1599 1600 // FIXME: Handle UNDEF elements too! 1601 if (ElementBase >= 16) 1602 return false; 1603 1604 // Check that the indices are consecutive, in the case of a multi-byte element 1605 // splatted with a v16i8 mask. 
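  // For example (illustrative only): a splat of word element 1 with
  // EltSize == 4 uses the byte mask <4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7>.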
1606 for (unsigned i = 1; i != EltSize; ++i) 1607 if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase)) 1608 return false; 1609 1610 for (unsigned i = EltSize, e = 16; i != e; i += EltSize) { 1611 if (N->getMaskElt(i) < 0) continue; 1612 for (unsigned j = 0; j != EltSize; ++j) 1613 if (N->getMaskElt(i+j) != N->getMaskElt(j)) 1614 return false; 1615 } 1616 return true; 1617 } 1618 1619 /// Check that the mask is shuffling N byte elements. Within each N byte 1620 /// element of the mask, the indices could be either in increasing or 1621 /// decreasing order as long as they are consecutive. 1622 /// \param[in] N the shuffle vector SD Node to analyze 1623 /// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/ 1624 /// Word/DoubleWord/QuadWord). 1625 /// \param[in] StepLen the delta indices number among the N byte element, if 1626 /// the mask is in increasing/decreasing order then it is 1/-1. 1627 /// \return true iff the mask is shuffling N byte elements. 1628 static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width, 1629 int StepLen) { 1630 assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) && 1631 "Unexpected element width."); 1632 assert((StepLen == 1 || StepLen == -1) && "Unexpected element width."); 1633 1634 unsigned NumOfElem = 16 / Width; 1635 unsigned MaskVal[16]; // Width is never greater than 16 1636 for (unsigned i = 0; i < NumOfElem; ++i) { 1637 MaskVal[0] = N->getMaskElt(i * Width); 1638 if ((StepLen == 1) && (MaskVal[0] % Width)) { 1639 return false; 1640 } else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) { 1641 return false; 1642 } 1643 1644 for (unsigned int j = 1; j < Width; ++j) { 1645 MaskVal[j] = N->getMaskElt(i * Width + j); 1646 if (MaskVal[j] != MaskVal[j-1] + StepLen) { 1647 return false; 1648 } 1649 } 1650 } 1651 1652 return true; 1653 } 1654 1655 bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, 1656 unsigned &InsertAtByte, bool &Swap, bool IsLE) { 1657 if (!isNByteElemShuffleMask(N, 4, 1)) 1658 return false; 1659 1660 // Now we look at mask elements 0,4,8,12 1661 unsigned M0 = N->getMaskElt(0) / 4; 1662 unsigned M1 = N->getMaskElt(4) / 4; 1663 unsigned M2 = N->getMaskElt(8) / 4; 1664 unsigned M3 = N->getMaskElt(12) / 4; 1665 unsigned LittleEndianShifts[] = { 2, 1, 0, 3 }; 1666 unsigned BigEndianShifts[] = { 3, 0, 1, 2 }; 1667 1668 // Below, let H and L be arbitrary elements of the shuffle mask 1669 // where H is in the range [4,7] and L is in the range [0,3]. 1670 // H, 1, 2, 3 or L, 5, 6, 7 1671 if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) || 1672 (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) { 1673 ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3]; 1674 InsertAtByte = IsLE ? 12 : 0; 1675 Swap = M0 < 4; 1676 return true; 1677 } 1678 // 0, H, 2, 3 or 4, L, 6, 7 1679 if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) || 1680 (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) { 1681 ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3]; 1682 InsertAtByte = IsLE ? 8 : 4; 1683 Swap = M1 < 4; 1684 return true; 1685 } 1686 // 0, 1, H, 3 or 4, 5, L, 7 1687 if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) || 1688 (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) { 1689 ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3]; 1690 InsertAtByte = IsLE ? 
4 : 8; 1691 Swap = M2 < 4; 1692 return true; 1693 } 1694 // 0, 1, 2, H or 4, 5, 6, L 1695 if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) || 1696 (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) { 1697 ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3]; 1698 InsertAtByte = IsLE ? 0 : 12; 1699 Swap = M3 < 4; 1700 return true; 1701 } 1702 1703 // If both vector operands for the shuffle are the same vector, the mask will 1704 // contain only elements from the first one and the second one will be undef. 1705 if (N->getOperand(1).isUndef()) { 1706 ShiftElts = 0; 1707 Swap = true; 1708 unsigned XXINSERTWSrcElem = IsLE ? 2 : 1; 1709 if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) { 1710 InsertAtByte = IsLE ? 12 : 0; 1711 return true; 1712 } 1713 if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) { 1714 InsertAtByte = IsLE ? 8 : 4; 1715 return true; 1716 } 1717 if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) { 1718 InsertAtByte = IsLE ? 4 : 8; 1719 return true; 1720 } 1721 if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) { 1722 InsertAtByte = IsLE ? 0 : 12; 1723 return true; 1724 } 1725 } 1726 1727 return false; 1728 } 1729 1730 bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, 1731 bool &Swap, bool IsLE) { 1732 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); 1733 // Ensure each byte index of the word is consecutive. 1734 if (!isNByteElemShuffleMask(N, 4, 1)) 1735 return false; 1736 1737 // Now we look at mask elements 0,4,8,12, which are the beginning of words. 1738 unsigned M0 = N->getMaskElt(0) / 4; 1739 unsigned M1 = N->getMaskElt(4) / 4; 1740 unsigned M2 = N->getMaskElt(8) / 4; 1741 unsigned M3 = N->getMaskElt(12) / 4; 1742 1743 // If both vector operands for the shuffle are the same vector, the mask will 1744 // contain only elements from the first one and the second one will be undef. 1745 if (N->getOperand(1).isUndef()) { 1746 assert(M0 < 4 && "Indexing into an undef vector?"); 1747 if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4) 1748 return false; 1749 1750 ShiftElts = IsLE ? (4 - M0) % 4 : M0; 1751 Swap = false; 1752 return true; 1753 } 1754 1755 // Ensure each word index of the ShuffleVector Mask is consecutive. 1756 if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8) 1757 return false; 1758 1759 if (IsLE) { 1760 if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) { 1761 // Input vectors don't need to be swapped if the leading element 1762 // of the result is one of the 3 left elements of the second vector 1763 // (or if there is no shift to be done at all). 1764 Swap = false; 1765 ShiftElts = (8 - M0) % 8; 1766 } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) { 1767 // Input vectors need to be swapped if the leading element 1768 // of the result is one of the 3 left elements of the first vector 1769 // (or if we're shifting by 4 - thereby simply swapping the vectors). 1770 Swap = true; 1771 ShiftElts = (4 - M0) % 4; 1772 } 1773 1774 return true; 1775 } else { // BE 1776 if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) { 1777 // Input vectors don't need to be swapped if the leading element 1778 // of the result is one of the 4 elements of the first vector. 1779 Swap = false; 1780 ShiftElts = M0; 1781 } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) { 1782 // Input vectors need to be swapped if the leading element 1783 // of the result is one of the 4 elements of the right vector. 
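      // For example (illustrative only): M0 == 5 swaps the inputs and
      // shifts by one word (ShiftElts == 1).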
1784 Swap = true; 1785 ShiftElts = M0 - 4; 1786 } 1787 1788 return true; 1789 } 1790 } 1791 1792 bool static isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) { 1793 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); 1794 1795 if (!isNByteElemShuffleMask(N, Width, -1)) 1796 return false; 1797 1798 for (int i = 0; i < 16; i += Width) 1799 if (N->getMaskElt(i) != i + Width - 1) 1800 return false; 1801 1802 return true; 1803 } 1804 1805 bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) { 1806 return isXXBRShuffleMaskHelper(N, 2); 1807 } 1808 1809 bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) { 1810 return isXXBRShuffleMaskHelper(N, 4); 1811 } 1812 1813 bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) { 1814 return isXXBRShuffleMaskHelper(N, 8); 1815 } 1816 1817 bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) { 1818 return isXXBRShuffleMaskHelper(N, 16); 1819 } 1820 1821 /// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap 1822 /// if the inputs to the instruction should be swapped and set \p DM to the 1823 /// value for the immediate. 1824 /// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI 1825 /// AND element 0 of the result comes from the first input (LE) or second input 1826 /// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered. 1827 /// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle 1828 /// mask. 1829 bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM, 1830 bool &Swap, bool IsLE) { 1831 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); 1832 1833 // Ensure each byte index of the double word is consecutive. 1834 if (!isNByteElemShuffleMask(N, 8, 1)) 1835 return false; 1836 1837 unsigned M0 = N->getMaskElt(0) / 8; 1838 unsigned M1 = N->getMaskElt(8) / 8; 1839 assert(((M0 | M1) < 4) && "A mask element out of bounds?"); 1840 1841 // If both vector operands for the shuffle are the same vector, the mask will 1842 // contain only elements from the first one and the second one will be undef. 1843 if (N->getOperand(1).isUndef()) { 1844 if ((M0 | M1) < 2) { 1845 DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1); 1846 Swap = false; 1847 return true; 1848 } else 1849 return false; 1850 } 1851 1852 if (IsLE) { 1853 if (M0 > 1 && M1 < 2) { 1854 Swap = false; 1855 } else if (M0 < 2 && M1 > 1) { 1856 M0 = (M0 + 2) % 4; 1857 M1 = (M1 + 2) % 4; 1858 Swap = true; 1859 } else 1860 return false; 1861 1862 // Note: if control flow comes here that means Swap is already set above 1863 DM = (((~M1) & 1) << 1) + ((~M0) & 1); 1864 return true; 1865 } else { // BE 1866 if (M0 < 2 && M1 > 1) { 1867 Swap = false; 1868 } else if (M0 > 1 && M1 < 2) { 1869 M0 = (M0 + 2) % 4; 1870 M1 = (M1 + 2) % 4; 1871 Swap = true; 1872 } else 1873 return false; 1874 1875 // Note: if control flow comes here that means Swap is already set above 1876 DM = (M0 << 1) + (M1 & 1); 1877 return true; 1878 } 1879 } 1880 1881 1882 /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the 1883 /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. 
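/// For example (illustrative only): a splat of word element 1 (mask bytes
/// starting at 4, EltSize == 4) yields immediate 1 on big-endian targets and
/// 2 on little-endian targets.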
1884 unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, 1885 SelectionDAG &DAG) { 1886 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 1887 assert(isSplatShuffleMask(SVOp, EltSize)); 1888 if (DAG.getDataLayout().isLittleEndian()) 1889 return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); 1890 else 1891 return SVOp->getMaskElt(0) / EltSize; 1892 } 1893 1894 /// get_VSPLTI_elt - If this is a build_vector of constants which can be formed 1895 /// by using a vspltis[bhw] instruction of the specified element size, return 1896 /// the constant being splatted. The ByteSize field indicates the number of 1897 /// bytes of each element [124] -> [bhw]. 1898 SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { 1899 SDValue OpVal(nullptr, 0); 1900 1901 // If ByteSize of the splat is bigger than the element size of the 1902 // build_vector, then we have a case where we are checking for a splat where 1903 // multiple elements of the buildvector are folded together into a single 1904 // logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8). 1905 unsigned EltSize = 16/N->getNumOperands(); 1906 if (EltSize < ByteSize) { 1907 unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval. 1908 SDValue UniquedVals[4]; 1909 assert(Multiple > 1 && Multiple <= 4 && "How can this happen?"); 1910 1911 // See if all of the elements in the buildvector agree across. 1912 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 1913 if (N->getOperand(i).isUndef()) continue; 1914 // If the element isn't a constant, bail fully out. 1915 if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue(); 1916 1917 if (!UniquedVals[i&(Multiple-1)].getNode()) 1918 UniquedVals[i&(Multiple-1)] = N->getOperand(i); 1919 else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i)) 1920 return SDValue(); // no match. 1921 } 1922 1923 // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains 1924 // either constant or undef values that are identical for each chunk. See 1925 // if these chunks can form into a larger vspltis*. 1926 1927 // Check to see if all of the leading entries are either 0 or -1. If 1928 // neither, then this won't fit into the immediate field. 1929 bool LeadingZero = true; 1930 bool LeadingOnes = true; 1931 for (unsigned i = 0; i != Multiple-1; ++i) { 1932 if (!UniquedVals[i].getNode()) continue; // Must have been undefs. 1933 1934 LeadingZero &= isNullConstant(UniquedVals[i]); 1935 LeadingOnes &= isAllOnesConstant(UniquedVals[i]); 1936 } 1937 // Finally, check the least significant entry. 1938 if (LeadingZero) { 1939 if (!UniquedVals[Multiple-1].getNode()) 1940 return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef 1941 int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue(); 1942 if (Val < 16) // 0,0,0,4 -> vspltisw(4) 1943 return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32); 1944 } 1945 if (LeadingOnes) { 1946 if (!UniquedVals[Multiple-1].getNode()) 1947 return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef 1948 int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue(); 1949 if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2) 1950 return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32); 1951 } 1952 1953 return SDValue(); 1954 } 1955 1956 // Check to see if this buildvec has a single non-undef value in its elements. 
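  // For example (illustrative only): the v4i32 build_vector <5,5,5,5> has the
  // single value 5, which fits in the 5-bit signed immediate of vspltisw.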
1957 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 1958 if (N->getOperand(i).isUndef()) continue; 1959 if (!OpVal.getNode()) 1960 OpVal = N->getOperand(i); 1961 else if (OpVal != N->getOperand(i)) 1962 return SDValue(); 1963 } 1964 1965 if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def. 1966 1967 unsigned ValSizeInBytes = EltSize; 1968 uint64_t Value = 0; 1969 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) { 1970 Value = CN->getZExtValue(); 1971 } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) { 1972 assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!"); 1973 Value = FloatToBits(CN->getValueAPF().convertToFloat()); 1974 } 1975 1976 // If the splat value is larger than the element value, then we can never do 1977 // this splat. The only case that we could fit the replicated bits into our 1978 // immediate field for would be zero, and we prefer to use vxor for it. 1979 if (ValSizeInBytes < ByteSize) return SDValue(); 1980 1981 // If the element value is larger than the splat value, check if it consists 1982 // of a repeated bit pattern of size ByteSize. 1983 if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8)) 1984 return SDValue(); 1985 1986 // Properly sign extend the value. 1987 int MaskVal = SignExtend32(Value, ByteSize * 8); 1988 1989 // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros. 1990 if (MaskVal == 0) return SDValue(); 1991 1992 // Finally, if this value fits in a 5 bit sext field, return it 1993 if (SignExtend32<5>(MaskVal) == MaskVal) 1994 return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32); 1995 return SDValue(); 1996 } 1997 1998 /// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift 1999 /// amount, otherwise return -1. 2000 int PPC::isQVALIGNIShuffleMask(SDNode *N) { 2001 EVT VT = N->getValueType(0); 2002 if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1) 2003 return -1; 2004 2005 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2006 2007 // Find the first non-undef value in the shuffle mask. 2008 unsigned i; 2009 for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i) 2010 /*search*/; 2011 2012 if (i == 4) return -1; // all undef. 2013 2014 // Otherwise, check to see if the rest of the elements are consecutively 2015 // numbered from this value. 2016 unsigned ShiftAmt = SVOp->getMaskElt(i); 2017 if (ShiftAmt < i) return -1; 2018 ShiftAmt -= i; 2019 2020 // Check the rest of the elements to see if they are consecutive. 2021 for (++i; i != 4; ++i) 2022 if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) 2023 return -1; 2024 2025 return ShiftAmt; 2026 } 2027 2028 //===----------------------------------------------------------------------===// 2029 // Addressing Mode Selection 2030 //===----------------------------------------------------------------------===// 2031 2032 /// isIntS16Immediate - This method tests to see if the node is either a 32-bit 2033 /// or 64-bit immediate, and if the value can be accurately represented as a 2034 /// sign extension from a 16-bit value. If so, this returns true and the 2035 /// immediate. 
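/// For example (illustrative only): 0x7fff qualifies (Imm = 32767), while
/// 0x8000 does not, since (int16_t)0x8000 sign-extends to -32768.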
bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
  if (!isa<ConstantSDNode>(N))
    return false;

  Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue();
  if (N->getValueType(0) == MVT::i32)
    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
  else
    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
}
bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
  return isIntS16Immediate(Op.getNode(), Imm);
}

/// SelectAddressRegReg - Given the specified address, check to see if it
/// can be represented as an indexed [r+r] operation. Returns false if it
/// can be more efficiently represented with [r+imm].
bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
                                            SDValue &Index,
                                            SelectionDAG &DAG) const {
  int16_t imm = 0;
  if (N.getOpcode() == ISD::ADD) {
    if (isIntS16Immediate(N.getOperand(1), imm))
      return false;    // r+i
    if (N.getOperand(1).getOpcode() == PPCISD::Lo)
      return false;    // r+i

    Base = N.getOperand(0);
    Index = N.getOperand(1);
    return true;
  } else if (N.getOpcode() == ISD::OR) {
    if (isIntS16Immediate(N.getOperand(1), imm))
      return false;    // r+i; fold as [r+imm] if we can.

    // If this is an or of disjoint bitfields, we can codegen this as an add
    // (for better address arithmetic) if the LHS and RHS of the OR are provably
    // disjoint.
    KnownBits LHSKnown, RHSKnown;
    DAG.computeKnownBits(N.getOperand(0), LHSKnown);

    if (LHSKnown.Zero.getBoolValue()) {
      DAG.computeKnownBits(N.getOperand(1), RHSKnown);
      // If all of the bits are known zero on the LHS or RHS, the add won't
      // carry.
      if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) {
        Base = N.getOperand(0);
        Index = N.getOperand(1);
        return true;
      }
    }
  }

  return false;
}

// If we happen to be doing an i64 load or store into a stack slot that has
// less than a 4-byte alignment, then the frame-index elimination may need to
// use an indexed load or store instruction (because the offset may not be a
// multiple of 4). The extra register needed to hold the offset comes from the
// register scavenger, and it is possible that the scavenger will need to use
// an emergency spill slot. As a result, we need to make sure that a spill slot
// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
// stack slot.
static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
  // FIXME: This does not handle the LWA case.
  if (VT != MVT::i64)
    return;

  // NOTE: We'll exclude negative FIs here, which come from argument
  // lowering, because there are no known test cases triggering this problem
  // using packed structures (or similar). We can remove this exclusion if
  // we find such a test case. The reason why this is so test-case driven is
  // because this entire 'fixup' is only to prevent crashes (from the
  // register scavenger) on not-really-valid inputs. For example, if we have:
  //   %a = alloca i1
  //   %b = bitcast i1* %a to i64*
  //   store i64 0, i64* %b
  // then the store should really be marked as 'align 1', but is not. If it
  // were marked as 'align 1' then the indexed form would have been
  // instruction-selected initially, and the problem this 'fixup' is preventing
  // won't happen regardless.
2117 if (FrameIdx < 0) 2118 return; 2119 2120 MachineFunction &MF = DAG.getMachineFunction(); 2121 MachineFrameInfo &MFI = MF.getFrameInfo(); 2122 2123 unsigned Align = MFI.getObjectAlignment(FrameIdx); 2124 if (Align >= 4) 2125 return; 2126 2127 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 2128 FuncInfo->setHasNonRISpills(); 2129 } 2130 2131 /// Returns true if the address N can be represented by a base register plus 2132 /// a signed 16-bit displacement [r+imm], and if it is not better 2133 /// represented as reg+reg. If \p Alignment is non-zero, only accept 2134 /// displacements that are multiples of that value. 2135 bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, 2136 SDValue &Base, 2137 SelectionDAG &DAG, 2138 unsigned Alignment) const { 2139 // FIXME dl should come from parent load or store, not from address 2140 SDLoc dl(N); 2141 // If this can be more profitably realized as r+r, fail. 2142 if (SelectAddressRegReg(N, Disp, Base, DAG)) 2143 return false; 2144 2145 if (N.getOpcode() == ISD::ADD) { 2146 int16_t imm = 0; 2147 if (isIntS16Immediate(N.getOperand(1), imm) && 2148 (!Alignment || (imm % Alignment) == 0)) { 2149 Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); 2150 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { 2151 Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); 2152 fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); 2153 } else { 2154 Base = N.getOperand(0); 2155 } 2156 return true; // [r+i] 2157 } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) { 2158 // Match LOAD (ADD (X, Lo(G))). 2159 assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue() 2160 && "Cannot handle constant offsets yet!"); 2161 Disp = N.getOperand(1).getOperand(0); // The global address. 2162 assert(Disp.getOpcode() == ISD::TargetGlobalAddress || 2163 Disp.getOpcode() == ISD::TargetGlobalTLSAddress || 2164 Disp.getOpcode() == ISD::TargetConstantPool || 2165 Disp.getOpcode() == ISD::TargetJumpTable); 2166 Base = N.getOperand(0); 2167 return true; // [&g+r] 2168 } 2169 } else if (N.getOpcode() == ISD::OR) { 2170 int16_t imm = 0; 2171 if (isIntS16Immediate(N.getOperand(1), imm) && 2172 (!Alignment || (imm % Alignment) == 0)) { 2173 // If this is an or of disjoint bitfields, we can codegen this as an add 2174 // (for better address arithmetic) if the LHS and RHS of the OR are 2175 // provably disjoint. 2176 KnownBits LHSKnown; 2177 DAG.computeKnownBits(N.getOperand(0), LHSKnown); 2178 2179 if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { 2180 // If all of the bits are known zero on the LHS or RHS, the add won't 2181 // carry. 2182 if (FrameIndexSDNode *FI = 2183 dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { 2184 Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); 2185 fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); 2186 } else { 2187 Base = N.getOperand(0); 2188 } 2189 Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); 2190 return true; 2191 } 2192 } 2193 } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { 2194 // Loading from a constant address. 2195 2196 // If this address fits entirely in a 16-bit sext immediate field, codegen 2197 // this as "d, 0" 2198 int16_t Imm; 2199 if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) { 2200 Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); 2201 Base = DAG.getRegister(Subtarget.isPPC64() ? 
PPC::ZERO8 : PPC::ZERO, 2202 CN->getValueType(0)); 2203 return true; 2204 } 2205 2206 // Handle 32-bit sext immediates with LIS + addr mode. 2207 if ((CN->getValueType(0) == MVT::i32 || 2208 (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && 2209 (!Alignment || (CN->getZExtValue() % Alignment) == 0)) { 2210 int Addr = (int)CN->getZExtValue(); 2211 2212 // Otherwise, break this down into an LIS + disp. 2213 Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32); 2214 2215 Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl, 2216 MVT::i32); 2217 unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8; 2218 Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0); 2219 return true; 2220 } 2221 } 2222 2223 Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout())); 2224 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) { 2225 Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); 2226 fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); 2227 } else 2228 Base = N; 2229 return true; // [r+0] 2230 } 2231 2232 /// SelectAddressRegRegOnly - Given the specified addressed, force it to be 2233 /// represented as an indexed [r+r] operation. 2234 bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, 2235 SDValue &Index, 2236 SelectionDAG &DAG) const { 2237 // Check to see if we can easily represent this as an [r+r] address. This 2238 // will fail if it thinks that the address is more profitably represented as 2239 // reg+imm, e.g. where imm = 0. 2240 if (SelectAddressRegReg(N, Base, Index, DAG)) 2241 return true; 2242 2243 // If the address is the result of an add, we will utilize the fact that the 2244 // address calculation includes an implicit add. However, we can reduce 2245 // register pressure if we do not materialize a constant just for use as the 2246 // index register. We only get rid of the add if it is not an add of a 2247 // value and a 16-bit signed constant and both have a single use. 2248 int16_t imm = 0; 2249 if (N.getOpcode() == ISD::ADD && 2250 (!isIntS16Immediate(N.getOperand(1), imm) || 2251 !N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) { 2252 Base = N.getOperand(0); 2253 Index = N.getOperand(1); 2254 return true; 2255 } 2256 2257 // Otherwise, do it the hard way, using R0 as the base register. 2258 Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, 2259 N.getValueType()); 2260 Index = N; 2261 return true; 2262 } 2263 2264 /// getPreIndexedAddressParts - returns true by value, base pointer and 2265 /// offset pointer and addressing mode by reference if the node's address 2266 /// can be legally represented as pre-indexed load / store address. 2267 bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 2268 SDValue &Offset, 2269 ISD::MemIndexedMode &AM, 2270 SelectionDAG &DAG) const { 2271 if (DisablePPCPreinc) return false; 2272 2273 bool isLoad = true; 2274 SDValue Ptr; 2275 EVT VT; 2276 unsigned Alignment; 2277 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 2278 Ptr = LD->getBasePtr(); 2279 VT = LD->getMemoryVT(); 2280 Alignment = LD->getAlignment(); 2281 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 2282 Ptr = ST->getBasePtr(); 2283 VT = ST->getMemoryVT(); 2284 Alignment = ST->getAlignment(); 2285 isLoad = false; 2286 } else 2287 return false; 2288 2289 // PowerPC doesn't have preinc load/store instructions for vectors (except 2290 // for QPX, which does have preinc r+r forms). 
2291 if (VT.isVector()) { 2292 if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) { 2293 return false; 2294 } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) { 2295 AM = ISD::PRE_INC; 2296 return true; 2297 } 2298 } 2299 2300 if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) { 2301 // Common code will reject creating a pre-inc form if the base pointer 2302 // is a frame index, or if N is a store and the base pointer is either 2303 // the same as or a predecessor of the value being stored. Check for 2304 // those situations here, and try with swapped Base/Offset instead. 2305 bool Swap = false; 2306 2307 if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base)) 2308 Swap = true; 2309 else if (!isLoad) { 2310 SDValue Val = cast<StoreSDNode>(N)->getValue(); 2311 if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode())) 2312 Swap = true; 2313 } 2314 2315 if (Swap) 2316 std::swap(Base, Offset); 2317 2318 AM = ISD::PRE_INC; 2319 return true; 2320 } 2321 2322 // LDU/STU can only handle immediates that are a multiple of 4. 2323 if (VT != MVT::i64) { 2324 if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0)) 2325 return false; 2326 } else { 2327 // LDU/STU need an address with at least 4-byte alignment. 2328 if (Alignment < 4) 2329 return false; 2330 2331 if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4)) 2332 return false; 2333 } 2334 2335 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 2336 // PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of 2337 // sext i32 to i64 when addr mode is r+i. 2338 if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 && 2339 LD->getExtensionType() == ISD::SEXTLOAD && 2340 isa<ConstantSDNode>(Offset)) 2341 return false; 2342 } 2343 2344 AM = ISD::PRE_INC; 2345 return true; 2346 } 2347 2348 //===----------------------------------------------------------------------===// 2349 // LowerOperation implementation 2350 //===----------------------------------------------------------------------===// 2351 2352 /// Return true if we should reference labels using a PICBase, set the HiOpFlags 2353 /// and LoOpFlags to the target MO flags. 2354 static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget, 2355 unsigned &HiOpFlags, unsigned &LoOpFlags, 2356 const GlobalValue *GV = nullptr) { 2357 HiOpFlags = PPCII::MO_HA; 2358 LoOpFlags = PPCII::MO_LO; 2359 2360 // Don't use the pic base if not in PIC relocation model. 2361 if (IsPIC) { 2362 HiOpFlags |= PPCII::MO_PIC_FLAG; 2363 LoOpFlags |= PPCII::MO_PIC_FLAG; 2364 } 2365 2366 // If this is a reference to a global value that requires a non-lazy-ptr, make 2367 // sure that instruction lowering adds it. 2368 if (GV && Subtarget.hasLazyResolverStub(GV)) { 2369 HiOpFlags |= PPCII::MO_NLP_FLAG; 2370 LoOpFlags |= PPCII::MO_NLP_FLAG; 2371 2372 if (GV->hasHiddenVisibility()) { 2373 HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG; 2374 LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG; 2375 } 2376 } 2377 } 2378 2379 static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC, 2380 SelectionDAG &DAG) { 2381 SDLoc DL(HiPart); 2382 EVT PtrVT = HiPart.getValueType(); 2383 SDValue Zero = DAG.getConstant(0, DL, PtrVT); 2384 2385 SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero); 2386 SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero); 2387 2388 // With PIC, the first instruction is actually "GR+hi(&G)". 
2389 if (isPIC) 2390 Hi = DAG.getNode(ISD::ADD, DL, PtrVT, 2391 DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi); 2392 2393 // Generate non-pic code that has direct accesses to the constant pool. 2394 // The address of the global is just (hi(&g)+lo(&g)). 2395 return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo); 2396 } 2397 2398 static void setUsesTOCBasePtr(MachineFunction &MF) { 2399 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 2400 FuncInfo->setUsesTOCBasePtr(); 2401 } 2402 2403 static void setUsesTOCBasePtr(SelectionDAG &DAG) { 2404 setUsesTOCBasePtr(DAG.getMachineFunction()); 2405 } 2406 2407 static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit, 2408 SDValue GA) { 2409 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 2410 SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) : 2411 DAG.getNode(PPCISD::GlobalBaseReg, dl, VT); 2412 2413 SDValue Ops[] = { GA, Reg }; 2414 return DAG.getMemIntrinsicNode( 2415 PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, 2416 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, false, true, 2417 false, 0); 2418 } 2419 2420 SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, 2421 SelectionDAG &DAG) const { 2422 EVT PtrVT = Op.getValueType(); 2423 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 2424 const Constant *C = CP->getConstVal(); 2425 2426 // 64-bit SVR4 ABI code is always position-independent. 2427 // The actual address of the GlobalValue is stored in the TOC. 2428 if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { 2429 setUsesTOCBasePtr(DAG); 2430 SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); 2431 return getTOCEntry(DAG, SDLoc(CP), true, GA); 2432 } 2433 2434 unsigned MOHiFlag, MOLoFlag; 2435 bool IsPIC = isPositionIndependent(); 2436 getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); 2437 2438 if (IsPIC && Subtarget.isSVR4ABI()) { 2439 SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 2440 PPCII::MO_PIC_FLAG); 2441 return getTOCEntry(DAG, SDLoc(CP), false, GA); 2442 } 2443 2444 SDValue CPIHi = 2445 DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag); 2446 SDValue CPILo = 2447 DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag); 2448 return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG); 2449 } 2450 2451 // For 64-bit PowerPC, prefer the more compact relative encodings. 2452 // This trades 32 bits per jump table entry for one or two instructions 2453 // on the jump site. 
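// (Illustrative: a label-difference entry is a 32-bit offset relative to the
// jump-table base rather than a full 64-bit address, which is where the
// 32 bits per entry are saved.)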
2454 unsigned PPCTargetLowering::getJumpTableEncoding() const { 2455 if (isJumpTableRelative()) 2456 return MachineJumpTableInfo::EK_LabelDifference32; 2457 2458 return TargetLowering::getJumpTableEncoding(); 2459 } 2460 2461 bool PPCTargetLowering::isJumpTableRelative() const { 2462 if (Subtarget.isPPC64()) 2463 return true; 2464 return TargetLowering::isJumpTableRelative(); 2465 } 2466 2467 SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table, 2468 SelectionDAG &DAG) const { 2469 if (!Subtarget.isPPC64()) 2470 return TargetLowering::getPICJumpTableRelocBase(Table, DAG); 2471 2472 switch (getTargetMachine().getCodeModel()) { 2473 case CodeModel::Default: 2474 case CodeModel::Small: 2475 case CodeModel::Medium: 2476 return TargetLowering::getPICJumpTableRelocBase(Table, DAG); 2477 default: 2478 return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(), 2479 getPointerTy(DAG.getDataLayout())); 2480 } 2481 } 2482 2483 const MCExpr * 2484 PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF, 2485 unsigned JTI, 2486 MCContext &Ctx) const { 2487 if (!Subtarget.isPPC64()) 2488 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 2489 2490 switch (getTargetMachine().getCodeModel()) { 2491 case CodeModel::Default: 2492 case CodeModel::Small: 2493 case CodeModel::Medium: 2494 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 2495 default: 2496 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); 2497 } 2498 } 2499 2500 SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 2501 EVT PtrVT = Op.getValueType(); 2502 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 2503 2504 // 64-bit SVR4 ABI code is always position-independent. 2505 // The actual address of the GlobalValue is stored in the TOC. 2506 if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { 2507 setUsesTOCBasePtr(DAG); 2508 SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); 2509 return getTOCEntry(DAG, SDLoc(JT), true, GA); 2510 } 2511 2512 unsigned MOHiFlag, MOLoFlag; 2513 bool IsPIC = isPositionIndependent(); 2514 getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); 2515 2516 if (IsPIC && Subtarget.isSVR4ABI()) { 2517 SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 2518 PPCII::MO_PIC_FLAG); 2519 return getTOCEntry(DAG, SDLoc(GA), false, GA); 2520 } 2521 2522 SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag); 2523 SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag); 2524 return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG); 2525 } 2526 2527 SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, 2528 SelectionDAG &DAG) const { 2529 EVT PtrVT = Op.getValueType(); 2530 BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op); 2531 const BlockAddress *BA = BASDN->getBlockAddress(); 2532 2533 // 64-bit SVR4 ABI code is always position-independent. 2534 // The actual BlockAddress is stored in the TOC. 
2535 if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { 2536 setUsesTOCBasePtr(DAG); 2537 SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()); 2538 return getTOCEntry(DAG, SDLoc(BASDN), true, GA); 2539 } 2540 2541 unsigned MOHiFlag, MOLoFlag; 2542 bool IsPIC = isPositionIndependent(); 2543 getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); 2544 SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag); 2545 SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag); 2546 return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG); 2547 } 2548 2549 SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, 2550 SelectionDAG &DAG) const { 2551 // FIXME: TLS addresses currently use medium model code sequences, 2552 // which is the most useful form. Eventually support for small and 2553 // large models could be added if users need it, at the cost of 2554 // additional complexity. 2555 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 2556 if (DAG.getTarget().Options.EmulatedTLS) 2557 return LowerToTLSEmulatedModel(GA, DAG); 2558 2559 SDLoc dl(GA); 2560 const GlobalValue *GV = GA->getGlobal(); 2561 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2562 bool is64bit = Subtarget.isPPC64(); 2563 const Module *M = DAG.getMachineFunction().getFunction()->getParent(); 2564 PICLevel::Level picLevel = M->getPICLevel(); 2565 2566 TLSModel::Model Model = getTargetMachine().getTLSModel(GV); 2567 2568 if (Model == TLSModel::LocalExec) { 2569 SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 2570 PPCII::MO_TPREL_HA); 2571 SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 2572 PPCII::MO_TPREL_LO); 2573 SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64) 2574 : DAG.getRegister(PPC::R2, MVT::i32); 2575 2576 SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); 2577 return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); 2578 } 2579 2580 if (Model == TLSModel::InitialExec) { 2581 SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); 2582 SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 2583 PPCII::MO_TLS); 2584 SDValue GOTPtr; 2585 if (is64bit) { 2586 setUsesTOCBasePtr(DAG); 2587 SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); 2588 GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, 2589 PtrVT, GOTReg, TGA); 2590 } else 2591 GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT); 2592 SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, 2593 PtrVT, TGA, GOTPtr); 2594 return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS); 2595 } 2596 2597 if (Model == TLSModel::GeneralDynamic) { 2598 SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); 2599 SDValue GOTPtr; 2600 if (is64bit) { 2601 setUsesTOCBasePtr(DAG); 2602 SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); 2603 GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, 2604 GOTReg, TGA); 2605 } else { 2606 if (picLevel == PICLevel::SmallPIC) 2607 GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); 2608 else 2609 GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); 2610 } 2611 return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT, 2612 GOTPtr, TGA, TGA); 2613 } 2614 2615 if (Model == TLSModel::LocalDynamic) { 2616 SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); 2617 SDValue GOTPtr; 2618 if (is64bit) { 2619 setUsesTOCBasePtr(DAG); 2620 SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); 2621 GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, 2622 GOTReg, TGA); 2623 } else { 2624 if (picLevel == 
PICLevel::SmallPIC) 2625 GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); 2626 else 2627 GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); 2628 } 2629 SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl, 2630 PtrVT, GOTPtr, TGA, TGA); 2631 SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, 2632 PtrVT, TLSAddr, TGA); 2633 return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA); 2634 } 2635 2636 llvm_unreachable("Unknown TLS model!"); 2637 } 2638 2639 SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, 2640 SelectionDAG &DAG) const { 2641 EVT PtrVT = Op.getValueType(); 2642 GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op); 2643 SDLoc DL(GSDN); 2644 const GlobalValue *GV = GSDN->getGlobal(); 2645 2646 // 64-bit SVR4 ABI code is always position-independent. 2647 // The actual address of the GlobalValue is stored in the TOC. 2648 if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { 2649 setUsesTOCBasePtr(DAG); 2650 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset()); 2651 return getTOCEntry(DAG, DL, true, GA); 2652 } 2653 2654 unsigned MOHiFlag, MOLoFlag; 2655 bool IsPIC = isPositionIndependent(); 2656 getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV); 2657 2658 if (IsPIC && Subtarget.isSVR4ABI()) { 2659 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 2660 GSDN->getOffset(), 2661 PPCII::MO_PIC_FLAG); 2662 return getTOCEntry(DAG, DL, false, GA); 2663 } 2664 2665 SDValue GAHi = 2666 DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag); 2667 SDValue GALo = 2668 DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag); 2669 2670 SDValue Ptr = LowerLabelRef(GAHi, GALo, IsPIC, DAG); 2671 2672 // If the global reference is actually to a non-lazy-pointer, we have to do an 2673 // extra load to get the address of the global. 2674 if (MOHiFlag & PPCII::MO_NLP_FLAG) 2675 Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo()); 2676 return Ptr; 2677 } 2678 2679 SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 2680 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 2681 SDLoc dl(Op); 2682 2683 if (Op.getValueType() == MVT::v2i64) { 2684 // When the operands themselves are v2i64 values, we need to do something 2685 // special because VSX has no underlying comparison operations for these. 2686 if (Op.getOperand(0).getValueType() == MVT::v2i64) { 2687 // Equality can be handled by casting to the legal type for Altivec 2688 // comparisons, everything else needs to be expanded. 2689 if (CC == ISD::SETEQ || CC == ISD::SETNE) { 2690 return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 2691 DAG.getSetCC(dl, MVT::v4i32, 2692 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)), 2693 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)), 2694 CC)); 2695 } 2696 2697 return SDValue(); 2698 } 2699 2700 // We handle most of these in the usual way. 2701 return Op; 2702 } 2703 2704 // If we're comparing for equality to zero, expose the fact that this is 2705 // implemented as a ctlz/srl pair on ppc, so that the dag combiner can 2706 // fold the new nodes. 2707 if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG)) 2708 return V; 2709 2710 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 2711 // Leave comparisons against 0 and -1 alone for now, since they're usually 2712 // optimized. FIXME: revisit this when we can custom lower all setcc 2713 // optimizations. 
2714 if (C->isAllOnesValue() || C->isNullValue()) 2715 return SDValue(); 2716 } 2717 2718 // If we have an integer seteq/setne, turn it into a compare against zero 2719 // by xor'ing the rhs with the lhs, which is faster than setting a 2720 // condition register, reading it back out, and masking the correct bit. The 2721 // normal approach here uses sub to do this instead of xor. Using xor exposes 2722 // the result to other bit-twiddling opportunities. 2723 EVT LHSVT = Op.getOperand(0).getValueType(); 2724 if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { 2725 EVT VT = Op.getValueType(); 2726 SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0), 2727 Op.getOperand(1)); 2728 return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC); 2729 } 2730 return SDValue(); 2731 } 2732 2733 SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 2734 SDNode *Node = Op.getNode(); 2735 EVT VT = Node->getValueType(0); 2736 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2737 SDValue InChain = Node->getOperand(0); 2738 SDValue VAListPtr = Node->getOperand(1); 2739 const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); 2740 SDLoc dl(Node); 2741 2742 assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only"); 2743 2744 // gpr_index 2745 SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, 2746 VAListPtr, MachinePointerInfo(SV), MVT::i8); 2747 InChain = GprIndex.getValue(1); 2748 2749 if (VT == MVT::i64) { 2750 // Check if GprIndex is even 2751 SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex, 2752 DAG.getConstant(1, dl, MVT::i32)); 2753 SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd, 2754 DAG.getConstant(0, dl, MVT::i32), ISD::SETNE); 2755 SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex, 2756 DAG.getConstant(1, dl, MVT::i32)); 2757 // Align GprIndex to be even if it isn't 2758 GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne, 2759 GprIndex); 2760 } 2761 2762 // fpr index is 1 byte after gpr 2763 SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, 2764 DAG.getConstant(1, dl, MVT::i32)); 2765 2766 // fpr 2767 SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, 2768 FprPtr, MachinePointerInfo(SV), MVT::i8); 2769 InChain = FprIndex.getValue(1); 2770 2771 SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, 2772 DAG.getConstant(8, dl, MVT::i32)); 2773 2774 SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, 2775 DAG.getConstant(4, dl, MVT::i32)); 2776 2777 // areas 2778 SDValue OverflowArea = 2779 DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo()); 2780 InChain = OverflowArea.getValue(1); 2781 2782 SDValue RegSaveArea = 2783 DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo()); 2784 InChain = RegSaveArea.getValue(1); 2785 2786 // select overflow_area if index > 8 2787 SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex, 2788 DAG.getConstant(8, dl, MVT::i32), ISD::SETLT); 2789 2790 // adjustment constant gpr_index * 4/8 2791 SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32, 2792 VT.isInteger() ? GprIndex : FprIndex, 2793 DAG.getConstant(VT.isInteger() ? 
4 : 8, dl, 2794 MVT::i32)); 2795 2796 // OurReg = RegSaveArea + RegConstant 2797 SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea, 2798 RegConstant); 2799 2800 // Floating types are 32 bytes into RegSaveArea 2801 if (VT.isFloatingPoint()) 2802 OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg, 2803 DAG.getConstant(32, dl, MVT::i32)); 2804 2805 // increase {f,g}pr_index by 1 (or 2 if VT is i64) 2806 SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32, 2807 VT.isInteger() ? GprIndex : FprIndex, 2808 DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl, 2809 MVT::i32)); 2810 2811 InChain = DAG.getTruncStore(InChain, dl, IndexPlus1, 2812 VT.isInteger() ? VAListPtr : FprPtr, 2813 MachinePointerInfo(SV), MVT::i8); 2814 2815 // determine if we should load from reg_save_area or overflow_area 2816 SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea); 2817 2818 // increase overflow_area by 4/8 if gpr/fpr > 8 2819 SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea, 2820 DAG.getConstant(VT.isInteger() ? 4 : 8, 2821 dl, MVT::i32)); 2822 2823 OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea, 2824 OverflowAreaPlusN); 2825 2826 InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr, 2827 MachinePointerInfo(), MVT::i32); 2828 2829 return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo()); 2830 } 2831 2832 SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 2833 assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only"); 2834 2835 // We have to copy the entire va_list struct: 2836 // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte 2837 return DAG.getMemcpy(Op.getOperand(0), Op, 2838 Op.getOperand(1), Op.getOperand(2), 2839 DAG.getConstant(12, SDLoc(Op), MVT::i32), 8, false, true, 2840 false, MachinePointerInfo(), MachinePointerInfo()); 2841 } 2842 2843 SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, 2844 SelectionDAG &DAG) const { 2845 return Op.getOperand(0); 2846 } 2847 2848 SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, 2849 SelectionDAG &DAG) const { 2850 SDValue Chain = Op.getOperand(0); 2851 SDValue Trmp = Op.getOperand(1); // trampoline 2852 SDValue FPtr = Op.getOperand(2); // nested function 2853 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 2854 SDLoc dl(Op); 2855 2856 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2857 bool isPPC64 = (PtrVT == MVT::i64); 2858 Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); 2859 2860 TargetLowering::ArgListTy Args; 2861 TargetLowering::ArgListEntry Entry; 2862 2863 Entry.Ty = IntPtrTy; 2864 Entry.Node = Trmp; Args.push_back(Entry); 2865 2866 // TrampSize == (isPPC64 ? 48 : 40); 2867 Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl, 2868 isPPC64 ? 
MVT::i64 : MVT::i32); 2869 Args.push_back(Entry); 2870 2871 Entry.Node = FPtr; Args.push_back(Entry); 2872 Entry.Node = Nest; Args.push_back(Entry); 2873 2874 // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) 2875 TargetLowering::CallLoweringInfo CLI(DAG); 2876 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( 2877 CallingConv::C, Type::getVoidTy(*DAG.getContext()), 2878 DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args)); 2879 2880 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 2881 return CallResult.second; 2882 } 2883 2884 SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 2885 MachineFunction &MF = DAG.getMachineFunction(); 2886 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 2887 EVT PtrVT = getPointerTy(MF.getDataLayout()); 2888 2889 SDLoc dl(Op); 2890 2891 if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) { 2892 // vastart just stores the address of the VarArgsFrameIndex slot into the 2893 // memory location argument. 2894 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 2895 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2896 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), 2897 MachinePointerInfo(SV)); 2898 } 2899 2900 // For the 32-bit SVR4 ABI we follow the layout of the va_list struct. 2901 // We suppose the given va_list is already allocated. 2902 // 2903 // typedef struct { 2904 // char gpr; /* index into the array of 8 GPRs 2905 // * stored in the register save area 2906 // * gpr=0 corresponds to r3, 2907 // * gpr=1 to r4, etc. 2908 // */ 2909 // char fpr; /* index into the array of 8 FPRs 2910 // * stored in the register save area 2911 // * fpr=0 corresponds to f1, 2912 // * fpr=1 to f2, etc. 
2913 // */ 2914 // char *overflow_arg_area; 2915 // /* location on stack that holds 2916 // * the next overflow argument 2917 // */ 2918 // char *reg_save_area; 2919 // /* where r3:r10 and f1:f8 (if saved) 2920 // * are stored 2921 // */ 2922 // } va_list[1]; 2923 2924 SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32); 2925 SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32); 2926 SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(), 2927 PtrVT); 2928 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 2929 PtrVT); 2930 2931 uint64_t FrameOffset = PtrVT.getSizeInBits()/8; 2932 SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT); 2933 2934 uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1; 2935 SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT); 2936 2937 uint64_t FPROffset = 1; 2938 SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT); 2939 2940 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2941 2942 // Store first byte : number of int regs 2943 SDValue firstStore = 2944 DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1), 2945 MachinePointerInfo(SV), MVT::i8); 2946 uint64_t nextOffset = FPROffset; 2947 SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1), 2948 ConstFPROffset); 2949 2950 // Store second byte : number of float regs 2951 SDValue secondStore = 2952 DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr, 2953 MachinePointerInfo(SV, nextOffset), MVT::i8); 2954 nextOffset += StackOffset; 2955 nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset); 2956 2957 // Store second word : arguments given on stack 2958 SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr, 2959 MachinePointerInfo(SV, nextOffset)); 2960 nextOffset += FrameOffset; 2961 nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset); 2962 2963 // Store third word : arguments given in registers 2964 return DAG.getStore(thirdStore, dl, FR, nextPtr, 2965 MachinePointerInfo(SV, nextOffset)); 2966 } 2967 2968 #include "PPCGenCallingConv.inc" 2969 2970 // Function whose sole purpose is to kill compiler warnings 2971 // stemming from unused functions included from PPCGenCallingConv.inc. 2972 CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const { 2973 return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS; 2974 } 2975 2976 bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, 2977 CCValAssign::LocInfo &LocInfo, 2978 ISD::ArgFlagsTy &ArgFlags, 2979 CCState &State) { 2980 return true; 2981 } 2982 2983 bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, 2984 MVT &LocVT, 2985 CCValAssign::LocInfo &LocInfo, 2986 ISD::ArgFlagsTy &ArgFlags, 2987 CCState &State) { 2988 static const MCPhysReg ArgRegs[] = { 2989 PPC::R3, PPC::R4, PPC::R5, PPC::R6, 2990 PPC::R7, PPC::R8, PPC::R9, PPC::R10, 2991 }; 2992 const unsigned NumArgRegs = array_lengthof(ArgRegs); 2993 2994 unsigned RegNum = State.getFirstUnallocated(ArgRegs); 2995 2996 // Skip one register if the first unallocated register has an even register 2997 // number and there are still argument registers available which have not been 2998 // allocated yet. RegNum is actually an index into ArgRegs, which means we 2999 // need to skip a register if RegNum is odd. 
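  // For example, if only r3 has been allocated so far (RegNum == 1), r4 is
  // marked as allocated here so that the following i64 argument is assigned
  // to the aligned r5:r6 pair instead of straddling r4 and r5.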
3000 if (RegNum != NumArgRegs && RegNum % 2 == 1) { 3001 State.AllocateReg(ArgRegs[RegNum]); 3002 } 3003 3004 // Always return false here, as this function only makes sure that the first 3005 // unallocated register has an odd register number and does not actually 3006 // allocate a register for the current argument. 3007 return false; 3008 } 3009 3010 bool 3011 llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT, 3012 MVT &LocVT, 3013 CCValAssign::LocInfo &LocInfo, 3014 ISD::ArgFlagsTy &ArgFlags, 3015 CCState &State) { 3016 static const MCPhysReg ArgRegs[] = { 3017 PPC::R3, PPC::R4, PPC::R5, PPC::R6, 3018 PPC::R7, PPC::R8, PPC::R9, PPC::R10, 3019 }; 3020 const unsigned NumArgRegs = array_lengthof(ArgRegs); 3021 3022 unsigned RegNum = State.getFirstUnallocated(ArgRegs); 3023 int RegsLeft = NumArgRegs - RegNum; 3024 3025 // Skip if there is not enough registers left for long double type (4 gpr regs 3026 // in soft float mode) and put long double argument on the stack. 3027 if (RegNum != NumArgRegs && RegsLeft < 4) { 3028 for (int i = 0; i < RegsLeft; i++) { 3029 State.AllocateReg(ArgRegs[RegNum + i]); 3030 } 3031 } 3032 3033 return false; 3034 } 3035 3036 bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, 3037 MVT &LocVT, 3038 CCValAssign::LocInfo &LocInfo, 3039 ISD::ArgFlagsTy &ArgFlags, 3040 CCState &State) { 3041 static const MCPhysReg ArgRegs[] = { 3042 PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, 3043 PPC::F8 3044 }; 3045 3046 const unsigned NumArgRegs = array_lengthof(ArgRegs); 3047 3048 unsigned RegNum = State.getFirstUnallocated(ArgRegs); 3049 3050 // If there is only one Floating-point register left we need to put both f64 3051 // values of a split ppc_fp128 value on the stack. 3052 if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) { 3053 State.AllocateReg(ArgRegs[RegNum]); 3054 } 3055 3056 // Always return false here, as this function only makes sure that the two f64 3057 // values a ppc_fp128 value is split into are both passed in registers or both 3058 // passed on the stack and does not actually allocate a register for the 3059 // current argument. 3060 return false; 3061 } 3062 3063 /// FPR - The set of FP registers that should be allocated for arguments, 3064 /// on Darwin. 3065 static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, 3066 PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, 3067 PPC::F11, PPC::F12, PPC::F13}; 3068 3069 /// QFPR - The set of QPX registers that should be allocated for arguments. 3070 static const MCPhysReg QFPR[] = { 3071 PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, 3072 PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13}; 3073 3074 /// CalculateStackSlotSize - Calculates the size reserved for this argument on 3075 /// the stack. 3076 static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, 3077 unsigned PtrByteSize) { 3078 unsigned ArgSize = ArgVT.getStoreSize(); 3079 if (Flags.isByVal()) 3080 ArgSize = Flags.getByValSize(); 3081 3082 // Round up to multiples of the pointer size, except for array members, 3083 // which are always packed. 3084 if (!Flags.isInConsecutiveRegs()) 3085 ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 3086 3087 return ArgSize; 3088 } 3089 3090 /// CalculateStackSlotAlignment - Calculates the alignment of this argument 3091 /// on the stack. 
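/// The default is the pointer size; Altivec vectors are padded to 16 bytes,
/// QPX vectors to 32 bytes, byval aggregates use their requested alignment,
/// and array members are aligned to their own store size (or to the full
/// original type when split across registers, except for ppcf128).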
3092 static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, 3093 ISD::ArgFlagsTy Flags, 3094 unsigned PtrByteSize) { 3095 unsigned Align = PtrByteSize; 3096 3097 // Altivec parameters are padded to a 16 byte boundary. 3098 if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || 3099 ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || 3100 ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || 3101 ArgVT == MVT::v1i128) 3102 Align = 16; 3103 // QPX vector types stored in double-precision are padded to a 32 byte 3104 // boundary. 3105 else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1) 3106 Align = 32; 3107 3108 // ByVal parameters are aligned as requested. 3109 if (Flags.isByVal()) { 3110 unsigned BVAlign = Flags.getByValAlign(); 3111 if (BVAlign > PtrByteSize) { 3112 if (BVAlign % PtrByteSize != 0) 3113 llvm_unreachable( 3114 "ByVal alignment is not a multiple of the pointer size"); 3115 3116 Align = BVAlign; 3117 } 3118 } 3119 3120 // Array members are always packed to their original alignment. 3121 if (Flags.isInConsecutiveRegs()) { 3122 // If the array member was split into multiple registers, the first 3123 // needs to be aligned to the size of the full type. (Except for 3124 // ppcf128, which is only aligned as its f64 components.) 3125 if (Flags.isSplit() && OrigVT != MVT::ppcf128) 3126 Align = OrigVT.getStoreSize(); 3127 else 3128 Align = ArgVT.getStoreSize(); 3129 } 3130 3131 return Align; 3132 } 3133 3134 /// CalculateStackSlotUsed - Return whether this argument will use its 3135 /// stack slot (instead of being passed in registers). ArgOffset, 3136 /// AvailableFPRs, and AvailableVRs must hold the current argument 3137 /// position, and will be updated to account for this argument. 3138 static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, 3139 ISD::ArgFlagsTy Flags, 3140 unsigned PtrByteSize, 3141 unsigned LinkageSize, 3142 unsigned ParamAreaSize, 3143 unsigned &ArgOffset, 3144 unsigned &AvailableFPRs, 3145 unsigned &AvailableVRs, bool HasQPX) { 3146 bool UseMemory = false; 3147 3148 // Respect alignment of argument on the stack. 3149 unsigned Align = 3150 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); 3151 ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; 3152 // If there's no space left in the argument save area, we must 3153 // use memory (this check also catches zero-sized arguments). 3154 if (ArgOffset >= LinkageSize + ParamAreaSize) 3155 UseMemory = true; 3156 3157 // Allocate argument on the stack. 3158 ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); 3159 if (Flags.isInConsecutiveRegsLast()) 3160 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 3161 // If we overran the argument save area, we must use memory 3162 // (this check catches arguments passed partially in memory) 3163 if (ArgOffset > LinkageSize + ParamAreaSize) 3164 UseMemory = true; 3165 3166 // However, if the argument is actually passed in an FPR or a VR, 3167 // we don't use memory after all. 3168 if (!Flags.isByVal()) { 3169 if (ArgVT == MVT::f32 || ArgVT == MVT::f64 || 3170 // QPX registers overlap with the scalar FP registers. 
3171 (HasQPX && (ArgVT == MVT::v4f32 || 3172 ArgVT == MVT::v4f64 || 3173 ArgVT == MVT::v4i1))) 3174 if (AvailableFPRs > 0) { 3175 --AvailableFPRs; 3176 return false; 3177 } 3178 if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || 3179 ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || 3180 ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || 3181 ArgVT == MVT::v1i128) 3182 if (AvailableVRs > 0) { 3183 --AvailableVRs; 3184 return false; 3185 } 3186 } 3187 3188 return UseMemory; 3189 } 3190 3191 /// EnsureStackAlignment - Round stack frame size up from NumBytes to 3192 /// ensure minimum alignment required for target. 3193 static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering, 3194 unsigned NumBytes) { 3195 unsigned TargetAlign = Lowering->getStackAlignment(); 3196 unsigned AlignMask = TargetAlign - 1; 3197 NumBytes = (NumBytes + AlignMask) & ~AlignMask; 3198 return NumBytes; 3199 } 3200 3201 SDValue PPCTargetLowering::LowerFormalArguments( 3202 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3203 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3204 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3205 if (Subtarget.isSVR4ABI()) { 3206 if (Subtarget.isPPC64()) 3207 return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, 3208 dl, DAG, InVals); 3209 else 3210 return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, 3211 dl, DAG, InVals); 3212 } else { 3213 return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, 3214 dl, DAG, InVals); 3215 } 3216 } 3217 3218 SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( 3219 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3220 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3221 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3222 3223 // 32-bit SVR4 ABI Stack Frame Layout: 3224 // +-----------------------------------+ 3225 // +--> | Back chain | 3226 // | +-----------------------------------+ 3227 // | | Floating-point register save area | 3228 // | +-----------------------------------+ 3229 // | | General register save area | 3230 // | +-----------------------------------+ 3231 // | | CR save word | 3232 // | +-----------------------------------+ 3233 // | | VRSAVE save word | 3234 // | +-----------------------------------+ 3235 // | | Alignment padding | 3236 // | +-----------------------------------+ 3237 // | | Vector register save area | 3238 // | +-----------------------------------+ 3239 // | | Local variable space | 3240 // | +-----------------------------------+ 3241 // | | Parameter list area | 3242 // | +-----------------------------------+ 3243 // | | LR save word | 3244 // | +-----------------------------------+ 3245 // SP--> +--- | Back chain | 3246 // +-----------------------------------+ 3247 // 3248 // Specifications: 3249 // System V Application Binary Interface PowerPC Processor Supplement 3250 // AltiVec Technology Programming Interface Manual 3251 3252 MachineFunction &MF = DAG.getMachineFunction(); 3253 MachineFrameInfo &MFI = MF.getFrameInfo(); 3254 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 3255 3256 EVT PtrVT = getPointerTy(MF.getDataLayout()); 3257 // Potential tail calls could cause overwriting of argument stack slots. 3258 bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && 3259 (CallConv == CallingConv::Fast)); 3260 unsigned PtrByteSize = 4; 3261 3262 // Assign locations to all of the incoming arguments. 
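  // Scalar arguments are assigned here with CC_PPC32_SVR4; byval aggregates
  // are handled by a second pass with CC_PPC32_SVR4_ByVal further below.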
3263 SmallVector<CCValAssign, 16> ArgLocs; 3264 PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 3265 *DAG.getContext()); 3266 3267 // Reserve space for the linkage area on the stack. 3268 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); 3269 CCInfo.AllocateStack(LinkageSize, PtrByteSize); 3270 if (useSoftFloat()) 3271 CCInfo.PreAnalyzeFormalArguments(Ins); 3272 3273 CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); 3274 CCInfo.clearWasPPCF128(); 3275 3276 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3277 CCValAssign &VA = ArgLocs[i]; 3278 3279 // Arguments stored in registers. 3280 if (VA.isRegLoc()) { 3281 const TargetRegisterClass *RC; 3282 EVT ValVT = VA.getValVT(); 3283 3284 switch (ValVT.getSimpleVT().SimpleTy) { 3285 default: 3286 llvm_unreachable("ValVT not supported by formal arguments Lowering"); 3287 case MVT::i1: 3288 case MVT::i32: 3289 RC = &PPC::GPRCRegClass; 3290 break; 3291 case MVT::f32: 3292 if (Subtarget.hasP8Vector()) 3293 RC = &PPC::VSSRCRegClass; 3294 else 3295 RC = &PPC::F4RCRegClass; 3296 break; 3297 case MVT::f64: 3298 if (Subtarget.hasVSX()) 3299 RC = &PPC::VSFRCRegClass; 3300 else 3301 RC = &PPC::F8RCRegClass; 3302 break; 3303 case MVT::v16i8: 3304 case MVT::v8i16: 3305 case MVT::v4i32: 3306 RC = &PPC::VRRCRegClass; 3307 break; 3308 case MVT::v4f32: 3309 RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass; 3310 break; 3311 case MVT::v2f64: 3312 case MVT::v2i64: 3313 RC = &PPC::VRRCRegClass; 3314 break; 3315 case MVT::v4f64: 3316 RC = &PPC::QFRCRegClass; 3317 break; 3318 case MVT::v4i1: 3319 RC = &PPC::QBRCRegClass; 3320 break; 3321 } 3322 3323 // Transform the arguments stored in physical registers into virtual ones. 3324 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3325 SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, 3326 ValVT == MVT::i1 ? MVT::i32 : ValVT); 3327 3328 if (ValVT == MVT::i1) 3329 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue); 3330 3331 InVals.push_back(ArgValue); 3332 } else { 3333 // Argument stored in memory. 3334 assert(VA.isMemLoc()); 3335 3336 unsigned ArgSize = VA.getLocVT().getStoreSize(); 3337 int FI = MFI.CreateFixedObject(ArgSize, VA.getLocMemOffset(), 3338 isImmutable); 3339 3340 // Create load nodes to retrieve arguments from the stack. 3341 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3342 InVals.push_back( 3343 DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo())); 3344 } 3345 } 3346 3347 // Assign locations to all of the incoming aggregate by value arguments. 3348 // Aggregates passed by value are stored in the local variable space of the 3349 // caller's stack frame, right above the parameter list area. 3350 SmallVector<CCValAssign, 16> ByValArgLocs; 3351 CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), 3352 ByValArgLocs, *DAG.getContext()); 3353 3354 // Reserve stack space for the allocations in CCInfo. 3355 CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); 3356 3357 CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal); 3358 3359 // Area that is at least reserved in the caller of this function. 3360 unsigned MinReservedArea = CCByValInfo.getNextStackOffset(); 3361 MinReservedArea = std::max(MinReservedArea, LinkageSize); 3362 3363 // Set the size that is at least reserved in caller of this function. Tail 3364 // call optimized function's reserved stack space needs to be aligned so that 3365 // taking the difference between two stack areas will result in an aligned 3366 // stack. 
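  // For example, with a 16-byte target stack alignment a 20-byte reserved
  // area is rounded up to 32 bytes.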
3367 MinReservedArea = 3368 EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); 3369 FuncInfo->setMinReservedArea(MinReservedArea); 3370 3371 SmallVector<SDValue, 8> MemOps; 3372 3373 // If the function takes variable number of arguments, make a frame index for 3374 // the start of the first vararg value... for expansion of llvm.va_start. 3375 if (isVarArg) { 3376 static const MCPhysReg GPArgRegs[] = { 3377 PPC::R3, PPC::R4, PPC::R5, PPC::R6, 3378 PPC::R7, PPC::R8, PPC::R9, PPC::R10, 3379 }; 3380 const unsigned NumGPArgRegs = array_lengthof(GPArgRegs); 3381 3382 static const MCPhysReg FPArgRegs[] = { 3383 PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, 3384 PPC::F8 3385 }; 3386 unsigned NumFPArgRegs = array_lengthof(FPArgRegs); 3387 3388 if (useSoftFloat()) 3389 NumFPArgRegs = 0; 3390 3391 FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs)); 3392 FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs)); 3393 3394 // Make room for NumGPArgRegs and NumFPArgRegs. 3395 int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 + 3396 NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8; 3397 3398 FuncInfo->setVarArgsStackOffset( 3399 MFI.CreateFixedObject(PtrVT.getSizeInBits()/8, 3400 CCInfo.getNextStackOffset(), true)); 3401 3402 FuncInfo->setVarArgsFrameIndex(MFI.CreateStackObject(Depth, 8, false)); 3403 SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 3404 3405 // The fixed integer arguments of a variadic function are stored to the 3406 // VarArgsFrameIndex on the stack so that they may be loaded by 3407 // dereferencing the result of va_next. 3408 for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) { 3409 // Get an existing live-in vreg, or add a new one. 3410 unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]); 3411 if (!VReg) 3412 VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass); 3413 3414 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 3415 SDValue Store = 3416 DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); 3417 MemOps.push_back(Store); 3418 // Increment the address by four for the next argument to store 3419 SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT); 3420 FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); 3421 } 3422 3423 // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6 3424 // is set. 3425 // The double arguments are stored to the VarArgsFrameIndex 3426 // on the stack. 3427 for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) { 3428 // Get an existing live-in vreg, or add a new one. 3429 unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]); 3430 if (!VReg) 3431 VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass); 3432 3433 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64); 3434 SDValue Store = 3435 DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); 3436 MemOps.push_back(Store); 3437 // Increment the address by eight for the next argument to store 3438 SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl, 3439 PtrVT); 3440 FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); 3441 } 3442 } 3443 3444 if (!MemOps.empty()) 3445 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 3446 3447 return Chain; 3448 } 3449 3450 // PPC64 passes i8, i16, and i32 values in i64 registers. Promote 3451 // value to MVT::i64 and then truncate to the correct register size. 
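// The AssertSext/AssertZext node records that the incoming register value is
// already sign-/zero-extended from ObjectVT, so later DAG combines can drop
// redundant re-extensions of the truncated result.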
3452 SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, 3453 EVT ObjectVT, SelectionDAG &DAG, 3454 SDValue ArgVal, 3455 const SDLoc &dl) const { 3456 if (Flags.isSExt()) 3457 ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal, 3458 DAG.getValueType(ObjectVT)); 3459 else if (Flags.isZExt()) 3460 ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal, 3461 DAG.getValueType(ObjectVT)); 3462 3463 return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal); 3464 } 3465 3466 SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( 3467 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3468 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3469 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3470 // TODO: add description of PPC stack frame format, or at least some docs. 3471 // 3472 bool isELFv2ABI = Subtarget.isELFv2ABI(); 3473 bool isLittleEndian = Subtarget.isLittleEndian(); 3474 MachineFunction &MF = DAG.getMachineFunction(); 3475 MachineFrameInfo &MFI = MF.getFrameInfo(); 3476 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 3477 3478 assert(!(CallConv == CallingConv::Fast && isVarArg) && 3479 "fastcc not supported on varargs functions"); 3480 3481 EVT PtrVT = getPointerTy(MF.getDataLayout()); 3482 // Potential tail calls could cause overwriting of argument stack slots. 3483 bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && 3484 (CallConv == CallingConv::Fast)); 3485 unsigned PtrByteSize = 8; 3486 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); 3487 3488 static const MCPhysReg GPR[] = { 3489 PPC::X3, PPC::X4, PPC::X5, PPC::X6, 3490 PPC::X7, PPC::X8, PPC::X9, PPC::X10, 3491 }; 3492 static const MCPhysReg VR[] = { 3493 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, 3494 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 3495 }; 3496 3497 const unsigned Num_GPR_Regs = array_lengthof(GPR); 3498 const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; 3499 const unsigned Num_VR_Regs = array_lengthof(VR); 3500 const unsigned Num_QFPR_Regs = Num_FPR_Regs; 3501 3502 // Do a first pass over the arguments to determine whether the ABI 3503 // guarantees that our caller has allocated the parameter save area 3504 // on its stack frame. In the ELFv1 ABI, this is always the case; 3505 // in the ELFv2 ABI, it is true if this is a vararg function or if 3506 // any parameter is located in a stack slot. 3507 3508 bool HasParameterArea = !isELFv2ABI || isVarArg; 3509 unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize; 3510 unsigned NumBytes = LinkageSize; 3511 unsigned AvailableFPRs = Num_FPR_Regs; 3512 unsigned AvailableVRs = Num_VR_Regs; 3513 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 3514 if (Ins[i].Flags.isNest()) 3515 continue; 3516 3517 if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, 3518 PtrByteSize, LinkageSize, ParamAreaSize, 3519 NumBytes, AvailableFPRs, AvailableVRs, 3520 Subtarget.hasQPX())) 3521 HasParameterArea = true; 3522 } 3523 3524 // Add DAG nodes to load the arguments or copy them out of registers. On 3525 // entry to a function on PPC, the arguments start after the linkage area, 3526 // although the first ones are often in registers. 
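  // Except under the fast calling convention, ArgOffset tracks each
  // argument's slot in the parameter save area even when the value arrives
  // in a register; varargs handling and arguments passed partially in memory
  // rely on that stack position.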
3527 3528 unsigned ArgOffset = LinkageSize; 3529 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; 3530 unsigned &QFPR_idx = FPR_idx; 3531 SmallVector<SDValue, 8> MemOps; 3532 Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); 3533 unsigned CurArgIdx = 0; 3534 for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { 3535 SDValue ArgVal; 3536 bool needsLoad = false; 3537 EVT ObjectVT = Ins[ArgNo].VT; 3538 EVT OrigVT = Ins[ArgNo].ArgVT; 3539 unsigned ObjSize = ObjectVT.getStoreSize(); 3540 unsigned ArgSize = ObjSize; 3541 ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; 3542 if (Ins[ArgNo].isOrigArg()) { 3543 std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); 3544 CurArgIdx = Ins[ArgNo].getOrigArgIndex(); 3545 } 3546 // We re-align the argument offset for each argument, except when using the 3547 // fast calling convention, when we need to make sure we do that only when 3548 // we'll actually use a stack slot. 3549 unsigned CurArgOffset, Align; 3550 auto ComputeArgOffset = [&]() { 3551 /* Respect alignment of argument on the stack. */ 3552 Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize); 3553 ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; 3554 CurArgOffset = ArgOffset; 3555 }; 3556 3557 if (CallConv != CallingConv::Fast) { 3558 ComputeArgOffset(); 3559 3560 /* Compute GPR index associated with argument offset. */ 3561 GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; 3562 GPR_idx = std::min(GPR_idx, Num_GPR_Regs); 3563 } 3564 3565 // FIXME the codegen can be much improved in some cases. 3566 // We do not have to keep everything in memory. 3567 if (Flags.isByVal()) { 3568 assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); 3569 3570 if (CallConv == CallingConv::Fast) 3571 ComputeArgOffset(); 3572 3573 // ObjSize is the true size, ArgSize rounded up to multiple of registers. 3574 ObjSize = Flags.getByValSize(); 3575 ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 3576 // Empty aggregate parameters do not take up registers. Examples: 3577 // struct { } a; 3578 // union { } b; 3579 // int c[0]; 3580 // etc. However, we have to provide a place-holder in InVals, so 3581 // pretend we have an 8-byte item at the current address for that 3582 // purpose. 3583 if (!ObjSize) { 3584 int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true); 3585 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3586 InVals.push_back(FIN); 3587 continue; 3588 } 3589 3590 // Create a stack object covering all stack doublewords occupied 3591 // by the argument. If the argument is (fully or partially) on 3592 // the stack, or if the argument is fully in registers but the 3593 // caller has allocated the parameter save anyway, we can refer 3594 // directly to the caller's stack frame. Otherwise, create a 3595 // local copy in our own frame. 3596 int FI; 3597 if (HasParameterArea || 3598 ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize) 3599 FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true); 3600 else 3601 FI = MFI.CreateStackObject(ArgSize, Align, false); 3602 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3603 3604 // Handle aggregates smaller than 8 bytes. 3605 if (ObjSize < PtrByteSize) { 3606 // The value of the object is its address, which differs from the 3607 // address of the enclosing doubleword on big-endian systems. 
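        // For example, a 3-byte aggregate occupies the last three bytes of
        // its doubleword slot on big-endian targets, so its address is
        // FIN + 5.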
3608 SDValue Arg = FIN; 3609 if (!isLittleEndian) { 3610 SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT); 3611 Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff); 3612 } 3613 InVals.push_back(Arg); 3614 3615 if (GPR_idx != Num_GPR_Regs) { 3616 unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); 3617 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 3618 SDValue Store; 3619 3620 if (ObjSize==1 || ObjSize==2 || ObjSize==4) { 3621 EVT ObjType = (ObjSize == 1 ? MVT::i8 : 3622 (ObjSize == 2 ? MVT::i16 : MVT::i32)); 3623 Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg, 3624 MachinePointerInfo(&*FuncArg), ObjType); 3625 } else { 3626 // For sizes that don't fit a truncating store (3, 5, 6, 7), 3627 // store the whole register as-is to the parameter save area 3628 // slot. 3629 Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, 3630 MachinePointerInfo(&*FuncArg)); 3631 } 3632 3633 MemOps.push_back(Store); 3634 } 3635 // Whether we copied from a register or not, advance the offset 3636 // into the parameter save area by a full doubleword. 3637 ArgOffset += PtrByteSize; 3638 continue; 3639 } 3640 3641 // The value of the object is its address, which is the address of 3642 // its first stack doubleword. 3643 InVals.push_back(FIN); 3644 3645 // Store whatever pieces of the object are in registers to memory. 3646 for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { 3647 if (GPR_idx == Num_GPR_Regs) 3648 break; 3649 3650 unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 3651 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 3652 SDValue Addr = FIN; 3653 if (j) { 3654 SDValue Off = DAG.getConstant(j, dl, PtrVT); 3655 Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); 3656 } 3657 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr, 3658 MachinePointerInfo(&*FuncArg, j)); 3659 MemOps.push_back(Store); 3660 ++GPR_idx; 3661 } 3662 ArgOffset += ArgSize; 3663 continue; 3664 } 3665 3666 switch (ObjectVT.getSimpleVT().SimpleTy) { 3667 default: llvm_unreachable("Unhandled argument type!"); 3668 case MVT::i1: 3669 case MVT::i32: 3670 case MVT::i64: 3671 if (Flags.isNest()) { 3672 // The 'nest' parameter, if any, is passed in R11. 3673 unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass); 3674 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 3675 3676 if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) 3677 ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); 3678 3679 break; 3680 } 3681 3682 // These can be scalar arguments or elements of an integer array type 3683 // passed directly. Clang may use those instead of "byval" aggregate 3684 // types to avoid forcing arguments to memory unnecessarily. 3685 if (GPR_idx != Num_GPR_Regs) { 3686 unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); 3687 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 3688 3689 if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) 3690 // PPC64 passes i8, i16, and i32 values in i64 registers. Promote 3691 // value to MVT::i64 and then truncate to the correct register size. 
3692 ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); 3693 } else { 3694 if (CallConv == CallingConv::Fast) 3695 ComputeArgOffset(); 3696 3697 needsLoad = true; 3698 ArgSize = PtrByteSize; 3699 } 3700 if (CallConv != CallingConv::Fast || needsLoad) 3701 ArgOffset += 8; 3702 break; 3703 3704 case MVT::f32: 3705 case MVT::f64: 3706 // These can be scalar arguments or elements of a float array type 3707 // passed directly. The latter are used to implement ELFv2 homogenous 3708 // float aggregates. 3709 if (FPR_idx != Num_FPR_Regs) { 3710 unsigned VReg; 3711 3712 if (ObjectVT == MVT::f32) 3713 VReg = MF.addLiveIn(FPR[FPR_idx], 3714 Subtarget.hasP8Vector() 3715 ? &PPC::VSSRCRegClass 3716 : &PPC::F4RCRegClass); 3717 else 3718 VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() 3719 ? &PPC::VSFRCRegClass 3720 : &PPC::F8RCRegClass); 3721 3722 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 3723 ++FPR_idx; 3724 } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) { 3725 // FIXME: We may want to re-enable this for CallingConv::Fast on the P8 3726 // once we support fp <-> gpr moves. 3727 3728 // This can only ever happen in the presence of f32 array types, 3729 // since otherwise we never run out of FPRs before running out 3730 // of GPRs. 3731 unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); 3732 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 3733 3734 if (ObjectVT == MVT::f32) { 3735 if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0)) 3736 ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal, 3737 DAG.getConstant(32, dl, MVT::i32)); 3738 ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); 3739 } 3740 3741 ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal); 3742 } else { 3743 if (CallConv == CallingConv::Fast) 3744 ComputeArgOffset(); 3745 3746 needsLoad = true; 3747 } 3748 3749 // When passing an array of floats, the array occupies consecutive 3750 // space in the argument area; only round up to the next doubleword 3751 // at the end of the array. Otherwise, each float takes 8 bytes. 3752 if (CallConv != CallingConv::Fast || needsLoad) { 3753 ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize; 3754 ArgOffset += ArgSize; 3755 if (Flags.isInConsecutiveRegsLast()) 3756 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 3757 } 3758 break; 3759 case MVT::v4f32: 3760 case MVT::v4i32: 3761 case MVT::v8i16: 3762 case MVT::v16i8: 3763 case MVT::v2f64: 3764 case MVT::v2i64: 3765 case MVT::v1i128: 3766 if (!Subtarget.hasQPX()) { 3767 // These can be scalar arguments or elements of a vector array type 3768 // passed directly. The latter are used to implement ELFv2 homogenous 3769 // vector aggregates. 3770 if (VR_idx != Num_VR_Regs) { 3771 unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); 3772 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 3773 ++VR_idx; 3774 } else { 3775 if (CallConv == CallingConv::Fast) 3776 ComputeArgOffset(); 3777 3778 needsLoad = true; 3779 } 3780 if (CallConv != CallingConv::Fast || needsLoad) 3781 ArgOffset += 16; 3782 break; 3783 } // not QPX 3784 3785 assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 && 3786 "Invalid QPX parameter type"); 3787 /* fall through */ 3788 3789 case MVT::v4f64: 3790 case MVT::v4i1: 3791 // QPX vectors are treated like their scalar floating-point subregisters 3792 // (except that they're larger). 3793 unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 
16 : 32; 3794 if (QFPR_idx != Num_QFPR_Regs) { 3795 const TargetRegisterClass *RC; 3796 switch (ObjectVT.getSimpleVT().SimpleTy) { 3797 case MVT::v4f64: RC = &PPC::QFRCRegClass; break; 3798 case MVT::v4f32: RC = &PPC::QSRCRegClass; break; 3799 default: RC = &PPC::QBRCRegClass; break; 3800 } 3801 3802 unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC); 3803 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 3804 ++QFPR_idx; 3805 } else { 3806 if (CallConv == CallingConv::Fast) 3807 ComputeArgOffset(); 3808 needsLoad = true; 3809 } 3810 if (CallConv != CallingConv::Fast || needsLoad) 3811 ArgOffset += Sz; 3812 break; 3813 } 3814 3815 // We need to load the argument to a virtual register if we determined 3816 // above that we ran out of physical registers of the appropriate type. 3817 if (needsLoad) { 3818 if (ObjSize < ArgSize && !isLittleEndian) 3819 CurArgOffset += ArgSize - ObjSize; 3820 int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable); 3821 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3822 ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo()); 3823 } 3824 3825 InVals.push_back(ArgVal); 3826 } 3827 3828 // Area that is at least reserved in the caller of this function. 3829 unsigned MinReservedArea; 3830 if (HasParameterArea) 3831 MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize); 3832 else 3833 MinReservedArea = LinkageSize; 3834 3835 // Set the size that is at least reserved in caller of this function. Tail 3836 // call optimized functions' reserved stack space needs to be aligned so that 3837 // taking the difference between two stack areas will result in an aligned 3838 // stack. 3839 MinReservedArea = 3840 EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); 3841 FuncInfo->setMinReservedArea(MinReservedArea); 3842 3843 // If the function takes variable number of arguments, make a frame index for 3844 // the start of the first vararg value... for expansion of llvm.va_start. 3845 if (isVarArg) { 3846 int Depth = ArgOffset; 3847 3848 FuncInfo->setVarArgsFrameIndex( 3849 MFI.CreateFixedObject(PtrByteSize, Depth, true)); 3850 SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 3851 3852 // If this function is vararg, store any remaining integer argument regs 3853 // to their spots on the stack so that they may be loaded by dereferencing 3854 // the result of va_next. 3855 for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; 3856 GPR_idx < Num_GPR_Regs; ++GPR_idx) { 3857 unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 3858 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 3859 SDValue Store = 3860 DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); 3861 MemOps.push_back(Store); 3862 // Increment the address by four for the next argument to store 3863 SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT); 3864 FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); 3865 } 3866 } 3867 3868 if (!MemOps.empty()) 3869 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 3870 3871 return Chain; 3872 } 3873 3874 SDValue PPCTargetLowering::LowerFormalArguments_Darwin( 3875 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3876 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3877 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3878 // TODO: add description of PPC stack frame format, or at least some docs. 
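  // (Roughly: a fixed linkage area at the stack pointer followed by the
  // parameter save area; in the Darwin ABI, integer and floating-point
  // arguments reserve parameter-area space even when passed in registers,
  // while vector arguments only do so for varargs functions -- see the
  // ArgOffset updates below.)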
3879 // 3880 MachineFunction &MF = DAG.getMachineFunction(); 3881 MachineFrameInfo &MFI = MF.getFrameInfo(); 3882 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 3883 3884 EVT PtrVT = getPointerTy(MF.getDataLayout()); 3885 bool isPPC64 = PtrVT == MVT::i64; 3886 // Potential tail calls could cause overwriting of argument stack slots. 3887 bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && 3888 (CallConv == CallingConv::Fast)); 3889 unsigned PtrByteSize = isPPC64 ? 8 : 4; 3890 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); 3891 unsigned ArgOffset = LinkageSize; 3892 // Area that is at least reserved in caller of this function. 3893 unsigned MinReservedArea = ArgOffset; 3894 3895 static const MCPhysReg GPR_32[] = { // 32-bit registers. 3896 PPC::R3, PPC::R4, PPC::R5, PPC::R6, 3897 PPC::R7, PPC::R8, PPC::R9, PPC::R10, 3898 }; 3899 static const MCPhysReg GPR_64[] = { // 64-bit registers. 3900 PPC::X3, PPC::X4, PPC::X5, PPC::X6, 3901 PPC::X7, PPC::X8, PPC::X9, PPC::X10, 3902 }; 3903 static const MCPhysReg VR[] = { 3904 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, 3905 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 3906 }; 3907 3908 const unsigned Num_GPR_Regs = array_lengthof(GPR_32); 3909 const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; 3910 const unsigned Num_VR_Regs = array_lengthof( VR); 3911 3912 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; 3913 3914 const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; 3915 3916 // In 32-bit non-varargs functions, the stack space for vectors is after the 3917 // stack space for non-vectors. We do not use this space unless we have 3918 // too many vectors to fit in registers, something that only occurs in 3919 // constructed examples:), but we have to walk the arglist to figure 3920 // that out...for the pathological case, compute VecArgOffset as the 3921 // start of the vector parameter area. Computing VecArgOffset is the 3922 // entire point of the following loop. 3923 unsigned VecArgOffset = ArgOffset; 3924 if (!isVarArg && !isPPC64) { 3925 for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; 3926 ++ArgNo) { 3927 EVT ObjectVT = Ins[ArgNo].VT; 3928 ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; 3929 3930 if (Flags.isByVal()) { 3931 // ObjSize is the true size, ArgSize rounded up to multiple of regs. 3932 unsigned ObjSize = Flags.getByValSize(); 3933 unsigned ArgSize = 3934 ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 3935 VecArgOffset += ArgSize; 3936 continue; 3937 } 3938 3939 switch(ObjectVT.getSimpleVT().SimpleTy) { 3940 default: llvm_unreachable("Unhandled argument type!"); 3941 case MVT::i1: 3942 case MVT::i32: 3943 case MVT::f32: 3944 VecArgOffset += 4; 3945 break; 3946 case MVT::i64: // PPC64 3947 case MVT::f64: 3948 // FIXME: We are guaranteed to be !isPPC64 at this point. 3949 // Does MVT::i64 apply? 3950 VecArgOffset += 8; 3951 break; 3952 case MVT::v4f32: 3953 case MVT::v4i32: 3954 case MVT::v8i16: 3955 case MVT::v16i8: 3956 // Nothing to do, we're only looking at Nonvector args here. 3957 break; 3958 } 3959 } 3960 } 3961 // We've found where the vector parameter area in memory is. Skip the 3962 // first 12 parameters; these don't use that memory. 3963 VecArgOffset = ((VecArgOffset+15)/16)*16; 3964 VecArgOffset += 12*16; 3965 3966 // Add DAG nodes to load the arguments or copy them out of registers. On 3967 // entry to a function on PPC, the arguments start after the linkage area, 3968 // although the first ones are often in registers. 
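  // nAltivecParamsAtEnd counts vector arguments that, for 32-bit non-varargs
  // functions, are placed after all non-vector arguments (at VecArgOffset,
  // computed above) and therefore only enlarge the reserved area at the end.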
3969 3970 SmallVector<SDValue, 8> MemOps; 3971 unsigned nAltivecParamsAtEnd = 0; 3972 Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); 3973 unsigned CurArgIdx = 0; 3974 for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { 3975 SDValue ArgVal; 3976 bool needsLoad = false; 3977 EVT ObjectVT = Ins[ArgNo].VT; 3978 unsigned ObjSize = ObjectVT.getSizeInBits()/8; 3979 unsigned ArgSize = ObjSize; 3980 ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; 3981 if (Ins[ArgNo].isOrigArg()) { 3982 std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); 3983 CurArgIdx = Ins[ArgNo].getOrigArgIndex(); 3984 } 3985 unsigned CurArgOffset = ArgOffset; 3986 3987 // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary. 3988 if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 || 3989 ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) { 3990 if (isVarArg || isPPC64) { 3991 MinReservedArea = ((MinReservedArea+15)/16)*16; 3992 MinReservedArea += CalculateStackSlotSize(ObjectVT, 3993 Flags, 3994 PtrByteSize); 3995 } else nAltivecParamsAtEnd++; 3996 } else 3997 // Calculate min reserved area. 3998 MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT, 3999 Flags, 4000 PtrByteSize); 4001 4002 // FIXME the codegen can be much improved in some cases. 4003 // We do not have to keep everything in memory. 4004 if (Flags.isByVal()) { 4005 assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); 4006 4007 // ObjSize is the true size, ArgSize rounded up to multiple of registers. 4008 ObjSize = Flags.getByValSize(); 4009 ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 4010 // Objects of size 1 and 2 are right justified, everything else is 4011 // left justified. This means the memory address is adjusted forwards. 4012 if (ObjSize==1 || ObjSize==2) { 4013 CurArgOffset = CurArgOffset + (4 - ObjSize); 4014 } 4015 // The value of the object is its address. 4016 int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, false, true); 4017 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4018 InVals.push_back(FIN); 4019 if (ObjSize==1 || ObjSize==2) { 4020 if (GPR_idx != Num_GPR_Regs) { 4021 unsigned VReg; 4022 if (isPPC64) 4023 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 4024 else 4025 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 4026 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 4027 EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16; 4028 SDValue Store = 4029 DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, 4030 MachinePointerInfo(&*FuncArg), ObjType); 4031 MemOps.push_back(Store); 4032 ++GPR_idx; 4033 } 4034 4035 ArgOffset += PtrByteSize; 4036 4037 continue; 4038 } 4039 for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { 4040 // Store whatever pieces of the object are in registers 4041 // to memory. ArgOffset will be the address of the beginning 4042 // of the object. 
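        // Each iteration spills one pointer-sized piece from the next GPR;
        // once the GPRs are exhausted, the remainder of the aggregate already
        // lives in the caller's parameter area, so only ArgOffset is advanced.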
4043 if (GPR_idx != Num_GPR_Regs) { 4044 unsigned VReg; 4045 if (isPPC64) 4046 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 4047 else 4048 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 4049 int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true); 4050 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4051 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 4052 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, 4053 MachinePointerInfo(&*FuncArg, j)); 4054 MemOps.push_back(Store); 4055 ++GPR_idx; 4056 ArgOffset += PtrByteSize; 4057 } else { 4058 ArgOffset += ArgSize - (ArgOffset-CurArgOffset); 4059 break; 4060 } 4061 } 4062 continue; 4063 } 4064 4065 switch (ObjectVT.getSimpleVT().SimpleTy) { 4066 default: llvm_unreachable("Unhandled argument type!"); 4067 case MVT::i1: 4068 case MVT::i32: 4069 if (!isPPC64) { 4070 if (GPR_idx != Num_GPR_Regs) { 4071 unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 4072 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); 4073 4074 if (ObjectVT == MVT::i1) 4075 ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal); 4076 4077 ++GPR_idx; 4078 } else { 4079 needsLoad = true; 4080 ArgSize = PtrByteSize; 4081 } 4082 // All int arguments reserve stack space in the Darwin ABI. 4083 ArgOffset += PtrByteSize; 4084 break; 4085 } 4086 LLVM_FALLTHROUGH; 4087 case MVT::i64: // PPC64 4088 if (GPR_idx != Num_GPR_Regs) { 4089 unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 4090 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 4091 4092 if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) 4093 // PPC64 passes i8, i16, and i32 values in i64 registers. Promote 4094 // value to MVT::i64 and then truncate to the correct register size. 4095 ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); 4096 4097 ++GPR_idx; 4098 } else { 4099 needsLoad = true; 4100 ArgSize = PtrByteSize; 4101 } 4102 // All int arguments reserve stack space in the Darwin ABI. 4103 ArgOffset += 8; 4104 break; 4105 4106 case MVT::f32: 4107 case MVT::f64: 4108 // Every 4 bytes of argument space consumes one of the GPRs available for 4109 // argument passing. 4110 if (GPR_idx != Num_GPR_Regs) { 4111 ++GPR_idx; 4112 if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64) 4113 ++GPR_idx; 4114 } 4115 if (FPR_idx != Num_FPR_Regs) { 4116 unsigned VReg; 4117 4118 if (ObjectVT == MVT::f32) 4119 VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass); 4120 else 4121 VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass); 4122 4123 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 4124 ++FPR_idx; 4125 } else { 4126 needsLoad = true; 4127 } 4128 4129 // All FP arguments reserve stack space in the Darwin ABI. 4130 ArgOffset += isPPC64 ? 8 : ObjSize; 4131 break; 4132 case MVT::v4f32: 4133 case MVT::v4i32: 4134 case MVT::v8i16: 4135 case MVT::v16i8: 4136 // Note that vector arguments in registers don't reserve stack space, 4137 // except in varargs functions. 4138 if (VR_idx != Num_VR_Regs) { 4139 unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); 4140 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 4141 if (isVarArg) { 4142 while ((ArgOffset % 16) != 0) { 4143 ArgOffset += PtrByteSize; 4144 if (GPR_idx != Num_GPR_Regs) 4145 GPR_idx++; 4146 } 4147 ArgOffset += 16; 4148 GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64? 4149 } 4150 ++VR_idx; 4151 } else { 4152 if (!isVarArg && !isPPC64) { 4153 // Vectors go after all the nonvectors. 
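          // CurArgOffset switches to the vector parameter area computed
          // earlier; VecArgOffset starts 12*16 bytes past its aligned base,
          // skipping the slots for the first twelve vector parameters, which
          // are passed in VRs.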
4154 CurArgOffset = VecArgOffset; 4155 VecArgOffset += 16; 4156 } else { 4157 // Vectors are aligned. 4158 ArgOffset = ((ArgOffset+15)/16)*16; 4159 CurArgOffset = ArgOffset; 4160 ArgOffset += 16; 4161 } 4162 needsLoad = true; 4163 } 4164 break; 4165 } 4166 4167 // We need to load the argument to a virtual register if we determined above 4168 // that we ran out of physical registers of the appropriate type. 4169 if (needsLoad) { 4170 int FI = MFI.CreateFixedObject(ObjSize, 4171 CurArgOffset + (ArgSize - ObjSize), 4172 isImmutable); 4173 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4174 ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo()); 4175 } 4176 4177 InVals.push_back(ArgVal); 4178 } 4179 4180 // Allow for Altivec parameters at the end, if needed. 4181 if (nAltivecParamsAtEnd) { 4182 MinReservedArea = ((MinReservedArea+15)/16)*16; 4183 MinReservedArea += 16*nAltivecParamsAtEnd; 4184 } 4185 4186 // Area that is at least reserved in the caller of this function. 4187 MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize); 4188 4189 // Set the size that is at least reserved in caller of this function. Tail 4190 // call optimized functions' reserved stack space needs to be aligned so that 4191 // taking the difference between two stack areas will result in an aligned 4192 // stack. 4193 MinReservedArea = 4194 EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); 4195 FuncInfo->setMinReservedArea(MinReservedArea); 4196 4197 // If the function takes variable number of arguments, make a frame index for 4198 // the start of the first vararg value... for expansion of llvm.va_start. 4199 if (isVarArg) { 4200 int Depth = ArgOffset; 4201 4202 FuncInfo->setVarArgsFrameIndex( 4203 MFI.CreateFixedObject(PtrVT.getSizeInBits()/8, 4204 Depth, true)); 4205 SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 4206 4207 // If this function is vararg, store any remaining integer argument regs 4208 // to their spots on the stack so that they may be loaded by dereferencing 4209 // the result of va_next. 4210 for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) { 4211 unsigned VReg; 4212 4213 if (isPPC64) 4214 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 4215 else 4216 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 4217 4218 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 4219 SDValue Store = 4220 DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); 4221 MemOps.push_back(Store); 4222 // Increment the address by four for the next argument to store 4223 SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT); 4224 FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); 4225 } 4226 } 4227 4228 if (!MemOps.empty()) 4229 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 4230 4231 return Chain; 4232 } 4233 4234 /// CalculateTailCallSPDiff - Get the amount the stack pointer has to be 4235 /// adjusted to accommodate the arguments for the tailcall. 4236 static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall, 4237 unsigned ParamSize) { 4238 4239 if (!isTailCall) return 0; 4240 4241 PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>(); 4242 unsigned CallerMinReservedArea = FI->getMinReservedArea(); 4243 int SPDiff = (int)CallerMinReservedArea - (int)ParamSize; 4244 // Remember only if the new adjustement is bigger. 
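  // SPDiff is negative when the callee needs a larger parameter area than the
  // caller reserved, so keep the most negative (i.e. largest) adjustment seen
  // so far.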
4245 if (SPDiff < FI->getTailCallSPDelta()) 4246 FI->setTailCallSPDelta(SPDiff); 4247 4248 return SPDiff; 4249 } 4250 4251 static bool isFunctionGlobalAddress(SDValue Callee); 4252 4253 static bool 4254 resideInSameSection(const Function *Caller, SDValue Callee, 4255 const TargetMachine &TM) { 4256 // If !G, Callee can be an external symbol. 4257 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 4258 if (!G) 4259 return false; 4260 4261 const GlobalValue *GV = G->getGlobal(); 4262 if (!GV->isStrongDefinitionForLinker()) 4263 return false; 4264 4265 // Any explicitly-specified sections and section prefixes must also match. 4266 // Also, if we're using -ffunction-sections, then each function is always in 4267 // a different section (the same is true for COMDAT functions). 4268 if (TM.getFunctionSections() || GV->hasComdat() || Caller->hasComdat() || 4269 GV->getSection() != Caller->getSection()) 4270 return false; 4271 if (const auto *F = dyn_cast<Function>(GV)) { 4272 if (F->getSectionPrefix() != Caller->getSectionPrefix()) 4273 return false; 4274 } 4275 4276 // If the callee might be interposed, then we can't assume the ultimate call 4277 // target will be in the same section. Even in cases where we can assume that 4278 // interposition won't happen, in any case where the linker might insert a 4279 // stub to allow for interposition, we must generate code as though 4280 // interposition might occur. To understand why this matters, consider a 4281 // situation where: a -> b -> c where the arrows indicate calls. b and c are 4282 // in the same section, but a is in a different module (i.e. has a different 4283 // TOC base pointer). If the linker allows for interposition between b and c, 4284 // then it will generate a stub for the call edge between b and c which will 4285 // save the TOC pointer into the designated stack slot allocated by b. If we 4286 // return true here, and therefore allow a tail call between b and c, that 4287 // stack slot won't exist and the b -> c stub will end up saving b'c TOC base 4288 // pointer into the stack slot allocated by a (where the a -> b stub saved 4289 // a's TOC base pointer). If we're not considering a tail call, but rather, 4290 // whether a nop is needed after the call instruction in b, because the linker 4291 // will insert a stub, it might complain about a missing nop if we omit it 4292 // (although many don't complain in this case). 
4293 if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV)) 4294 return false; 4295 4296 return true; 4297 } 4298 4299 static bool 4300 needStackSlotPassParameters(const PPCSubtarget &Subtarget, 4301 const SmallVectorImpl<ISD::OutputArg> &Outs) { 4302 assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64()); 4303 4304 const unsigned PtrByteSize = 8; 4305 const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); 4306 4307 static const MCPhysReg GPR[] = { 4308 PPC::X3, PPC::X4, PPC::X5, PPC::X6, 4309 PPC::X7, PPC::X8, PPC::X9, PPC::X10, 4310 }; 4311 static const MCPhysReg VR[] = { 4312 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, 4313 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 4314 }; 4315 4316 const unsigned NumGPRs = array_lengthof(GPR); 4317 const unsigned NumFPRs = 13; 4318 const unsigned NumVRs = array_lengthof(VR); 4319 const unsigned ParamAreaSize = NumGPRs * PtrByteSize; 4320 4321 unsigned NumBytes = LinkageSize; 4322 unsigned AvailableFPRs = NumFPRs; 4323 unsigned AvailableVRs = NumVRs; 4324 4325 for (const ISD::OutputArg& Param : Outs) { 4326 if (Param.Flags.isNest()) continue; 4327 4328 if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, 4329 PtrByteSize, LinkageSize, ParamAreaSize, 4330 NumBytes, AvailableFPRs, AvailableVRs, 4331 Subtarget.hasQPX())) 4332 return true; 4333 } 4334 return false; 4335 } 4336 4337 static bool 4338 hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) { 4339 if (CS.arg_size() != CallerFn->arg_size()) 4340 return false; 4341 4342 ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin(); 4343 ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end(); 4344 Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin(); 4345 4346 for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) { 4347 const Value* CalleeArg = *CalleeArgIter; 4348 const Value* CallerArg = &(*CallerArgIter); 4349 if (CalleeArg == CallerArg) 4350 continue; 4351 4352 // e.g. @caller([4 x i64] %a, [4 x i64] %b) { 4353 // tail call @callee([4 x i64] undef, [4 x i64] %b) 4354 // } 4355 // 1st argument of callee is undef and has the same type as caller. 4356 if (CalleeArg->getType() == CallerArg->getType() && 4357 isa<UndefValue>(CalleeArg)) 4358 continue; 4359 4360 return false; 4361 } 4362 4363 return true; 4364 } 4365 4366 bool 4367 PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4( 4368 SDValue Callee, 4369 CallingConv::ID CalleeCC, 4370 ImmutableCallSite CS, 4371 bool isVarArg, 4372 const SmallVectorImpl<ISD::OutputArg> &Outs, 4373 const SmallVectorImpl<ISD::InputArg> &Ins, 4374 SelectionDAG& DAG) const { 4375 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt; 4376 4377 if (DisableSCO && !TailCallOpt) return false; 4378 4379 // Variadic argument functions are not supported. 4380 if (isVarArg) return false; 4381 4382 MachineFunction &MF = DAG.getMachineFunction(); 4383 CallingConv::ID CallerCC = MF.getFunction()->getCallingConv(); 4384 4385 // Tail or Sibling call optimization (TCO/SCO) needs callee and caller has 4386 // the same calling convention 4387 if (CallerCC != CalleeCC) return false; 4388 4389 // SCO support C calling convention 4390 if (CalleeCC != CallingConv::Fast && CalleeCC != CallingConv::C) 4391 return false; 4392 4393 // Caller contains any byval parameter is not supported. 4394 if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); })) 4395 return false; 4396 4397 // Callee contains any byval parameter is not supported, too. 
4398 // Note: This is a quick workaround, because in some cases, e.g. when the
4399 // caller's stack size > the callee's stack size, we are still able to apply
4400 // sibling call optimization. See: https://reviews.llvm.org/D23441#513574
4401 if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
4402 return false;
4403
4404 // No TCO/SCO on indirect calls, because the caller has to restore its TOC.
4405 if (!isFunctionGlobalAddress(Callee) &&
4406 !isa<ExternalSymbolSDNode>(Callee))
4407 return false;
4408
4409 // Check if the callee resides in the same section, because for now the PPC64
4410 // SVR4 ABI (ELFv1/ELFv2) doesn't allow tail calls to a symbol residing in
4411 // another section.
4412 // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
4413 if (!resideInSameSection(MF.getFunction(), Callee, getTargetMachine()))
4414 return false;
4415
4416 // TCO allows altering the callee's ABI, so we don't have to check further.
4417 if (CalleeCC == CallingConv::Fast && TailCallOpt)
4418 return true;
4419
4420 if (DisableSCO) return false;
4421
4422 // If the callee uses the same argument list as the caller, we can apply SCO
4423 // in this case. Otherwise, we need to check whether the callee needs stack
4424 // slots for passing arguments.
4425 if (!hasSameArgumentList(MF.getFunction(), CS) &&
4426 needStackSlotPassParameters(Subtarget, Outs)) {
4427 return false;
4428 }
4429
4430 return true;
4431 }
4432
4433 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
4434 /// for tail call optimization. Targets which want to do tail call
4435 /// optimization should implement this function.
4436 bool
4437 PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
4438 CallingConv::ID CalleeCC,
4439 bool isVarArg,
4440 const SmallVectorImpl<ISD::InputArg> &Ins,
4441 SelectionDAG& DAG) const {
4442 if (!getTargetMachine().Options.GuaranteedTailCallOpt)
4443 return false;
4444
4445 // Variable argument functions are not supported.
4446 if (isVarArg)
4447 return false;
4448
4449 MachineFunction &MF = DAG.getMachineFunction();
4450 CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
4451 if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
4452 // Functions containing byval parameters are not supported.
4453 for (unsigned i = 0; i != Ins.size(); i++) {
4454 ISD::ArgFlagsTy Flags = Ins[i].Flags;
4455 if (Flags.isByVal()) return false;
4456 }
4457
4458 // Non-PIC/GOT tail calls are supported.
4459 if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
4460 return true;
4461
4462 // At the moment we can only do local tail calls (in the same module, hidden
4463 // or protected) if we are generating PIC.
4464 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
4465 return G->getGlobal()->hasHiddenVisibility()
4466 || G->getGlobal()->hasProtectedVisibility();
4467 }
4468
4469 return false;
4470 }
4471
4472 /// isBLACompatibleAddress - Return the immediate to use if the specified
4473 /// 32-bit value is representable in the immediate field of a BxA instruction.
4474 static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
4475 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
4476 if (!C) return nullptr;
4477
4478 int Addr = C->getZExtValue();
4479 if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero.
4480 SignExtend32<26>(Addr) != Addr)
4481 return nullptr; // Top 6 bits have to be sext of immediate.
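// The address is word-aligned, so the two implied zero bits are dropped and
// the remaining bits are returned as the immediate of the absolute branch.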
4482 4483 return DAG 4484 .getConstant( 4485 (int)C->getZExtValue() >> 2, SDLoc(Op), 4486 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout())) 4487 .getNode(); 4488 } 4489 4490 namespace { 4491 4492 struct TailCallArgumentInfo { 4493 SDValue Arg; 4494 SDValue FrameIdxOp; 4495 int FrameIdx = 0; 4496 4497 TailCallArgumentInfo() = default; 4498 }; 4499 4500 } // end anonymous namespace 4501 4502 /// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot. 4503 static void StoreTailCallArgumentsToStackSlot( 4504 SelectionDAG &DAG, SDValue Chain, 4505 const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs, 4506 SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) { 4507 for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) { 4508 SDValue Arg = TailCallArgs[i].Arg; 4509 SDValue FIN = TailCallArgs[i].FrameIdxOp; 4510 int FI = TailCallArgs[i].FrameIdx; 4511 // Store relative to framepointer. 4512 MemOpChains.push_back(DAG.getStore( 4513 Chain, dl, Arg, FIN, 4514 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); 4515 } 4516 } 4517 4518 /// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to 4519 /// the appropriate stack slot for the tail call optimized function call. 4520 static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain, 4521 SDValue OldRetAddr, SDValue OldFP, 4522 int SPDiff, const SDLoc &dl) { 4523 if (SPDiff) { 4524 // Calculate the new stack slot for the return address. 4525 MachineFunction &MF = DAG.getMachineFunction(); 4526 const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); 4527 const PPCFrameLowering *FL = Subtarget.getFrameLowering(); 4528 bool isPPC64 = Subtarget.isPPC64(); 4529 int SlotSize = isPPC64 ? 8 : 4; 4530 int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset(); 4531 int NewRetAddr = MF.getFrameInfo().CreateFixedObject(SlotSize, 4532 NewRetAddrLoc, true); 4533 EVT VT = isPPC64 ? MVT::i64 : MVT::i32; 4534 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT); 4535 Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx, 4536 MachinePointerInfo::getFixedStack(MF, NewRetAddr)); 4537 4538 // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack 4539 // slot as the FP is never overwritten. 4540 if (Subtarget.isDarwinABI()) { 4541 int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset(); 4542 int NewFPIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, NewFPLoc, 4543 true); 4544 SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT); 4545 Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx, 4546 MachinePointerInfo::getFixedStack( 4547 DAG.getMachineFunction(), NewFPIdx)); 4548 } 4549 } 4550 return Chain; 4551 } 4552 4553 /// CalculateTailCallArgDest - Remember Argument for later processing. Calculate 4554 /// the position of the argument. 4555 static void 4556 CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64, 4557 SDValue Arg, int SPDiff, unsigned ArgOffset, 4558 SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) { 4559 int Offset = ArgOffset + SPDiff; 4560 uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8; 4561 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); 4562 EVT VT = isPPC64 ? 
MVT::i64 : MVT::i32; 4563 SDValue FIN = DAG.getFrameIndex(FI, VT); 4564 TailCallArgumentInfo Info; 4565 Info.Arg = Arg; 4566 Info.FrameIdxOp = FIN; 4567 Info.FrameIdx = FI; 4568 TailCallArguments.push_back(Info); 4569 } 4570 4571 /// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address 4572 /// stack slot. Returns the chain as result and the loaded frame pointers in 4573 /// LROpOut/FPOpout. Used when tail calling. 4574 SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr( 4575 SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut, 4576 SDValue &FPOpOut, const SDLoc &dl) const { 4577 if (SPDiff) { 4578 // Load the LR and FP stack slot for later adjusting. 4579 EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; 4580 LROpOut = getReturnAddrFrameIndex(DAG); 4581 LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo()); 4582 Chain = SDValue(LROpOut.getNode(), 1); 4583 4584 // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack 4585 // slot as the FP is never overwritten. 4586 if (Subtarget.isDarwinABI()) { 4587 FPOpOut = getFramePointerFrameIndex(DAG); 4588 FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo()); 4589 Chain = SDValue(FPOpOut.getNode(), 1); 4590 } 4591 } 4592 return Chain; 4593 } 4594 4595 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 4596 /// by "Src" to address "Dst" of size "Size". Alignment information is 4597 /// specified by the specific parameter attribute. The copy will be passed as 4598 /// a byval function parameter. 4599 /// Sometimes what we are copying is the end of a larger object, the part that 4600 /// does not fit in registers. 4601 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, 4602 SDValue Chain, ISD::ArgFlagsTy Flags, 4603 SelectionDAG &DAG, const SDLoc &dl) { 4604 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); 4605 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 4606 false, false, false, MachinePointerInfo(), 4607 MachinePointerInfo()); 4608 } 4609 4610 /// LowerMemOpCallTo - Store the argument to the stack or remember it in case of 4611 /// tail calls. 4612 static void LowerMemOpCallTo( 4613 SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg, 4614 SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64, 4615 bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains, 4616 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) { 4617 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 4618 if (!isTailCall) { 4619 if (isVector) { 4620 SDValue StackPtr; 4621 if (isPPC64) 4622 StackPtr = DAG.getRegister(PPC::X1, MVT::i64); 4623 else 4624 StackPtr = DAG.getRegister(PPC::R1, MVT::i32); 4625 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, 4626 DAG.getConstant(ArgOffset, dl, PtrVT)); 4627 } 4628 MemOpChains.push_back( 4629 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo())); 4630 // Calculate and remember argument location. 
4631 } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset, 4632 TailCallArguments); 4633 } 4634 4635 static void 4636 PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain, 4637 const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp, 4638 SDValue FPOp, 4639 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) { 4640 // Emit a sequence of copyto/copyfrom virtual registers for arguments that 4641 // might overwrite each other in case of tail call optimization. 4642 SmallVector<SDValue, 8> MemOpChains2; 4643 // Do not flag preceding copytoreg stuff together with the following stuff. 4644 InFlag = SDValue(); 4645 StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments, 4646 MemOpChains2, dl); 4647 if (!MemOpChains2.empty()) 4648 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); 4649 4650 // Store the return address to the appropriate stack slot. 4651 Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl); 4652 4653 // Emit callseq_end just before tailcall node. 4654 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), 4655 DAG.getIntPtrConstant(0, dl, true), InFlag, dl); 4656 InFlag = Chain.getValue(1); 4657 } 4658 4659 // Is this global address that of a function that can be called by name? (as 4660 // opposed to something that must hold a descriptor for an indirect call). 4661 static bool isFunctionGlobalAddress(SDValue Callee) { 4662 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 4663 if (Callee.getOpcode() == ISD::GlobalTLSAddress || 4664 Callee.getOpcode() == ISD::TargetGlobalTLSAddress) 4665 return false; 4666 4667 return G->getGlobal()->getValueType()->isFunctionTy(); 4668 } 4669 4670 return false; 4671 } 4672 4673 static unsigned 4674 PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, 4675 SDValue CallSeqStart, const SDLoc &dl, int SPDiff, bool isTailCall, 4676 bool isPatchPoint, bool hasNest, 4677 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, 4678 SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys, 4679 ImmutableCallSite CS, const PPCSubtarget &Subtarget) { 4680 bool isPPC64 = Subtarget.isPPC64(); 4681 bool isSVR4ABI = Subtarget.isSVR4ABI(); 4682 bool isELFv2ABI = Subtarget.isELFv2ABI(); 4683 4684 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 4685 NodeTys.push_back(MVT::Other); // Returns a chain 4686 NodeTys.push_back(MVT::Glue); // Returns a flag for retval copy to use. 4687 4688 unsigned CallOpc = PPCISD::CALL; 4689 4690 bool needIndirectCall = true; 4691 if (!isSVR4ABI || !isPPC64) 4692 if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) { 4693 // If this is an absolute destination address, use the munged value. 4694 Callee = SDValue(Dest, 0); 4695 needIndirectCall = false; 4696 } 4697 4698 // PC-relative references to external symbols should go through $stub, unless 4699 // we're building with the leopard linker or later, which automatically 4700 // synthesizes these stubs. 
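// Only 32-bit ELF goes through the PLT here: calls to symbols that may not
// resolve locally get the MO_PLT flag below. On 64-bit ELF, cross-module
// calls are instead handled via the TOC machinery (and the NOP after the
// call) later in this function and in FinishCall().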
4701 const TargetMachine &TM = DAG.getTarget(); 4702 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent(); 4703 const GlobalValue *GV = nullptr; 4704 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) 4705 GV = G->getGlobal(); 4706 bool Local = TM.shouldAssumeDSOLocal(*Mod, GV); 4707 bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64; 4708 4709 if (isFunctionGlobalAddress(Callee)) { 4710 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee); 4711 // A call to a TLS address is actually an indirect call to a 4712 // thread-specific pointer. 4713 unsigned OpFlags = 0; 4714 if (UsePlt) 4715 OpFlags = PPCII::MO_PLT; 4716 4717 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, 4718 // every direct call is) turn it into a TargetGlobalAddress / 4719 // TargetExternalSymbol node so that legalize doesn't hack it. 4720 Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, 4721 Callee.getValueType(), 0, OpFlags); 4722 needIndirectCall = false; 4723 } 4724 4725 if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 4726 unsigned char OpFlags = 0; 4727 4728 if (UsePlt) 4729 OpFlags = PPCII::MO_PLT; 4730 4731 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(), 4732 OpFlags); 4733 needIndirectCall = false; 4734 } 4735 4736 if (isPatchPoint) { 4737 // We'll form an invalid direct call when lowering a patchpoint; the full 4738 // sequence for an indirect call is complicated, and many of the 4739 // instructions introduced might have side effects (and, thus, can't be 4740 // removed later). The call itself will be removed as soon as the 4741 // argument/return lowering is complete, so the fact that it has the wrong 4742 // kind of operands should not really matter. 4743 needIndirectCall = false; 4744 } 4745 4746 if (needIndirectCall) { 4747 // Otherwise, this is an indirect call. We have to use a MTCTR/BCTRL pair 4748 // to do the call, we can't use PPCISD::CALL. 4749 SDValue MTCTROps[] = {Chain, Callee, InFlag}; 4750 4751 if (isSVR4ABI && isPPC64 && !isELFv2ABI) { 4752 // Function pointers in the 64-bit SVR4 ABI do not point to the function 4753 // entry point, but to the function descriptor (the function entry point 4754 // address is part of the function descriptor though). 4755 // The function descriptor is a three doubleword structure with the 4756 // following fields: function entry point, TOC base address and 4757 // environment pointer. 4758 // Thus for a call through a function pointer, the following actions need 4759 // to be performed: 4760 // 1. Save the TOC of the caller in the TOC save area of its stack 4761 // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()). 4762 // 2. Load the address of the function entry point from the function 4763 // descriptor. 4764 // 3. Load the TOC of the callee from the function descriptor into r2. 4765 // 4. Load the environment pointer from the function descriptor into 4766 // r11. 4767 // 5. Branch to the function entry point address. 4768 // 6. On return of the callee, the TOC of the caller needs to be 4769 // restored (this is done in FinishCall()). 4770 // 4771 // The loads are scheduled at the beginning of the call sequence, and the 4772 // register copies are flagged together to ensure that no other 4773 // operations can be scheduled in between. E.g. 
without flagging the 4774 // copies together, a TOC access in the caller could be scheduled between 4775 // the assignment of the callee TOC and the branch to the callee, which 4776 // results in the TOC access going through the TOC of the callee instead 4777 // of going through the TOC of the caller, which leads to incorrect code. 4778 4779 // Load the address of the function entry point from the function 4780 // descriptor. 4781 SDValue LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-1); 4782 if (LDChain.getValueType() == MVT::Glue) 4783 LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-2); 4784 4785 auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors() 4786 ? (MachineMemOperand::MODereferenceable | 4787 MachineMemOperand::MOInvariant) 4788 : MachineMemOperand::MONone; 4789 4790 MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr); 4791 SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI, 4792 /* Alignment = */ 8, MMOFlags); 4793 4794 // Load environment pointer into r11. 4795 SDValue PtrOff = DAG.getIntPtrConstant(16, dl); 4796 SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff); 4797 SDValue LoadEnvPtr = 4798 DAG.getLoad(MVT::i64, dl, LDChain, AddPtr, MPI.getWithOffset(16), 4799 /* Alignment = */ 8, MMOFlags); 4800 4801 SDValue TOCOff = DAG.getIntPtrConstant(8, dl); 4802 SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff); 4803 SDValue TOCPtr = 4804 DAG.getLoad(MVT::i64, dl, LDChain, AddTOC, MPI.getWithOffset(8), 4805 /* Alignment = */ 8, MMOFlags); 4806 4807 setUsesTOCBasePtr(DAG); 4808 SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr, 4809 InFlag); 4810 Chain = TOCVal.getValue(0); 4811 InFlag = TOCVal.getValue(1); 4812 4813 // If the function call has an explicit 'nest' parameter, it takes the 4814 // place of the environment pointer. 4815 if (!hasNest) { 4816 SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr, 4817 InFlag); 4818 4819 Chain = EnvVal.getValue(0); 4820 InFlag = EnvVal.getValue(1); 4821 } 4822 4823 MTCTROps[0] = Chain; 4824 MTCTROps[1] = LoadFuncPtr; 4825 MTCTROps[2] = InFlag; 4826 } 4827 4828 Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys, 4829 makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2)); 4830 InFlag = Chain.getValue(1); 4831 4832 NodeTys.clear(); 4833 NodeTys.push_back(MVT::Other); 4834 NodeTys.push_back(MVT::Glue); 4835 Ops.push_back(Chain); 4836 CallOpc = PPCISD::BCTRL; 4837 Callee.setNode(nullptr); 4838 // Add use of X11 (holding environment pointer) 4839 if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest) 4840 Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); 4841 // Add CTR register as callee so a bctr can be emitted later. 4842 if (isTailCall) 4843 Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT)); 4844 } 4845 4846 // If this is a direct call, pass the chain and the callee. 4847 if (Callee.getNode()) { 4848 Ops.push_back(Chain); 4849 Ops.push_back(Callee); 4850 } 4851 // If this is a tail call add stack pointer delta. 4852 if (isTailCall) 4853 Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32)); 4854 4855 // Add argument registers to the end of the list so that they are known live 4856 // into the call. 4857 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 4858 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 4859 RegsToPass[i].second.getValueType())); 4860 4861 // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live 4862 // into the call. 
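// In the 64-bit ELF ABIs, X2 (r2) is the TOC base pointer; listing it as an
// operand of the call node is how that liveness is expressed.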
4863 if (isSVR4ABI && isPPC64 && !isPatchPoint) { 4864 setUsesTOCBasePtr(DAG); 4865 Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); 4866 } 4867 4868 return CallOpc; 4869 } 4870 4871 SDValue PPCTargetLowering::LowerCallResult( 4872 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 4873 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 4874 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 4875 SmallVector<CCValAssign, 16> RVLocs; 4876 CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 4877 *DAG.getContext()); 4878 CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC); 4879 4880 // Copy all of the result registers out of their specified physreg. 4881 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 4882 CCValAssign &VA = RVLocs[i]; 4883 assert(VA.isRegLoc() && "Can only return in registers!"); 4884 4885 SDValue Val = DAG.getCopyFromReg(Chain, dl, 4886 VA.getLocReg(), VA.getLocVT(), InFlag); 4887 Chain = Val.getValue(1); 4888 InFlag = Val.getValue(2); 4889 4890 switch (VA.getLocInfo()) { 4891 default: llvm_unreachable("Unknown loc info!"); 4892 case CCValAssign::Full: break; 4893 case CCValAssign::AExt: 4894 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); 4895 break; 4896 case CCValAssign::ZExt: 4897 Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val, 4898 DAG.getValueType(VA.getValVT())); 4899 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); 4900 break; 4901 case CCValAssign::SExt: 4902 Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val, 4903 DAG.getValueType(VA.getValVT())); 4904 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); 4905 break; 4906 } 4907 4908 InVals.push_back(Val); 4909 } 4910 4911 return Chain; 4912 } 4913 4914 SDValue PPCTargetLowering::FinishCall( 4915 CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg, 4916 bool isPatchPoint, bool hasNest, SelectionDAG &DAG, 4917 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue InFlag, 4918 SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff, 4919 unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins, 4920 SmallVectorImpl<SDValue> &InVals, ImmutableCallSite CS) const { 4921 std::vector<EVT> NodeTys; 4922 SmallVector<SDValue, 8> Ops; 4923 unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl, 4924 SPDiff, isTailCall, isPatchPoint, hasNest, 4925 RegsToPass, Ops, NodeTys, CS, Subtarget); 4926 4927 // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls 4928 if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) 4929 Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32)); 4930 4931 // When performing tail call optimization the callee pops its arguments off 4932 // the stack. Account for this here so these bytes can be pushed back on in 4933 // PPCFrameLowering::eliminateCallFramePseudoInstr. 4934 int BytesCalleePops = 4935 (CallConv == CallingConv::Fast && 4936 getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0; 4937 4938 // Add a register mask operand representing the call-preserved registers. 4939 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); 4940 const uint32_t *Mask = 4941 TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv); 4942 assert(Mask && "Missing call preserved mask for calling convention"); 4943 Ops.push_back(DAG.getRegisterMask(Mask)); 4944 4945 if (InFlag.getNode()) 4946 Ops.push_back(InFlag); 4947 4948 // Emit tail call. 
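// A tail call ends the lowering right here: the TC_RETURN node is later
// expanded into the actual branch, and no return-value copies are needed
// because control does not return to this function.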
4949 if (isTailCall) {
4950 assert(((Callee.getOpcode() == ISD::Register &&
4951 cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
4952 Callee.getOpcode() == ISD::TargetExternalSymbol ||
4953 Callee.getOpcode() == ISD::TargetGlobalAddress ||
4954 isa<ConstantSDNode>(Callee)) &&
4955 "Expecting a global address, external symbol, absolute value or register");
4956
4957 DAG.getMachineFunction().getFrameInfo().setHasTailCall();
4958 return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops);
4959 }
4960
4961 // Add a NOP immediately after the branch instruction when using the 64-bit
4962 // SVR4 ABI. At link time, if caller and callee are in different modules and
4963 // thus have different TOCs, the call will be replaced with a call to a stub
4964 // function which saves the current TOC, loads the TOC of the callee and
4965 // branches to the callee. The NOP will be replaced with a load instruction
4966 // which restores the TOC of the caller from the TOC save slot of the current
4967 // stack frame. If caller and callee belong to the same module (and have the
4968 // same TOC), the NOP will remain unchanged.
4969
4970 MachineFunction &MF = DAG.getMachineFunction();
4971 if (!isTailCall && Subtarget.isSVR4ABI() && Subtarget.isPPC64() &&
4972 !isPatchPoint) {
4973 if (CallOpc == PPCISD::BCTRL) {
4974 // This is a call through a function pointer.
4975 // Restore the caller's TOC from the save area into R2.
4976 // See PrepareCall() for more information about calls through function
4977 // pointers in the 64-bit SVR4 ABI.
4978 // We are using a target-specific load with r2 hard coded, because the
4979 // result of a target-independent load would never go directly into r2,
4980 // since r2 is a reserved register (which prevents the register allocator
4981 // from allocating it), resulting in an additional register being
4982 // allocated and an unnecessary move instruction being generated.
4983 CallOpc = PPCISD::BCTRL_LOAD_TOC;
4984
4985 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4986 SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
4987 unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
4988 SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
4989 SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);
4990
4991 // The address needs to go after the chain input but before the flag (or
4992 // any other variadic arguments).
4993 Ops.insert(std::next(Ops.begin()), AddTOC);
4994 } else if (CallOpc == PPCISD::CALL &&
4995 !resideInSameSection(MF.getFunction(), Callee, DAG.getTarget())) {
4996 // Otherwise insert NOP for non-local calls.
4997 CallOpc = PPCISD::CALL_NOP; 4998 } 4999 } 5000 5001 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); 5002 InFlag = Chain.getValue(1); 5003 5004 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), 5005 DAG.getIntPtrConstant(BytesCalleePops, dl, true), 5006 InFlag, dl); 5007 if (!Ins.empty()) 5008 InFlag = Chain.getValue(1); 5009 5010 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 5011 Ins, dl, DAG, InVals); 5012 } 5013 5014 SDValue 5015 PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 5016 SmallVectorImpl<SDValue> &InVals) const { 5017 SelectionDAG &DAG = CLI.DAG; 5018 SDLoc &dl = CLI.DL; 5019 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 5020 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 5021 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 5022 SDValue Chain = CLI.Chain; 5023 SDValue Callee = CLI.Callee; 5024 bool &isTailCall = CLI.IsTailCall; 5025 CallingConv::ID CallConv = CLI.CallConv; 5026 bool isVarArg = CLI.IsVarArg; 5027 bool isPatchPoint = CLI.IsPatchPoint; 5028 ImmutableCallSite CS = CLI.CS; 5029 5030 if (isTailCall) { 5031 if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall())) 5032 isTailCall = false; 5033 else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) 5034 isTailCall = 5035 IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS, 5036 isVarArg, Outs, Ins, DAG); 5037 else 5038 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, 5039 Ins, DAG); 5040 if (isTailCall) { 5041 ++NumTailCalls; 5042 if (!getTargetMachine().Options.GuaranteedTailCallOpt) 5043 ++NumSiblingCalls; 5044 5045 assert(isa<GlobalAddressSDNode>(Callee) && 5046 "Callee should be an llvm::Function object."); 5047 DEBUG( 5048 const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); 5049 const unsigned Width = 80 - strlen("TCO caller: ") 5050 - strlen(", callee linkage: 0, 0"); 5051 dbgs() << "TCO caller: " 5052 << left_justify(DAG.getMachineFunction().getName(), Width) 5053 << ", callee linkage: " 5054 << GV->getVisibility() << ", " << GV->getLinkage() << "\n" 5055 ); 5056 } 5057 } 5058 5059 if (!isTailCall && CS && CS.isMustTailCall()) 5060 report_fatal_error("failed to perform tail call elimination on a call " 5061 "site marked musttail"); 5062 5063 // When long calls (i.e. indirect calls) are always used, calls are always 5064 // made via function pointer. If we have a function name, first translate it 5065 // into a pointer. 
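// (With long calls enabled, isTailCall can only still be true for musttail
// call sites, which is why tail calls are excluded from the rewrite below.)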
5066 if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Callee) && 5067 !isTailCall) 5068 Callee = LowerGlobalAddress(Callee, DAG); 5069 5070 if (Subtarget.isSVR4ABI()) { 5071 if (Subtarget.isPPC64()) 5072 return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, 5073 isTailCall, isPatchPoint, Outs, OutVals, Ins, 5074 dl, DAG, InVals, CS); 5075 else 5076 return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, 5077 isTailCall, isPatchPoint, Outs, OutVals, Ins, 5078 dl, DAG, InVals, CS); 5079 } 5080 5081 return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg, 5082 isTailCall, isPatchPoint, Outs, OutVals, Ins, 5083 dl, DAG, InVals, CS); 5084 } 5085 5086 SDValue PPCTargetLowering::LowerCall_32SVR4( 5087 SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, 5088 bool isTailCall, bool isPatchPoint, 5089 const SmallVectorImpl<ISD::OutputArg> &Outs, 5090 const SmallVectorImpl<SDValue> &OutVals, 5091 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 5092 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, 5093 ImmutableCallSite CS) const { 5094 // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description 5095 // of the 32-bit SVR4 ABI stack frame layout. 5096 5097 assert((CallConv == CallingConv::C || 5098 CallConv == CallingConv::Fast) && "Unknown calling convention!"); 5099 5100 unsigned PtrByteSize = 4; 5101 5102 MachineFunction &MF = DAG.getMachineFunction(); 5103 5104 // Mark this function as potentially containing a function that contains a 5105 // tail call. As a consequence the frame pointer will be used for dynamicalloc 5106 // and restoring the callers stack pointer in this functions epilog. This is 5107 // done because by tail calling the called function might overwrite the value 5108 // in this function's (MF) stack pointer stack slot 0(SP). 5109 if (getTargetMachine().Options.GuaranteedTailCallOpt && 5110 CallConv == CallingConv::Fast) 5111 MF.getInfo<PPCFunctionInfo>()->setHasFastCall(); 5112 5113 // Count how many bytes are to be pushed on the stack, including the linkage 5114 // area, parameter list area and the part of the local variable space which 5115 // contains copies of aggregates which are passed by value. 5116 5117 // Assign locations to all of the outgoing arguments. 5118 SmallVector<CCValAssign, 16> ArgLocs; 5119 PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); 5120 5121 // Reserve space for the linkage area on the stack. 5122 CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(), 5123 PtrByteSize); 5124 if (useSoftFloat()) 5125 CCInfo.PreAnalyzeCallOperands(Outs); 5126 5127 if (isVarArg) { 5128 // Handle fixed and variable vector arguments differently. 5129 // Fixed vector arguments go into registers as long as registers are 5130 // available. Variable vector arguments always go into memory. 5131 unsigned NumArgs = Outs.size(); 5132 5133 for (unsigned i = 0; i != NumArgs; ++i) { 5134 MVT ArgVT = Outs[i].VT; 5135 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 5136 bool Result; 5137 5138 if (Outs[i].IsFixed) { 5139 Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, 5140 CCInfo); 5141 } else { 5142 Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full, 5143 ArgFlags, CCInfo); 5144 } 5145 5146 if (Result) { 5147 #ifndef NDEBUG 5148 errs() << "Call operand #" << i << " has unhandled type " 5149 << EVT(ArgVT).getEVTString() << "\n"; 5150 #endif 5151 llvm_unreachable(nullptr); 5152 } 5153 } 5154 } else { 5155 // All arguments are treated the same. 
5156 CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4); 5157 } 5158 CCInfo.clearWasPPCF128(); 5159 5160 // Assign locations to all of the outgoing aggregate by value arguments. 5161 SmallVector<CCValAssign, 16> ByValArgLocs; 5162 CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext()); 5163 5164 // Reserve stack space for the allocations in CCInfo. 5165 CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); 5166 5167 CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal); 5168 5169 // Size of the linkage area, parameter list area and the part of the local 5170 // space variable where copies of aggregates which are passed by value are 5171 // stored. 5172 unsigned NumBytes = CCByValInfo.getNextStackOffset(); 5173 5174 // Calculate by how many bytes the stack has to be adjusted in case of tail 5175 // call optimization. 5176 int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); 5177 5178 // Adjust the stack pointer for the new arguments... 5179 // These operations are automatically eliminated by the prolog/epilog pass 5180 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); 5181 SDValue CallSeqStart = Chain; 5182 5183 // Load the return address and frame pointer so it can be moved somewhere else 5184 // later. 5185 SDValue LROp, FPOp; 5186 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); 5187 5188 // Set up a copy of the stack pointer for use loading and storing any 5189 // arguments that may not fit in the registers available for argument 5190 // passing. 5191 SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32); 5192 5193 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 5194 SmallVector<TailCallArgumentInfo, 8> TailCallArguments; 5195 SmallVector<SDValue, 8> MemOpChains; 5196 5197 bool seenFloatArg = false; 5198 // Walk the register/memloc assignments, inserting copies/loads. 5199 for (unsigned i = 0, j = 0, e = ArgLocs.size(); 5200 i != e; 5201 ++i) { 5202 CCValAssign &VA = ArgLocs[i]; 5203 SDValue Arg = OutVals[i]; 5204 ISD::ArgFlagsTy Flags = Outs[i].Flags; 5205 5206 if (Flags.isByVal()) { 5207 // Argument is an aggregate which is passed by value, thus we need to 5208 // create a copy of it in the local variable space of the current stack 5209 // frame (which is the stack frame of the caller) and pass the address of 5210 // this copy to the callee. 5211 assert((j < ByValArgLocs.size()) && "Index out of bounds!"); 5212 CCValAssign &ByValVA = ByValArgLocs[j++]; 5213 assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!"); 5214 5215 // Memory reserved in the local variable space of the callers stack frame. 5216 unsigned LocMemOffset = ByValVA.getLocMemOffset(); 5217 5218 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 5219 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), 5220 StackPtr, PtrOff); 5221 5222 // Create a copy of the argument in the local area of the current 5223 // stack frame. 5224 SDValue MemcpyCall = 5225 CreateCopyOfByValArgument(Arg, PtrOff, 5226 CallSeqStart.getNode()->getOperand(0), 5227 Flags, DAG, dl); 5228 5229 // This must go outside the CALLSEQ_START..END. 
5230 SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0, 5231 SDLoc(MemcpyCall)); 5232 DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), 5233 NewCallSeqStart.getNode()); 5234 Chain = CallSeqStart = NewCallSeqStart; 5235 5236 // Pass the address of the aggregate copy on the stack either in a 5237 // physical register or in the parameter list area of the current stack 5238 // frame to the callee. 5239 Arg = PtrOff; 5240 } 5241 5242 if (VA.isRegLoc()) { 5243 if (Arg.getValueType() == MVT::i1) 5244 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Arg); 5245 5246 seenFloatArg |= VA.getLocVT().isFloatingPoint(); 5247 // Put argument in a physical register. 5248 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 5249 } else { 5250 // Put argument in the parameter list area of the current stack frame. 5251 assert(VA.isMemLoc()); 5252 unsigned LocMemOffset = VA.getLocMemOffset(); 5253 5254 if (!isTailCall) { 5255 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 5256 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), 5257 StackPtr, PtrOff); 5258 5259 MemOpChains.push_back( 5260 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo())); 5261 } else { 5262 // Calculate and remember argument location. 5263 CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset, 5264 TailCallArguments); 5265 } 5266 } 5267 } 5268 5269 if (!MemOpChains.empty()) 5270 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 5271 5272 // Build a sequence of copy-to-reg nodes chained together with token chain 5273 // and flag operands which copy the outgoing args into the appropriate regs. 5274 SDValue InFlag; 5275 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 5276 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 5277 RegsToPass[i].second, InFlag); 5278 InFlag = Chain.getValue(1); 5279 } 5280 5281 // Set CR bit 6 to true if this is a vararg call with floating args passed in 5282 // registers. 5283 if (isVarArg) { 5284 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 5285 SDValue Ops[] = { Chain, InFlag }; 5286 5287 Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, 5288 dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1)); 5289 5290 InFlag = Chain.getValue(1); 5291 } 5292 5293 if (isTailCall) 5294 PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, 5295 TailCallArguments); 5296 5297 return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, 5298 /* unused except on PPC64 ELFv1 */ false, DAG, 5299 RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, 5300 NumBytes, Ins, InVals, CS); 5301 } 5302 5303 // Copy an argument into memory, being careful to do this outside the 5304 // call sequence for the call to which the argument belongs. 5305 SDValue PPCTargetLowering::createMemcpyOutsideCallSeq( 5306 SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags, 5307 SelectionDAG &DAG, const SDLoc &dl) const { 5308 SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff, 5309 CallSeqStart.getNode()->getOperand(0), 5310 Flags, DAG, dl); 5311 // The MEMCPY must go outside the CALLSEQ_START..END. 
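// (The memcpy may itself be lowered to a call to memcpy, and call sequences
// cannot nest, so the copy is chained onto the token that precedes the
// original CALLSEQ_START and a fresh CALLSEQ_START is built on top of it.)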
5312 int64_t FrameSize = CallSeqStart.getConstantOperandVal(1); 5313 SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0, 5314 SDLoc(MemcpyCall)); 5315 DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), 5316 NewCallSeqStart.getNode()); 5317 return NewCallSeqStart; 5318 } 5319 5320 SDValue PPCTargetLowering::LowerCall_64SVR4( 5321 SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, 5322 bool isTailCall, bool isPatchPoint, 5323 const SmallVectorImpl<ISD::OutputArg> &Outs, 5324 const SmallVectorImpl<SDValue> &OutVals, 5325 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 5326 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, 5327 ImmutableCallSite CS) const { 5328 bool isELFv2ABI = Subtarget.isELFv2ABI(); 5329 bool isLittleEndian = Subtarget.isLittleEndian(); 5330 unsigned NumOps = Outs.size(); 5331 bool hasNest = false; 5332 bool IsSibCall = false; 5333 5334 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 5335 unsigned PtrByteSize = 8; 5336 5337 MachineFunction &MF = DAG.getMachineFunction(); 5338 5339 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt) 5340 IsSibCall = true; 5341 5342 // Mark this function as potentially containing a function that contains a 5343 // tail call. As a consequence the frame pointer will be used for dynamicalloc 5344 // and restoring the callers stack pointer in this functions epilog. This is 5345 // done because by tail calling the called function might overwrite the value 5346 // in this function's (MF) stack pointer stack slot 0(SP). 5347 if (getTargetMachine().Options.GuaranteedTailCallOpt && 5348 CallConv == CallingConv::Fast) 5349 MF.getInfo<PPCFunctionInfo>()->setHasFastCall(); 5350 5351 assert(!(CallConv == CallingConv::Fast && isVarArg) && 5352 "fastcc not supported on varargs functions"); 5353 5354 // Count how many bytes are to be pushed on the stack, including the linkage 5355 // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes 5356 // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage 5357 // area is 32 bytes reserved space for [SP][CR][LR][TOC]. 5358 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); 5359 unsigned NumBytes = LinkageSize; 5360 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; 5361 unsigned &QFPR_idx = FPR_idx; 5362 5363 static const MCPhysReg GPR[] = { 5364 PPC::X3, PPC::X4, PPC::X5, PPC::X6, 5365 PPC::X7, PPC::X8, PPC::X9, PPC::X10, 5366 }; 5367 static const MCPhysReg VR[] = { 5368 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, 5369 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 5370 }; 5371 5372 const unsigned NumGPRs = array_lengthof(GPR); 5373 const unsigned NumFPRs = useSoftFloat() ? 0 : 13; 5374 const unsigned NumVRs = array_lengthof(VR); 5375 const unsigned NumQFPRs = NumFPRs; 5376 5377 // On ELFv2, we can avoid allocating the parameter area if all the arguments 5378 // can be passed to the callee in registers. 5379 // For the fast calling convention, there is another check below. 
5380 // Note: We should keep consistent with LowerFormalArguments_64SVR4() 5381 bool HasParameterArea = !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast; 5382 if (!HasParameterArea) { 5383 unsigned ParamAreaSize = NumGPRs * PtrByteSize; 5384 unsigned AvailableFPRs = NumFPRs; 5385 unsigned AvailableVRs = NumVRs; 5386 unsigned NumBytesTmp = NumBytes; 5387 for (unsigned i = 0; i != NumOps; ++i) { 5388 if (Outs[i].Flags.isNest()) continue; 5389 if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags, 5390 PtrByteSize, LinkageSize, ParamAreaSize, 5391 NumBytesTmp, AvailableFPRs, AvailableVRs, 5392 Subtarget.hasQPX())) 5393 HasParameterArea = true; 5394 } 5395 } 5396 5397 // When using the fast calling convention, we don't provide backing for 5398 // arguments that will be in registers. 5399 unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0; 5400 5401 // Add up all the space actually used. 5402 for (unsigned i = 0; i != NumOps; ++i) { 5403 ISD::ArgFlagsTy Flags = Outs[i].Flags; 5404 EVT ArgVT = Outs[i].VT; 5405 EVT OrigVT = Outs[i].ArgVT; 5406 5407 if (Flags.isNest()) 5408 continue; 5409 5410 if (CallConv == CallingConv::Fast) { 5411 if (Flags.isByVal()) 5412 NumGPRsUsed += (Flags.getByValSize()+7)/8; 5413 else 5414 switch (ArgVT.getSimpleVT().SimpleTy) { 5415 default: llvm_unreachable("Unexpected ValueType for argument!"); 5416 case MVT::i1: 5417 case MVT::i32: 5418 case MVT::i64: 5419 if (++NumGPRsUsed <= NumGPRs) 5420 continue; 5421 break; 5422 case MVT::v4i32: 5423 case MVT::v8i16: 5424 case MVT::v16i8: 5425 case MVT::v2f64: 5426 case MVT::v2i64: 5427 case MVT::v1i128: 5428 if (++NumVRsUsed <= NumVRs) 5429 continue; 5430 break; 5431 case MVT::v4f32: 5432 // When using QPX, this is handled like a FP register, otherwise, it 5433 // is an Altivec register. 5434 if (Subtarget.hasQPX()) { 5435 if (++NumFPRsUsed <= NumFPRs) 5436 continue; 5437 } else { 5438 if (++NumVRsUsed <= NumVRs) 5439 continue; 5440 } 5441 break; 5442 case MVT::f32: 5443 case MVT::f64: 5444 case MVT::v4f64: // QPX 5445 case MVT::v4i1: // QPX 5446 if (++NumFPRsUsed <= NumFPRs) 5447 continue; 5448 break; 5449 } 5450 } 5451 5452 /* Respect alignment of argument on the stack. */ 5453 unsigned Align = 5454 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); 5455 NumBytes = ((NumBytes + Align - 1) / Align) * Align; 5456 5457 NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); 5458 if (Flags.isInConsecutiveRegsLast()) 5459 NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 5460 } 5461 5462 unsigned NumBytesActuallyUsed = NumBytes; 5463 5464 // In the old ELFv1 ABI, 5465 // the prolog code of the callee may store up to 8 GPR argument registers to 5466 // the stack, allowing va_start to index over them in memory if its varargs. 5467 // Because we cannot tell if this is needed on the caller side, we have to 5468 // conservatively assume that it is needed. As such, make sure we have at 5469 // least enough stack space for the caller to store the 8 GPRs. 5470 // In the ELFv2 ABI, we allocate the parameter area iff a callee 5471 // really requires memory operands, e.g. a vararg function. 5472 if (HasParameterArea) 5473 NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); 5474 else 5475 NumBytes = LinkageSize; 5476 5477 // Tail call needs the stack to be aligned. 
5478 if (getTargetMachine().Options.GuaranteedTailCallOpt && 5479 CallConv == CallingConv::Fast) 5480 NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes); 5481 5482 int SPDiff = 0; 5483 5484 // Calculate by how many bytes the stack has to be adjusted in case of tail 5485 // call optimization. 5486 if (!IsSibCall) 5487 SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); 5488 5489 // To protect arguments on the stack from being clobbered in a tail call, 5490 // force all the loads to happen before doing any other lowering. 5491 if (isTailCall) 5492 Chain = DAG.getStackArgumentTokenFactor(Chain); 5493 5494 // Adjust the stack pointer for the new arguments... 5495 // These operations are automatically eliminated by the prolog/epilog pass 5496 if (!IsSibCall) 5497 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); 5498 SDValue CallSeqStart = Chain; 5499 5500 // Load the return address and frame pointer so it can be move somewhere else 5501 // later. 5502 SDValue LROp, FPOp; 5503 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); 5504 5505 // Set up a copy of the stack pointer for use loading and storing any 5506 // arguments that may not fit in the registers available for argument 5507 // passing. 5508 SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64); 5509 5510 // Figure out which arguments are going to go in registers, and which in 5511 // memory. Also, if this is a vararg function, floating point operations 5512 // must be stored to our stack, and loaded into integer regs as well, if 5513 // any integer regs are available for argument passing. 5514 unsigned ArgOffset = LinkageSize; 5515 5516 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 5517 SmallVector<TailCallArgumentInfo, 8> TailCallArguments; 5518 5519 SmallVector<SDValue, 8> MemOpChains; 5520 for (unsigned i = 0; i != NumOps; ++i) { 5521 SDValue Arg = OutVals[i]; 5522 ISD::ArgFlagsTy Flags = Outs[i].Flags; 5523 EVT ArgVT = Outs[i].VT; 5524 EVT OrigVT = Outs[i].ArgVT; 5525 5526 // PtrOff will be used to store the current argument to the stack if a 5527 // register cannot be found for it. 5528 SDValue PtrOff; 5529 5530 // We re-align the argument offset for each argument, except when using the 5531 // fast calling convention, when we need to make sure we do that only when 5532 // we'll actually use a stack slot. 5533 auto ComputePtrOff = [&]() { 5534 /* Respect alignment of argument on the stack. */ 5535 unsigned Align = 5536 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); 5537 ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; 5538 5539 PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType()); 5540 5541 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); 5542 }; 5543 5544 if (CallConv != CallingConv::Fast) { 5545 ComputePtrOff(); 5546 5547 /* Compute GPR index associated with argument offset. */ 5548 GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; 5549 GPR_idx = std::min(GPR_idx, NumGPRs); 5550 } 5551 5552 // Promote integers to 64-bit values. 5553 if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) { 5554 // FIXME: Should this use ANY_EXTEND if neither sext nor zext? 5555 unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 5556 Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); 5557 } 5558 5559 // FIXME memcpy is used way more than necessary. Correctness first. 5560 // Note: "by value" is code for passing a structure by value, not 5561 // basic types. 
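// The cases below: aggregates of 1, 2, or 4 bytes are loaded directly into a
// GPR when one is free; other aggregates smaller than 8 bytes are staged
// right-justified in their parameter slot (on big-endian) and either left in
// memory or reloaded into a GPR; aggregates of 8 bytes or more are copied to
// the parameter save area and as many leading doublewords as fit are also
// passed in GPRs.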
5562 if (Flags.isByVal()) { 5563 // Note: Size includes alignment padding, so 5564 // struct x { short a; char b; } 5565 // will have Size = 4. With #pragma pack(1), it will have Size = 3. 5566 // These are the proper values we need for right-justifying the 5567 // aggregate in a parameter register. 5568 unsigned Size = Flags.getByValSize(); 5569 5570 // An empty aggregate parameter takes up no storage and no 5571 // registers. 5572 if (Size == 0) 5573 continue; 5574 5575 if (CallConv == CallingConv::Fast) 5576 ComputePtrOff(); 5577 5578 // All aggregates smaller than 8 bytes must be passed right-justified. 5579 if (Size==1 || Size==2 || Size==4) { 5580 EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32); 5581 if (GPR_idx != NumGPRs) { 5582 SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, 5583 MachinePointerInfo(), VT); 5584 MemOpChains.push_back(Load.getValue(1)); 5585 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 5586 5587 ArgOffset += PtrByteSize; 5588 continue; 5589 } 5590 } 5591 5592 if (GPR_idx == NumGPRs && Size < 8) { 5593 SDValue AddPtr = PtrOff; 5594 if (!isLittleEndian) { 5595 SDValue Const = DAG.getConstant(PtrByteSize - Size, dl, 5596 PtrOff.getValueType()); 5597 AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); 5598 } 5599 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, 5600 CallSeqStart, 5601 Flags, DAG, dl); 5602 ArgOffset += PtrByteSize; 5603 continue; 5604 } 5605 // Copy entire object into memory. There are cases where gcc-generated 5606 // code assumes it is there, even if it could be put entirely into 5607 // registers. (This is not what the doc says.) 5608 5609 // FIXME: The above statement is likely due to a misunderstanding of the 5610 // documents. All arguments must be copied into the parameter area BY 5611 // THE CALLEE in the event that the callee takes the address of any 5612 // formal argument. That has not yet been implemented. However, it is 5613 // reasonable to use the stack area as a staging area for the register 5614 // load. 5615 5616 // Skip this for small aggregates, as we will use the same slot for a 5617 // right-justified copy, below. 5618 if (Size >= 8) 5619 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, 5620 CallSeqStart, 5621 Flags, DAG, dl); 5622 5623 // When a register is available, pass a small aggregate right-justified. 5624 if (Size < 8 && GPR_idx != NumGPRs) { 5625 // The easiest way to get this right-justified in a register 5626 // is to copy the structure into the rightmost portion of a 5627 // local variable slot, then load the whole slot into the 5628 // register. 5629 // FIXME: The memcpy seems to produce pretty awful code for 5630 // small aggregates, particularly for packed ones. 5631 // FIXME: It would be preferable to use the slot in the 5632 // parameter save area instead of a new local variable. 5633 SDValue AddPtr = PtrOff; 5634 if (!isLittleEndian) { 5635 SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType()); 5636 AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); 5637 } 5638 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, 5639 CallSeqStart, 5640 Flags, DAG, dl); 5641 5642 // Load the slot into the register. 5643 SDValue Load = 5644 DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo()); 5645 MemOpChains.push_back(Load.getValue(1)); 5646 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 5647 5648 // Done with this argument. 
5649 ArgOffset += PtrByteSize; 5650 continue; 5651 } 5652 5653 // For aggregates larger than PtrByteSize, copy the pieces of the 5654 // object that fit into registers from the parameter save area. 5655 for (unsigned j=0; j<Size; j+=PtrByteSize) { 5656 SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType()); 5657 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 5658 if (GPR_idx != NumGPRs) { 5659 SDValue Load = 5660 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo()); 5661 MemOpChains.push_back(Load.getValue(1)); 5662 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 5663 ArgOffset += PtrByteSize; 5664 } else { 5665 ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize; 5666 break; 5667 } 5668 } 5669 continue; 5670 } 5671 5672 switch (Arg.getSimpleValueType().SimpleTy) { 5673 default: llvm_unreachable("Unexpected ValueType for argument!"); 5674 case MVT::i1: 5675 case MVT::i32: 5676 case MVT::i64: 5677 if (Flags.isNest()) { 5678 // The 'nest' parameter, if any, is passed in R11. 5679 RegsToPass.push_back(std::make_pair(PPC::X11, Arg)); 5680 hasNest = true; 5681 break; 5682 } 5683 5684 // These can be scalar arguments or elements of an integer array type 5685 // passed directly. Clang may use those instead of "byval" aggregate 5686 // types to avoid forcing arguments to memory unnecessarily. 5687 if (GPR_idx != NumGPRs) { 5688 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); 5689 } else { 5690 if (CallConv == CallingConv::Fast) 5691 ComputePtrOff(); 5692 5693 assert(HasParameterArea && 5694 "Parameter area must exist to pass an argument in memory."); 5695 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 5696 true, isTailCall, false, MemOpChains, 5697 TailCallArguments, dl); 5698 if (CallConv == CallingConv::Fast) 5699 ArgOffset += PtrByteSize; 5700 } 5701 if (CallConv != CallingConv::Fast) 5702 ArgOffset += PtrByteSize; 5703 break; 5704 case MVT::f32: 5705 case MVT::f64: { 5706 // These can be scalar arguments or elements of a float array type 5707 // passed directly. The latter are used to implement ELFv2 homogenous 5708 // float aggregates. 5709 5710 // Named arguments go into FPRs first, and once they overflow, the 5711 // remaining arguments go into GPRs and then the parameter save area. 5712 // Unnamed arguments for vararg functions always go to GPRs and 5713 // then the parameter save area. For now, put all arguments to vararg 5714 // routines always in both locations (FPR *and* GPR or stack slot). 5715 bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs; 5716 bool NeededLoad = false; 5717 5718 // First load the argument into the next available FPR. 5719 if (FPR_idx != NumFPRs) 5720 RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); 5721 5722 // Next, load the argument into GPR or stack slot if needed. 5723 if (!NeedGPROrStack) 5724 ; 5725 else if (GPR_idx != NumGPRs && CallConv != CallingConv::Fast) { 5726 // FIXME: We may want to re-enable this for CallingConv::Fast on the P8 5727 // once we support fp <-> gpr moves. 5728 5729 // In the non-vararg case, this can only ever happen in the 5730 // presence of f32 array types, since otherwise we never run 5731 // out of FPRs before running out of GPRs. 5732 SDValue ArgVal; 5733 5734 // Double values are always passed in a single GPR. 5735 if (Arg.getValueType() != MVT::f32) { 5736 ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 5737 5738 // Non-array float values are extended and passed in a GPR. 
5739 } else if (!Flags.isInConsecutiveRegs()) { 5740 ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); 5741 ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); 5742 5743 // If we have an array of floats, we collect every odd element 5744 // together with its predecessor into one GPR. 5745 } else if (ArgOffset % PtrByteSize != 0) { 5746 SDValue Lo, Hi; 5747 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]); 5748 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); 5749 if (!isLittleEndian) 5750 std::swap(Lo, Hi); 5751 ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 5752 5753 // The final element, if even, goes into the first half of a GPR. 5754 } else if (Flags.isInConsecutiveRegsLast()) { 5755 ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); 5756 ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); 5757 if (!isLittleEndian) 5758 ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal, 5759 DAG.getConstant(32, dl, MVT::i32)); 5760 5761 // Non-final even elements are skipped; they will be handled 5762 // together the with subsequent argument on the next go-around. 5763 } else 5764 ArgVal = SDValue(); 5765 5766 if (ArgVal.getNode()) 5767 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal)); 5768 } else { 5769 if (CallConv == CallingConv::Fast) 5770 ComputePtrOff(); 5771 5772 // Single-precision floating-point values are mapped to the 5773 // second (rightmost) word of the stack doubleword. 5774 if (Arg.getValueType() == MVT::f32 && 5775 !isLittleEndian && !Flags.isInConsecutiveRegs()) { 5776 SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType()); 5777 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); 5778 } 5779 5780 assert(HasParameterArea && 5781 "Parameter area must exist to pass an argument in memory."); 5782 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 5783 true, isTailCall, false, MemOpChains, 5784 TailCallArguments, dl); 5785 5786 NeededLoad = true; 5787 } 5788 // When passing an array of floats, the array occupies consecutive 5789 // space in the argument area; only round up to the next doubleword 5790 // at the end of the array. Otherwise, each float takes 8 bytes. 5791 if (CallConv != CallingConv::Fast || NeededLoad) { 5792 ArgOffset += (Arg.getValueType() == MVT::f32 && 5793 Flags.isInConsecutiveRegs()) ? 4 : 8; 5794 if (Flags.isInConsecutiveRegsLast()) 5795 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 5796 } 5797 break; 5798 } 5799 case MVT::v4f32: 5800 case MVT::v4i32: 5801 case MVT::v8i16: 5802 case MVT::v16i8: 5803 case MVT::v2f64: 5804 case MVT::v2i64: 5805 case MVT::v1i128: 5806 if (!Subtarget.hasQPX()) { 5807 // These can be scalar arguments or elements of a vector array type 5808 // passed directly. The latter are used to implement ELFv2 homogenous 5809 // vector aggregates. 5810 5811 // For a varargs call, named arguments go into VRs or on the stack as 5812 // usual; unnamed arguments always go to the stack or the corresponding 5813 // GPRs when within range. For now, we always put the value in both 5814 // locations (or even all three). 5815 if (isVarArg) { 5816 assert(HasParameterArea && 5817 "Parameter area must exist if we have a varargs call."); 5818 // We could elide this store in the case where the object fits 5819 // entirely in R registers. Maybe later. 
5820 SDValue Store = 5821 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); 5822 MemOpChains.push_back(Store); 5823 if (VR_idx != NumVRs) { 5824 SDValue Load = 5825 DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo()); 5826 MemOpChains.push_back(Load.getValue(1)); 5827 RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load)); 5828 } 5829 ArgOffset += 16; 5830 for (unsigned i=0; i<16; i+=PtrByteSize) { 5831 if (GPR_idx == NumGPRs) 5832 break; 5833 SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, 5834 DAG.getConstant(i, dl, PtrVT)); 5835 SDValue Load = 5836 DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); 5837 MemOpChains.push_back(Load.getValue(1)); 5838 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 5839 } 5840 break; 5841 } 5842 5843 // Non-varargs Altivec params go into VRs or on the stack. 5844 if (VR_idx != NumVRs) { 5845 RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg)); 5846 } else { 5847 if (CallConv == CallingConv::Fast) 5848 ComputePtrOff(); 5849 5850 assert(HasParameterArea && 5851 "Parameter area must exist to pass an argument in memory."); 5852 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 5853 true, isTailCall, true, MemOpChains, 5854 TailCallArguments, dl); 5855 if (CallConv == CallingConv::Fast) 5856 ArgOffset += 16; 5857 } 5858 5859 if (CallConv != CallingConv::Fast) 5860 ArgOffset += 16; 5861 break; 5862 } // not QPX 5863 5864 assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 && 5865 "Invalid QPX parameter type"); 5866 5867 /* fall through */ 5868 case MVT::v4f64: 5869 case MVT::v4i1: { 5870 bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32; 5871 if (isVarArg) { 5872 assert(HasParameterArea && 5873 "Parameter area must exist if we have a varargs call."); 5874 // We could elide this store in the case where the object fits 5875 // entirely in R registers. Maybe later. 5876 SDValue Store = 5877 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); 5878 MemOpChains.push_back(Store); 5879 if (QFPR_idx != NumQFPRs) { 5880 SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store, 5881 PtrOff, MachinePointerInfo()); 5882 MemOpChains.push_back(Load.getValue(1)); 5883 RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load)); 5884 } 5885 ArgOffset += (IsF32 ? 16 : 32); 5886 for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) { 5887 if (GPR_idx == NumGPRs) 5888 break; 5889 SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, 5890 DAG.getConstant(i, dl, PtrVT)); 5891 SDValue Load = 5892 DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); 5893 MemOpChains.push_back(Load.getValue(1)); 5894 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 5895 } 5896 break; 5897 } 5898 5899 // Non-varargs QPX params go into registers or on the stack. 5900 if (QFPR_idx != NumQFPRs) { 5901 RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg)); 5902 } else { 5903 if (CallConv == CallingConv::Fast) 5904 ComputePtrOff(); 5905 5906 assert(HasParameterArea && 5907 "Parameter area must exist to pass an argument in memory."); 5908 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 5909 true, isTailCall, true, MemOpChains, 5910 TailCallArguments, dl); 5911 if (CallConv == CallingConv::Fast) 5912 ArgOffset += (IsF32 ? 16 : 32); 5913 } 5914 5915 if (CallConv != CallingConv::Fast) 5916 ArgOffset += (IsF32 ? 
16 : 32);
      break;
    }
    }
  }

  assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&
         "mismatch in size of parameter area");
  (void)NumBytesActuallyUsed;

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // Check if this is an indirect call (MTCTR/BCTRL).
  // See PrepareCall() for more information about calls through function
  // pointers in the 64-bit SVR4 ABI.
  if (!isTailCall && !isPatchPoint &&
      !isFunctionGlobalAddress(Callee) &&
      !isa<ExternalSymbolSDNode>(Callee)) {
    // Load r2 into a virtual register and store it to the TOC save area.
    setUsesTOCBasePtr(DAG);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
    // TOC save area offset.
    unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
    SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
    Chain = DAG.getStore(
        Val.getValue(1), dl, Val, AddPtr,
        MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
    // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
    // This does not mean the MTCTR instruction must use R12; it's easier
    // to model this as an extra parameter, so do that.
    if (isELFv2ABI && !isPatchPoint)
      RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  if (isTailCall && !IsSibCall)
    PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
                    TailCallArguments);

  return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest,
                    DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee,
                    SPDiff, NumBytes, Ins, InVals, CS);
}

SDValue PPCTargetLowering::LowerCall_Darwin(
    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
    bool isTailCall, bool isPatchPoint,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    ImmutableCallSite CS) const {
  unsigned NumOps = Outs.size();

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  bool isPPC64 = PtrVT == MVT::i64;
  unsigned PtrByteSize = isPPC64 ? 8 : 4;

  MachineFunction &MF = DAG.getMachineFunction();

  // Mark this function as potentially containing a tail call. As a
  // consequence, the frame pointer will be used for dynamic stack allocation
  // and for restoring the caller's stack pointer in this function's epilogue.
  // This is done because a tail-called function might overwrite the value in
  // this function's (MF) stack pointer save slot, 0(SP).
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

  // Count how many bytes are to be pushed on the stack, including the linkage
  // area and the parameter passing area. We start with 24/48 bytes, which is
  // pre-reserved space for [SP][CR][LR][3 x unused].
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  unsigned NumBytes = LinkageSize;

  // Add up all the space actually used.
  // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
  // they all go in registers, but we must reserve stack space for them for
  // possible use by the caller. In varargs or 64-bit calls, parameters are
  // assigned stack space in order, with padding so Altivec parameters are
  // 16-byte aligned.
  unsigned nAltivecParamsAtEnd = 0;
  for (unsigned i = 0; i != NumOps; ++i) {
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    EVT ArgVT = Outs[i].VT;
    // Varargs Altivec parameters are padded to a 16 byte boundary.
    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
      if (!isVarArg && !isPPC64) {
        // Non-varargs Altivec parameters go after all the non-Altivec
        // parameters; handle those later so we know how much padding we need.
        nAltivecParamsAtEnd++;
        continue;
      }
      // Varargs and 64-bit Altivec parameters are padded to a 16-byte boundary.
      NumBytes = ((NumBytes+15)/16)*16;
    }
    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
  }

  // Allow for Altivec parameters at the end, if needed.
  if (nAltivecParamsAtEnd) {
    NumBytes = ((NumBytes+15)/16)*16;
    NumBytes += 16*nAltivecParamsAtEnd;
  }

  // The prolog code of the callee may store up to 8 GPR argument registers to
  // the stack, allowing va_start to index over them in memory if it is a
  // varargs function. Because we cannot tell if this is needed on the caller
  // side, we have to conservatively assume that it is needed. As such, make
  // sure we have at least enough stack space for the caller to store the 8
  // GPRs.
  NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);

  // Tail call needs the stack to be aligned.
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);

  // Calculate by how many bytes the stack has to be adjusted in case of tail
  // call optimization.
  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);

  // To protect arguments on the stack from being clobbered in a tail call,
  // force all the loads to happen before doing any other lowering.
  if (isTailCall)
    Chain = DAG.getStackArgumentTokenFactor(Chain);

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass.
  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
  SDValue CallSeqStart = Chain;

  // Load the return address and frame pointer so they can be moved somewhere
  // else later.
6061 SDValue LROp, FPOp; 6062 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); 6063 6064 // Set up a copy of the stack pointer for use loading and storing any 6065 // arguments that may not fit in the registers available for argument 6066 // passing. 6067 SDValue StackPtr; 6068 if (isPPC64) 6069 StackPtr = DAG.getRegister(PPC::X1, MVT::i64); 6070 else 6071 StackPtr = DAG.getRegister(PPC::R1, MVT::i32); 6072 6073 // Figure out which arguments are going to go in registers, and which in 6074 // memory. Also, if this is a vararg function, floating point operations 6075 // must be stored to our stack, and loaded into integer regs as well, if 6076 // any integer regs are available for argument passing. 6077 unsigned ArgOffset = LinkageSize; 6078 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; 6079 6080 static const MCPhysReg GPR_32[] = { // 32-bit registers. 6081 PPC::R3, PPC::R4, PPC::R5, PPC::R6, 6082 PPC::R7, PPC::R8, PPC::R9, PPC::R10, 6083 }; 6084 static const MCPhysReg GPR_64[] = { // 64-bit registers. 6085 PPC::X3, PPC::X4, PPC::X5, PPC::X6, 6086 PPC::X7, PPC::X8, PPC::X9, PPC::X10, 6087 }; 6088 static const MCPhysReg VR[] = { 6089 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, 6090 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 6091 }; 6092 const unsigned NumGPRs = array_lengthof(GPR_32); 6093 const unsigned NumFPRs = 13; 6094 const unsigned NumVRs = array_lengthof(VR); 6095 6096 const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; 6097 6098 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 6099 SmallVector<TailCallArgumentInfo, 8> TailCallArguments; 6100 6101 SmallVector<SDValue, 8> MemOpChains; 6102 for (unsigned i = 0; i != NumOps; ++i) { 6103 SDValue Arg = OutVals[i]; 6104 ISD::ArgFlagsTy Flags = Outs[i].Flags; 6105 6106 // PtrOff will be used to store the current argument to the stack if a 6107 // register cannot be found for it. 6108 SDValue PtrOff; 6109 6110 PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType()); 6111 6112 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); 6113 6114 // On PPC64, promote integers to 64-bit values. 6115 if (isPPC64 && Arg.getValueType() == MVT::i32) { 6116 // FIXME: Should this use ANY_EXTEND if neither sext nor zext? 6117 unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 6118 Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); 6119 } 6120 6121 // FIXME memcpy is used way more than necessary. Correctness first. 6122 // Note: "by value" is code for passing a structure by value, not 6123 // basic types. 6124 if (Flags.isByVal()) { 6125 unsigned Size = Flags.getByValSize(); 6126 // Very small objects are passed right-justified. Everything else is 6127 // passed left-justified. 6128 if (Size==1 || Size==2) { 6129 EVT VT = (Size==1) ? MVT::i8 : MVT::i16; 6130 if (GPR_idx != NumGPRs) { 6131 SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, 6132 MachinePointerInfo(), VT); 6133 MemOpChains.push_back(Load.getValue(1)); 6134 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6135 6136 ArgOffset += PtrByteSize; 6137 } else { 6138 SDValue Const = DAG.getConstant(PtrByteSize - Size, dl, 6139 PtrOff.getValueType()); 6140 SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); 6141 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, 6142 CallSeqStart, 6143 Flags, DAG, dl); 6144 ArgOffset += PtrByteSize; 6145 } 6146 continue; 6147 } 6148 // Copy entire object into memory. 
There are cases where gcc-generated 6149 // code assumes it is there, even if it could be put entirely into 6150 // registers. (This is not what the doc says.) 6151 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, 6152 CallSeqStart, 6153 Flags, DAG, dl); 6154 6155 // For small aggregates (Darwin only) and aggregates >= PtrByteSize, 6156 // copy the pieces of the object that fit into registers from the 6157 // parameter save area. 6158 for (unsigned j=0; j<Size; j+=PtrByteSize) { 6159 SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType()); 6160 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 6161 if (GPR_idx != NumGPRs) { 6162 SDValue Load = 6163 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo()); 6164 MemOpChains.push_back(Load.getValue(1)); 6165 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6166 ArgOffset += PtrByteSize; 6167 } else { 6168 ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize; 6169 break; 6170 } 6171 } 6172 continue; 6173 } 6174 6175 switch (Arg.getSimpleValueType().SimpleTy) { 6176 default: llvm_unreachable("Unexpected ValueType for argument!"); 6177 case MVT::i1: 6178 case MVT::i32: 6179 case MVT::i64: 6180 if (GPR_idx != NumGPRs) { 6181 if (Arg.getValueType() == MVT::i1) 6182 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg); 6183 6184 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); 6185 } else { 6186 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6187 isPPC64, isTailCall, false, MemOpChains, 6188 TailCallArguments, dl); 6189 } 6190 ArgOffset += PtrByteSize; 6191 break; 6192 case MVT::f32: 6193 case MVT::f64: 6194 if (FPR_idx != NumFPRs) { 6195 RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); 6196 6197 if (isVarArg) { 6198 SDValue Store = 6199 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); 6200 MemOpChains.push_back(Store); 6201 6202 // Float varargs are always shadowed in available integer registers 6203 if (GPR_idx != NumGPRs) { 6204 SDValue Load = 6205 DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo()); 6206 MemOpChains.push_back(Load.getValue(1)); 6207 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6208 } 6209 if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){ 6210 SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType()); 6211 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); 6212 SDValue Load = 6213 DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo()); 6214 MemOpChains.push_back(Load.getValue(1)); 6215 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6216 } 6217 } else { 6218 // If we have any FPRs remaining, we may also have GPRs remaining. 6219 // Args passed in FPRs consume either 1 (f32) or 2 (f64) available 6220 // GPRs. 6221 if (GPR_idx != NumGPRs) 6222 ++GPR_idx; 6223 if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && 6224 !isPPC64) // PPC64 has 64-bit GPR's obviously :) 6225 ++GPR_idx; 6226 } 6227 } else 6228 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6229 isPPC64, isTailCall, false, MemOpChains, 6230 TailCallArguments, dl); 6231 if (isPPC64) 6232 ArgOffset += 8; 6233 else 6234 ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8; 6235 break; 6236 case MVT::v4f32: 6237 case MVT::v4i32: 6238 case MVT::v8i16: 6239 case MVT::v16i8: 6240 if (isVarArg) { 6241 // These go aligned on the stack, or in the corresponding R registers 6242 // when within range. 
The Darwin PPC ABI doc claims they also go in 6243 // V registers; in fact gcc does this only for arguments that are 6244 // prototyped, not for those that match the ... We do it for all 6245 // arguments, seems to work. 6246 while (ArgOffset % 16 !=0) { 6247 ArgOffset += PtrByteSize; 6248 if (GPR_idx != NumGPRs) 6249 GPR_idx++; 6250 } 6251 // We could elide this store in the case where the object fits 6252 // entirely in R registers. Maybe later. 6253 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, 6254 DAG.getConstant(ArgOffset, dl, PtrVT)); 6255 SDValue Store = 6256 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); 6257 MemOpChains.push_back(Store); 6258 if (VR_idx != NumVRs) { 6259 SDValue Load = 6260 DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo()); 6261 MemOpChains.push_back(Load.getValue(1)); 6262 RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load)); 6263 } 6264 ArgOffset += 16; 6265 for (unsigned i=0; i<16; i+=PtrByteSize) { 6266 if (GPR_idx == NumGPRs) 6267 break; 6268 SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, 6269 DAG.getConstant(i, dl, PtrVT)); 6270 SDValue Load = 6271 DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); 6272 MemOpChains.push_back(Load.getValue(1)); 6273 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6274 } 6275 break; 6276 } 6277 6278 // Non-varargs Altivec params generally go in registers, but have 6279 // stack space allocated at the end. 6280 if (VR_idx != NumVRs) { 6281 // Doesn't have GPR space allocated. 6282 RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg)); 6283 } else if (nAltivecParamsAtEnd==0) { 6284 // We are emitting Altivec params in order. 6285 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6286 isPPC64, isTailCall, true, MemOpChains, 6287 TailCallArguments, dl); 6288 ArgOffset += 16; 6289 } 6290 break; 6291 } 6292 } 6293 // If all Altivec parameters fit in registers, as they usually do, 6294 // they get stack space following the non-Altivec parameters. We 6295 // don't track this here because nobody below needs it. 6296 // If there are more Altivec parameters than fit in registers emit 6297 // the stores here. 6298 if (!isVarArg && nAltivecParamsAtEnd > NumVRs) { 6299 unsigned j = 0; 6300 // Offset is aligned; skip 1st 12 params which go in V registers. 6301 ArgOffset = ((ArgOffset+15)/16)*16; 6302 ArgOffset += 12*16; 6303 for (unsigned i = 0; i != NumOps; ++i) { 6304 SDValue Arg = OutVals[i]; 6305 EVT ArgType = Outs[i].VT; 6306 if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 || 6307 ArgType==MVT::v8i16 || ArgType==MVT::v16i8) { 6308 if (++j > NumVRs) { 6309 SDValue PtrOff; 6310 // We are emitting Altivec params in order. 6311 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6312 isPPC64, isTailCall, true, MemOpChains, 6313 TailCallArguments, dl); 6314 ArgOffset += 16; 6315 } 6316 } 6317 } 6318 } 6319 6320 if (!MemOpChains.empty()) 6321 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 6322 6323 // On Darwin, R12 must contain the address of an indirect callee. This does 6324 // not mean the MTCTR instruction must use R12; it's easier to model this as 6325 // an extra parameter, so do that. 6326 if (!isTailCall && 6327 !isFunctionGlobalAddress(Callee) && 6328 !isa<ExternalSymbolSDNode>(Callee) && 6329 !isBLACompatibleAddress(Callee, DAG)) 6330 RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? 
PPC::X12 : 6331 PPC::R12), Callee)); 6332 6333 // Build a sequence of copy-to-reg nodes chained together with token chain 6334 // and flag operands which copy the outgoing args into the appropriate regs. 6335 SDValue InFlag; 6336 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 6337 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 6338 RegsToPass[i].second, InFlag); 6339 InFlag = Chain.getValue(1); 6340 } 6341 6342 if (isTailCall) 6343 PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, 6344 TailCallArguments); 6345 6346 return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, 6347 /* unused except on PPC64 ELFv1 */ false, DAG, 6348 RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, 6349 NumBytes, Ins, InVals, CS); 6350 } 6351 6352 bool 6353 PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 6354 MachineFunction &MF, bool isVarArg, 6355 const SmallVectorImpl<ISD::OutputArg> &Outs, 6356 LLVMContext &Context) const { 6357 SmallVector<CCValAssign, 16> RVLocs; 6358 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 6359 return CCInfo.CheckReturn(Outs, RetCC_PPC); 6360 } 6361 6362 SDValue 6363 PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 6364 bool isVarArg, 6365 const SmallVectorImpl<ISD::OutputArg> &Outs, 6366 const SmallVectorImpl<SDValue> &OutVals, 6367 const SDLoc &dl, SelectionDAG &DAG) const { 6368 SmallVector<CCValAssign, 16> RVLocs; 6369 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 6370 *DAG.getContext()); 6371 CCInfo.AnalyzeReturn(Outs, RetCC_PPC); 6372 6373 SDValue Flag; 6374 SmallVector<SDValue, 4> RetOps(1, Chain); 6375 6376 // Copy the result values into the output registers. 6377 for (unsigned i = 0; i != RVLocs.size(); ++i) { 6378 CCValAssign &VA = RVLocs[i]; 6379 assert(VA.isRegLoc() && "Can only return in registers!"); 6380 6381 SDValue Arg = OutVals[i]; 6382 6383 switch (VA.getLocInfo()) { 6384 default: llvm_unreachable("Unknown loc info!"); 6385 case CCValAssign::Full: break; 6386 case CCValAssign::AExt: 6387 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 6388 break; 6389 case CCValAssign::ZExt: 6390 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 6391 break; 6392 case CCValAssign::SExt: 6393 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 6394 break; 6395 } 6396 6397 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 6398 Flag = Chain.getValue(1); 6399 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 6400 } 6401 6402 const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); 6403 const MCPhysReg *I = 6404 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 6405 if (I) { 6406 for (; *I; ++I) { 6407 6408 if (PPC::G8RCRegClass.contains(*I)) 6409 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 6410 else if (PPC::F8RCRegClass.contains(*I)) 6411 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 6412 else if (PPC::CRRCRegClass.contains(*I)) 6413 RetOps.push_back(DAG.getRegister(*I, MVT::i1)); 6414 else if (PPC::VRRCRegClass.contains(*I)) 6415 RetOps.push_back(DAG.getRegister(*I, MVT::Other)); 6416 else 6417 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 6418 } 6419 } 6420 6421 RetOps[0] = Chain; // Update chain. 6422 6423 // Add the flag if we have it. 
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
}

SDValue
PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc dl(Op);

  // Get the correct type for integers.
  EVT IntVT = Op.getValueType();

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
  // Build a DYNAREAOFFSET node.
  SDValue Ops[2] = {Chain, FPSIdx};
  SDVTList VTs = DAG.getVTList(IntVT);
  return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops);
}

SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
                                             SelectionDAG &DAG) const {
  // When we pop the dynamic allocation we need to restore the SP link.
  SDLoc dl(Op);

  // Get the correct type for pointers.
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  // Construct the stack pointer operand.
  bool isPPC64 = Subtarget.isPPC64();
  unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
  SDValue StackPtr = DAG.getRegister(SP, PtrVT);

  // Get the operands for the STACKRESTORE.
  SDValue Chain = Op.getOperand(0);
  SDValue SaveSP = Op.getOperand(1);

  // Load the old link SP.
  SDValue LoadLinkSP =
      DAG.getLoad(PtrVT, dl, Chain, StackPtr, MachinePointerInfo());

  // Restore the stack pointer.
  Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);

  // Store the old link SP.
  return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo());
}

SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool isPPC64 = Subtarget.isPPC64();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Get the current return address save index.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  int RASI = FI->getReturnAddrSaveIndex();

  // If the return address save index hasn't been defined yet.
  if (!RASI) {
    // Find out the fixed offset of the return address save area.
    int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
    // Allocate the frame index for the return address save area.
    RASI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, LROffset, false);
    // Save the result.
    FI->setReturnAddrSaveIndex(RASI);
  }
  return DAG.getFrameIndex(RASI, PtrVT);
}

SDValue
PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool isPPC64 = Subtarget.isPPC64();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Get the current frame pointer save index. The users of this index will be
  // primarily DYNALLOC instructions.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  int FPSI = FI->getFramePointerSaveIndex();

  // If the frame pointer save index hasn't been defined yet.
  if (!FPSI) {
    // Find out the fixed offset of the frame pointer save area.
    int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
    // Allocate the frame index for the frame pointer save area.
    FPSI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
    // Save the result.
6515 FI->setFramePointerSaveIndex(FPSI); 6516 } 6517 return DAG.getFrameIndex(FPSI, PtrVT); 6518 } 6519 6520 SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 6521 SelectionDAG &DAG) const { 6522 // Get the inputs. 6523 SDValue Chain = Op.getOperand(0); 6524 SDValue Size = Op.getOperand(1); 6525 SDLoc dl(Op); 6526 6527 // Get the correct type for pointers. 6528 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 6529 // Negate the size. 6530 SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT, 6531 DAG.getConstant(0, dl, PtrVT), Size); 6532 // Construct a node for the frame pointer save index. 6533 SDValue FPSIdx = getFramePointerFrameIndex(DAG); 6534 // Build a DYNALLOC node. 6535 SDValue Ops[3] = { Chain, NegSize, FPSIdx }; 6536 SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other); 6537 return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops); 6538 } 6539 6540 SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op, 6541 SelectionDAG &DAG) const { 6542 MachineFunction &MF = DAG.getMachineFunction(); 6543 6544 bool isPPC64 = Subtarget.isPPC64(); 6545 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 6546 6547 int FI = MF.getFrameInfo().CreateFixedObject(isPPC64 ? 8 : 4, 0, false); 6548 return DAG.getFrameIndex(FI, PtrVT); 6549 } 6550 6551 SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, 6552 SelectionDAG &DAG) const { 6553 SDLoc DL(Op); 6554 return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL, 6555 DAG.getVTList(MVT::i32, MVT::Other), 6556 Op.getOperand(0), Op.getOperand(1)); 6557 } 6558 6559 SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, 6560 SelectionDAG &DAG) const { 6561 SDLoc DL(Op); 6562 return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other, 6563 Op.getOperand(0), Op.getOperand(1)); 6564 } 6565 6566 SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 6567 if (Op.getValueType().isVector()) 6568 return LowerVectorLoad(Op, DAG); 6569 6570 assert(Op.getValueType() == MVT::i1 && 6571 "Custom lowering only for i1 loads"); 6572 6573 // First, load 8 bits into 32 bits, then truncate to 1 bit. 6574 6575 SDLoc dl(Op); 6576 LoadSDNode *LD = cast<LoadSDNode>(Op); 6577 6578 SDValue Chain = LD->getChain(); 6579 SDValue BasePtr = LD->getBasePtr(); 6580 MachineMemOperand *MMO = LD->getMemOperand(); 6581 6582 SDValue NewLD = 6583 DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain, 6584 BasePtr, MVT::i8, MMO); 6585 SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD); 6586 6587 SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) }; 6588 return DAG.getMergeValues(Ops, dl); 6589 } 6590 6591 SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 6592 if (Op.getOperand(1).getValueType().isVector()) 6593 return LowerVectorStore(Op, DAG); 6594 6595 assert(Op.getOperand(1).getValueType() == MVT::i1 && 6596 "Custom lowering only for i1 stores"); 6597 6598 // First, zero extend to 32 bits, then use a truncating store to 8 bits. 
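  // For example, a DAG-level sketch of what is built here:
  //   (store i1 %b, ptr)  ==>  (truncstore i8 (zero_extend iPTR %b), ptr)
  // so only the low byte of the extended value ever reaches memory.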
6599 6600 SDLoc dl(Op); 6601 StoreSDNode *ST = cast<StoreSDNode>(Op); 6602 6603 SDValue Chain = ST->getChain(); 6604 SDValue BasePtr = ST->getBasePtr(); 6605 SDValue Value = ST->getValue(); 6606 MachineMemOperand *MMO = ST->getMemOperand(); 6607 6608 Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()), 6609 Value); 6610 return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO); 6611 } 6612 6613 // FIXME: Remove this once the ANDI glue bug is fixed: 6614 SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { 6615 assert(Op.getValueType() == MVT::i1 && 6616 "Custom lowering only for i1 results"); 6617 6618 SDLoc DL(Op); 6619 return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, DL, MVT::i1, 6620 Op.getOperand(0)); 6621 } 6622 6623 /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when 6624 /// possible. 6625 SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 6626 // Not FP? Not a fsel. 6627 if (!Op.getOperand(0).getValueType().isFloatingPoint() || 6628 !Op.getOperand(2).getValueType().isFloatingPoint()) 6629 return Op; 6630 6631 // We might be able to do better than this under some circumstances, but in 6632 // general, fsel-based lowering of select is a finite-math-only optimization. 6633 // For more information, see section F.3 of the 2.06 ISA specification. 6634 if (!DAG.getTarget().Options.NoInfsFPMath || 6635 !DAG.getTarget().Options.NoNaNsFPMath) 6636 return Op; 6637 // TODO: Propagate flags from the select rather than global settings. 6638 SDNodeFlags Flags; 6639 Flags.setNoInfs(true); 6640 Flags.setNoNaNs(true); 6641 6642 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 6643 6644 EVT ResVT = Op.getValueType(); 6645 EVT CmpVT = Op.getOperand(0).getValueType(); 6646 SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); 6647 SDValue TV = Op.getOperand(2), FV = Op.getOperand(3); 6648 SDLoc dl(Op); 6649 6650 // If the RHS of the comparison is a 0.0, we don't need to do the 6651 // subtraction at all. 6652 SDValue Sel1; 6653 if (isFloatingPointZero(RHS)) 6654 switch (CC) { 6655 default: break; // SETUO etc aren't handled by fsel. 
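    // Note on the cases below: PPCISD::FSEL(X, TV, FV) selects TV when
    // X >= 0.0 and FV otherwise, so each ordered comparison is rewritten as a
    // sign test of LHS or -LHS here (and of LHS-RHS or RHS-LHS in the general
    // switch further down).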
6656 case ISD::SETNE: 6657 std::swap(TV, FV); 6658 LLVM_FALLTHROUGH; 6659 case ISD::SETEQ: 6660 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits 6661 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); 6662 Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); 6663 if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits 6664 Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); 6665 return DAG.getNode(PPCISD::FSEL, dl, ResVT, 6666 DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV); 6667 case ISD::SETULT: 6668 case ISD::SETLT: 6669 std::swap(TV, FV); // fsel is natively setge, swap operands for setlt 6670 LLVM_FALLTHROUGH; 6671 case ISD::SETOGE: 6672 case ISD::SETGE: 6673 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits 6674 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); 6675 return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); 6676 case ISD::SETUGT: 6677 case ISD::SETGT: 6678 std::swap(TV, FV); // fsel is natively setge, swap operands for setlt 6679 LLVM_FALLTHROUGH; 6680 case ISD::SETOLE: 6681 case ISD::SETLE: 6682 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits 6683 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); 6684 return DAG.getNode(PPCISD::FSEL, dl, ResVT, 6685 DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV); 6686 } 6687 6688 SDValue Cmp; 6689 switch (CC) { 6690 default: break; // SETUO etc aren't handled by fsel. 6691 case ISD::SETNE: 6692 std::swap(TV, FV); 6693 LLVM_FALLTHROUGH; 6694 case ISD::SETEQ: 6695 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); 6696 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6697 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6698 Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); 6699 if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits 6700 Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); 6701 return DAG.getNode(PPCISD::FSEL, dl, ResVT, 6702 DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV); 6703 case ISD::SETULT: 6704 case ISD::SETLT: 6705 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); 6706 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6707 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6708 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); 6709 case ISD::SETOGE: 6710 case ISD::SETGE: 6711 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); 6712 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6713 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6714 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); 6715 case ISD::SETUGT: 6716 case ISD::SETGT: 6717 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); 6718 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6719 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6720 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); 6721 case ISD::SETOLE: 6722 case ISD::SETLE: 6723 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); 6724 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6725 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6726 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); 6727 } 6728 return Op; 6729 } 6730 6731 void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, 6732 SelectionDAG &DAG, 6733 const SDLoc &dl) const { 6734 assert(Op.getOperand(0).getValueType().isFloatingPoint()); 6735 SDValue Src = Op.getOperand(0); 6736 if (Src.getValueType() == 
MVT::f32) 6737 Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); 6738 6739 SDValue Tmp; 6740 switch (Op.getSimpleValueType().SimpleTy) { 6741 default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); 6742 case MVT::i32: 6743 Tmp = DAG.getNode( 6744 Op.getOpcode() == ISD::FP_TO_SINT 6745 ? PPCISD::FCTIWZ 6746 : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), 6747 dl, MVT::f64, Src); 6748 break; 6749 case MVT::i64: 6750 assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && 6751 "i64 FP_TO_UINT is supported only with FPCVT"); 6752 Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : 6753 PPCISD::FCTIDUZ, 6754 dl, MVT::f64, Src); 6755 break; 6756 } 6757 6758 // Convert the FP value to an int value through memory. 6759 bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() && 6760 (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); 6761 SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); 6762 int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex(); 6763 MachinePointerInfo MPI = 6764 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); 6765 6766 // Emit a store to the stack slot. 6767 SDValue Chain; 6768 if (i32Stack) { 6769 MachineFunction &MF = DAG.getMachineFunction(); 6770 MachineMemOperand *MMO = 6771 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4); 6772 SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr }; 6773 Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, 6774 DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO); 6775 } else 6776 Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI); 6777 6778 // Result is a load from the stack slot. If loading 4 bytes, make sure to 6779 // add in a bias on big endian. 6780 if (Op.getValueType() == MVT::i32 && !i32Stack) { 6781 FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, 6782 DAG.getConstant(4, dl, FIPtr.getValueType())); 6783 MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4); 6784 } 6785 6786 RLI.Chain = Chain; 6787 RLI.Ptr = FIPtr; 6788 RLI.MPI = MPI; 6789 } 6790 6791 /// \brief Custom lowers floating point to integer conversions to use 6792 /// the direct move instructions available in ISA 2.07 to avoid the 6793 /// need for load/store combinations. 6794 SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op, 6795 SelectionDAG &DAG, 6796 const SDLoc &dl) const { 6797 assert(Op.getOperand(0).getValueType().isFloatingPoint()); 6798 SDValue Src = Op.getOperand(0); 6799 6800 if (Src.getValueType() == MVT::f32) 6801 Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); 6802 6803 SDValue Tmp; 6804 switch (Op.getSimpleValueType().SimpleTy) { 6805 default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); 6806 case MVT::i32: 6807 Tmp = DAG.getNode( 6808 Op.getOpcode() == ISD::FP_TO_SINT 6809 ? PPCISD::FCTIWZ 6810 : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), 6811 dl, MVT::f64, Src); 6812 Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp); 6813 break; 6814 case MVT::i64: 6815 assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && 6816 "i64 FP_TO_UINT is supported only with FPCVT"); 6817 Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? 
PPCISD::FCTIDZ : 6818 PPCISD::FCTIDUZ, 6819 dl, MVT::f64, Src); 6820 Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp); 6821 break; 6822 } 6823 return Tmp; 6824 } 6825 6826 SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, 6827 const SDLoc &dl) const { 6828 if (Subtarget.hasDirectMove() && Subtarget.isPPC64()) 6829 return LowerFP_TO_INTDirectMove(Op, DAG, dl); 6830 6831 ReuseLoadInfo RLI; 6832 LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); 6833 6834 return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI, 6835 RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges); 6836 } 6837 6838 // We're trying to insert a regular store, S, and then a load, L. If the 6839 // incoming value, O, is a load, we might just be able to have our load use the 6840 // address used by O. However, we don't know if anything else will store to 6841 // that address before we can load from it. To prevent this situation, we need 6842 // to insert our load, L, into the chain as a peer of O. To do this, we give L 6843 // the same chain operand as O, we create a token factor from the chain results 6844 // of O and L, and we replace all uses of O's chain result with that token 6845 // factor (see spliceIntoChain below for this last part). 6846 bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT, 6847 ReuseLoadInfo &RLI, 6848 SelectionDAG &DAG, 6849 ISD::LoadExtType ET) const { 6850 SDLoc dl(Op); 6851 if (ET == ISD::NON_EXTLOAD && 6852 (Op.getOpcode() == ISD::FP_TO_UINT || 6853 Op.getOpcode() == ISD::FP_TO_SINT) && 6854 isOperationLegalOrCustom(Op.getOpcode(), 6855 Op.getOperand(0).getValueType())) { 6856 6857 LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); 6858 return true; 6859 } 6860 6861 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op); 6862 if (!LD || LD->getExtensionType() != ET || LD->isVolatile() || 6863 LD->isNonTemporal()) 6864 return false; 6865 if (LD->getMemoryVT() != MemVT) 6866 return false; 6867 6868 RLI.Ptr = LD->getBasePtr(); 6869 if (LD->isIndexed() && !LD->getOffset().isUndef()) { 6870 assert(LD->getAddressingMode() == ISD::PRE_INC && 6871 "Non-pre-inc AM on PPC?"); 6872 RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr, 6873 LD->getOffset()); 6874 } 6875 6876 RLI.Chain = LD->getChain(); 6877 RLI.MPI = LD->getPointerInfo(); 6878 RLI.IsDereferenceable = LD->isDereferenceable(); 6879 RLI.IsInvariant = LD->isInvariant(); 6880 RLI.Alignment = LD->getAlignment(); 6881 RLI.AAInfo = LD->getAAInfo(); 6882 RLI.Ranges = LD->getRanges(); 6883 6884 RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1); 6885 return true; 6886 } 6887 6888 // Given the head of the old chain, ResChain, insert a token factor containing 6889 // it and NewResChain, and make users of ResChain now be users of that token 6890 // factor. 6891 // TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead. 
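// Illustrative shape of the splice (example names only):
//   before:  users-of(O's chain)  -->  O's chain
//   after:   users-of(O's chain)  -->  TokenFactor(O's chain, L's chain)
// so anything that was ordered after the original load O is now also ordered
// after the new load L.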
6892 void PPCTargetLowering::spliceIntoChain(SDValue ResChain, 6893 SDValue NewResChain, 6894 SelectionDAG &DAG) const { 6895 if (!ResChain) 6896 return; 6897 6898 SDLoc dl(NewResChain); 6899 6900 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6901 NewResChain, DAG.getUNDEF(MVT::Other)); 6902 assert(TF.getNode() != NewResChain.getNode() && 6903 "A new TF really is required here"); 6904 6905 DAG.ReplaceAllUsesOfValueWith(ResChain, TF); 6906 DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain); 6907 } 6908 6909 /// \brief Analyze profitability of direct move 6910 /// prefer float load to int load plus direct move 6911 /// when there is no integer use of int load 6912 bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const { 6913 SDNode *Origin = Op.getOperand(0).getNode(); 6914 if (Origin->getOpcode() != ISD::LOAD) 6915 return true; 6916 6917 // If there is no LXSIBZX/LXSIHZX, like Power8, 6918 // prefer direct move if the memory size is 1 or 2 bytes. 6919 MachineMemOperand *MMO = cast<LoadSDNode>(Origin)->getMemOperand(); 6920 if (!Subtarget.hasP9Vector() && MMO->getSize() <= 2) 6921 return true; 6922 6923 for (SDNode::use_iterator UI = Origin->use_begin(), 6924 UE = Origin->use_end(); 6925 UI != UE; ++UI) { 6926 6927 // Only look at the users of the loaded value. 6928 if (UI.getUse().get().getResNo() != 0) 6929 continue; 6930 6931 if (UI->getOpcode() != ISD::SINT_TO_FP && 6932 UI->getOpcode() != ISD::UINT_TO_FP) 6933 return true; 6934 } 6935 6936 return false; 6937 } 6938 6939 /// \brief Custom lowers integer to floating point conversions to use 6940 /// the direct move instructions available in ISA 2.07 to avoid the 6941 /// need for load/store combinations. 6942 SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op, 6943 SelectionDAG &DAG, 6944 const SDLoc &dl) const { 6945 assert((Op.getValueType() == MVT::f32 || 6946 Op.getValueType() == MVT::f64) && 6947 "Invalid floating point type as target of conversion"); 6948 assert(Subtarget.hasFPCVT() && 6949 "Int to FP conversions with direct moves require FPCVT"); 6950 SDValue FP; 6951 SDValue Src = Op.getOperand(0); 6952 bool SinglePrec = Op.getValueType() == MVT::f32; 6953 bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32; 6954 bool Signed = Op.getOpcode() == ISD::SINT_TO_FP; 6955 unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) : 6956 (SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU); 6957 6958 if (WordInt) { 6959 FP = DAG.getNode(Signed ? PPCISD::MTVSRA : PPCISD::MTVSRZ, 6960 dl, MVT::f64, Src); 6961 FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); 6962 } 6963 else { 6964 FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src); 6965 FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); 6966 } 6967 6968 return FP; 6969 } 6970 6971 SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, 6972 SelectionDAG &DAG) const { 6973 SDLoc dl(Op); 6974 6975 if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) { 6976 if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64) 6977 return SDValue(); 6978 6979 SDValue Value = Op.getOperand(0); 6980 // The values are now known to be -1 (false) or 1 (true). To convert this 6981 // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). 
6982 // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 6983 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); 6984 6985 SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); 6986 6987 Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); 6988 6989 if (Op.getValueType() != MVT::v4f64) 6990 Value = DAG.getNode(ISD::FP_ROUND, dl, 6991 Op.getValueType(), Value, 6992 DAG.getIntPtrConstant(1, dl)); 6993 return Value; 6994 } 6995 6996 // Don't handle ppc_fp128 here; let it be lowered to a libcall. 6997 if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) 6998 return SDValue(); 6999 7000 if (Op.getOperand(0).getValueType() == MVT::i1) 7001 return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0), 7002 DAG.getConstantFP(1.0, dl, Op.getValueType()), 7003 DAG.getConstantFP(0.0, dl, Op.getValueType())); 7004 7005 // If we have direct moves, we can do all the conversion, skip the store/load 7006 // however, without FPCVT we can't do most conversions. 7007 if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) && 7008 Subtarget.isPPC64() && Subtarget.hasFPCVT()) 7009 return LowerINT_TO_FPDirectMove(Op, DAG, dl); 7010 7011 assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && 7012 "UINT_TO_FP is supported only with FPCVT"); 7013 7014 // If we have FCFIDS, then use it when converting to single-precision. 7015 // Otherwise, convert to double-precision and then round. 7016 unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 7017 ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS 7018 : PPCISD::FCFIDS) 7019 : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU 7020 : PPCISD::FCFID); 7021 MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 7022 ? MVT::f32 7023 : MVT::f64; 7024 7025 if (Op.getOperand(0).getValueType() == MVT::i64) { 7026 SDValue SINT = Op.getOperand(0); 7027 // When converting to single-precision, we actually need to convert 7028 // to double-precision first and then round to single-precision. 7029 // To avoid double-rounding effects during that operation, we have 7030 // to prepare the input operand. Bits that might be truncated when 7031 // converting to double-precision are replaced by a bit that won't 7032 // be lost at this stage, but is below the single-precision rounding 7033 // position. 7034 // 7035 // However, if -enable-unsafe-fp-math is in effect, accept double 7036 // rounding to avoid the extra overhead. 7037 if (Op.getValueType() == MVT::f32 && 7038 !Subtarget.hasFPCVT() && 7039 !DAG.getTarget().Options.UnsafeFPMath) { 7040 7041 // Twiddle input to make sure the low 11 bits are zero. (If this 7042 // is the case, we are guaranteed the value will fit into the 53 bit 7043 // mantissa of an IEEE double-precision value without rounding.) 7044 // If any of those low 11 bits were not zero originally, make sure 7045 // bit 12 (value 2048) is set instead, so that the final rounding 7046 // to single-precision gets the correct result. 
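      // Illustrative example (values chosen only to show the bit manipulation):
      //   SINT  = 0x0020000000000401   (some of the low 11 bits set)
      //   Round = 0x0020000000000800   (low 11 bits cleared, bit 11 forced on)
      // The adjusted value is still exactly representable as a double, and
      // rounding it to single precision gives the same result as rounding the
      // original value directly would.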
7047 SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64, 7048 SINT, DAG.getConstant(2047, dl, MVT::i64)); 7049 Round = DAG.getNode(ISD::ADD, dl, MVT::i64, 7050 Round, DAG.getConstant(2047, dl, MVT::i64)); 7051 Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT); 7052 Round = DAG.getNode(ISD::AND, dl, MVT::i64, 7053 Round, DAG.getConstant(-2048, dl, MVT::i64)); 7054 7055 // However, we cannot use that value unconditionally: if the magnitude 7056 // of the input value is small, the bit-twiddling we did above might 7057 // end up visibly changing the output. Fortunately, in that case, we 7058 // don't need to twiddle bits since the original input will convert 7059 // exactly to double-precision floating-point already. Therefore, 7060 // construct a conditional to use the original value if the top 11 7061 // bits are all sign-bit copies, and use the rounded value computed 7062 // above otherwise. 7063 SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64, 7064 SINT, DAG.getConstant(53, dl, MVT::i32)); 7065 Cond = DAG.getNode(ISD::ADD, dl, MVT::i64, 7066 Cond, DAG.getConstant(1, dl, MVT::i64)); 7067 Cond = DAG.getSetCC(dl, MVT::i32, 7068 Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT); 7069 7070 SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT); 7071 } 7072 7073 ReuseLoadInfo RLI; 7074 SDValue Bits; 7075 7076 MachineFunction &MF = DAG.getMachineFunction(); 7077 if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) { 7078 Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI, 7079 RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges); 7080 spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); 7081 } else if (Subtarget.hasLFIWAX() && 7082 canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) { 7083 MachineMemOperand *MMO = 7084 MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, 7085 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7086 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7087 Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl, 7088 DAG.getVTList(MVT::f64, MVT::Other), 7089 Ops, MVT::i32, MMO); 7090 spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); 7091 } else if (Subtarget.hasFPCVT() && 7092 canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) { 7093 MachineMemOperand *MMO = 7094 MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, 7095 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7096 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7097 Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl, 7098 DAG.getVTList(MVT::f64, MVT::Other), 7099 Ops, MVT::i32, MMO); 7100 spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); 7101 } else if (((Subtarget.hasLFIWAX() && 7102 SINT.getOpcode() == ISD::SIGN_EXTEND) || 7103 (Subtarget.hasFPCVT() && 7104 SINT.getOpcode() == ISD::ZERO_EXTEND)) && 7105 SINT.getOperand(0).getValueType() == MVT::i32) { 7106 MachineFrameInfo &MFI = MF.getFrameInfo(); 7107 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 7108 7109 int FrameIdx = MFI.CreateStackObject(4, 4, false); 7110 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 7111 7112 SDValue Store = 7113 DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, 7114 MachinePointerInfo::getFixedStack( 7115 DAG.getMachineFunction(), FrameIdx)); 7116 7117 assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && 7118 "Expected an i32 store"); 7119 7120 RLI.Ptr = FIdx; 7121 RLI.Chain = Store; 7122 RLI.MPI = 7123 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 7124 RLI.Alignment = 4; 7125 7126 MachineMemOperand *MMO = 7127 MF.getMachineMemOperand(RLI.MPI, 
MachineMemOperand::MOLoad, 4, 7128 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7129 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7130 Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ? 7131 PPCISD::LFIWZX : PPCISD::LFIWAX, 7132 dl, DAG.getVTList(MVT::f64, MVT::Other), 7133 Ops, MVT::i32, MMO); 7134 } else 7135 Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); 7136 7137 SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); 7138 7139 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) 7140 FP = DAG.getNode(ISD::FP_ROUND, dl, 7141 MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); 7142 return FP; 7143 } 7144 7145 assert(Op.getOperand(0).getValueType() == MVT::i32 && 7146 "Unhandled INT_TO_FP type in custom expander!"); 7147 // Since we only generate this in 64-bit mode, we can take advantage of 7148 // 64-bit registers. In particular, sign extend the input value into the 7149 // 64-bit register with extsw, store the WHOLE 64-bit value into the stack 7150 // then lfd it and fcfid it. 7151 MachineFunction &MF = DAG.getMachineFunction(); 7152 MachineFrameInfo &MFI = MF.getFrameInfo(); 7153 EVT PtrVT = getPointerTy(MF.getDataLayout()); 7154 7155 SDValue Ld; 7156 if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { 7157 ReuseLoadInfo RLI; 7158 bool ReusingLoad; 7159 if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI, 7160 DAG))) { 7161 int FrameIdx = MFI.CreateStackObject(4, 4, false); 7162 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 7163 7164 SDValue Store = 7165 DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, 7166 MachinePointerInfo::getFixedStack( 7167 DAG.getMachineFunction(), FrameIdx)); 7168 7169 assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && 7170 "Expected an i32 store"); 7171 7172 RLI.Ptr = FIdx; 7173 RLI.Chain = Store; 7174 RLI.MPI = 7175 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 7176 RLI.Alignment = 4; 7177 } 7178 7179 MachineMemOperand *MMO = 7180 MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, 7181 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7182 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7183 Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ? 7184 PPCISD::LFIWZX : PPCISD::LFIWAX, 7185 dl, DAG.getVTList(MVT::f64, MVT::Other), 7186 Ops, MVT::i32, MMO); 7187 if (ReusingLoad) 7188 spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG); 7189 } else { 7190 assert(Subtarget.isPPC64() && 7191 "i32->FP without LFIWAX supported only on PPC64"); 7192 7193 int FrameIdx = MFI.CreateStackObject(8, 8, false); 7194 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 7195 7196 SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, 7197 Op.getOperand(0)); 7198 7199 // STD the extended value into the stack slot. 7200 SDValue Store = DAG.getStore( 7201 DAG.getEntryNode(), dl, Ext64, FIdx, 7202 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); 7203 7204 // Load the value as a double. 7205 Ld = DAG.getLoad( 7206 MVT::f64, dl, Store, FIdx, 7207 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); 7208 } 7209 7210 // FCFID it and return it. 
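  // On a subtarget without LFIWAX/FPCVT, the sequence built above and below
  // amounts to roughly: extsw; std <slot>; lfd <slot>; fcfid (plus a final
  // frsp when the result type is f32).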
  SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld);
  if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
    FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
                     DAG.getIntPtrConstant(0, dl));
  return FP;
}

SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc dl(Op);
  /*
   The rounding mode is in bits 30:31 of FPSCR, and has the following
   settings:
     00 Round to nearest
     01 Round to 0
     10 Round to +inf
     11 Round to -inf

   FLT_ROUNDS, on the other hand, expects the following:
     -1 Undefined
      0 Round to 0
      1 Round to nearest
      2 Round to +inf
      3 Round to -inf

   To perform the conversion, we do:
     ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
  */

  MachineFunction &MF = DAG.getMachineFunction();
  EVT VT = Op.getValueType();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Save the FP control word to a register.
  EVT NodeTys[] = {
    MVT::f64,    // return register
    MVT::Glue    // unused in this context
  };
  SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None);

  // Save the FP register to a stack slot.
  int SSFI = MF.getFrameInfo().CreateStackObject(8, 8, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot,
                               MachinePointerInfo());

  // Load the FP control word from the low 32 bits of the stack slot.
  SDValue Four = DAG.getConstant(4, dl, PtrVT);
  SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
  SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo());

  // Transform as necessary.
  SDValue CWD1 =
    DAG.getNode(ISD::AND, dl, MVT::i32,
                CWD, DAG.getConstant(3, dl, MVT::i32));
  SDValue CWD2 =
    DAG.getNode(ISD::SRL, dl, MVT::i32,
                DAG.getNode(ISD::AND, dl, MVT::i32,
                            DAG.getNode(ISD::XOR, dl, MVT::i32,
                                        CWD, DAG.getConstant(3, dl, MVT::i32)),
                            DAG.getConstant(3, dl, MVT::i32)),
                DAG.getConstant(1, dl, MVT::i32));

  SDValue RetVal =
    DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
}

SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  unsigned BitWidth = VT.getSizeInBits();
  SDLoc dl(Op);
  assert(Op.getNumOperands() == 3 &&
         VT == Op.getOperand(1).getValueType() &&
         "Unexpected SHL!");

  // Expand into a bunch of logical ops. Note that these ops
  // depend on the PPC behavior for oversized shift amounts.
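  // Sketch of what is built below (BW = BitWidth):
  //   OutLo = Lo << Amt
  //   OutHi = (Hi << Amt) | (Lo >> (BW - Amt)) | (Lo << (Amt - BW))
  // Whenever one of those shift amounts falls outside [0, BW) it wraps into
  // the "oversized" range for which the PPC shift nodes produce 0, so the OR
  // picks up only the in-range pieces and no select is needed.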
  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);
  EVT AmtVT = Amt.getValueType();

  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
                             DAG.getConstant(BitWidth, dl, AmtVT), Amt);
  SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt);
  SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1);
  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
                             DAG.getConstant(-BitWidth, dl, AmtVT));
  SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5);
  SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
  SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt);
  SDValue OutOps[] = { OutLo, OutHi };
  return DAG.getMergeValues(OutOps, dl);
}

SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  unsigned BitWidth = VT.getSizeInBits();
  assert(Op.getNumOperands() == 3 &&
         VT == Op.getOperand(1).getValueType() &&
         "Unexpected SRL!");

  // Expand into a bunch of logical ops.  Note that these ops
  // depend on the PPC behavior for oversized shift amounts.
  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);
  EVT AmtVT = Amt.getValueType();

  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
                             DAG.getConstant(BitWidth, dl, AmtVT), Amt);
  SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
  SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
                             DAG.getConstant(-BitWidth, dl, AmtVT));
  SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5);
  SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
  SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt);
  SDValue OutOps[] = { OutLo, OutHi };
  return DAG.getMergeValues(OutOps, dl);
}

SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  unsigned BitWidth = VT.getSizeInBits();
  assert(Op.getNumOperands() == 3 &&
         VT == Op.getOperand(1).getValueType() &&
         "Unexpected SRA!");

  // Expand into a bunch of logical ops, followed by a select_cc.
  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);
  EVT AmtVT = Amt.getValueType();

  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
                             DAG.getConstant(BitWidth, dl, AmtVT), Amt);
  SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
  SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
                             DAG.getConstant(-BitWidth, dl, AmtVT));
  SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5);
  SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt);
  SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT),
                                  Tmp4, Tmp6, ISD::SETLE);
  SDValue OutOps[] = { OutLo, OutHi };
  return DAG.getMergeValues(OutOps, dl);
}

//===----------------------------------------------------------------------===//
// Vector related lowering.
//

/// BuildSplatI - Build a canonical splati of Val with an element size of
/// SplatSize.  Cast the result to VT.
static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
                           SelectionDAG &DAG, const SDLoc &dl) {
  assert(Val >= -16 && Val <= 15 && "vsplti is out of range!");

  static const MVT VTys[] = { // canonical VT to use for each size.
    MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
  };

  EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];

  // Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
  if (Val == -1)
    SplatSize = 1;

  EVT CanonicalVT = VTys[SplatSize-1];

  // Build a canonical splat for this value.
  return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT));
}

/// BuildIntrinsicOp - Return a unary operator intrinsic node with the
/// specified intrinsic ID.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG,
                                const SDLoc &dl, EVT DestVT = MVT::Other) {
  if (DestVT == MVT::Other) DestVT = Op.getValueType();
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
                     DAG.getConstant(IID, dl, MVT::i32), Op);
}

/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
/// specified intrinsic ID.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
                                SelectionDAG &DAG, const SDLoc &dl,
                                EVT DestVT = MVT::Other) {
  if (DestVT == MVT::Other) DestVT = LHS.getValueType();
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
                     DAG.getConstant(IID, dl, MVT::i32), LHS, RHS);
}

/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
/// specified intrinsic ID.
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
                                SDValue Op2, SelectionDAG &DAG, const SDLoc &dl,
                                EVT DestVT = MVT::Other) {
  if (DestVT == MVT::Other) DestVT = Op0.getValueType();
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
                     DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2);
}

/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
/// amount.  The result has the specified value type.
static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
                           SelectionDAG &DAG, const SDLoc &dl) {
  // Force LHS/RHS to be the right type.
  LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
  RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);

  int Ops[16];
  for (unsigned i = 0; i != 16; ++i)
    Ops[i] = i + Amt;
  SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops);
  return DAG.getNode(ISD::BITCAST, dl, VT, T);
}

/// Do we have an efficient pattern in a .td file for this node?
///
/// \param V - pointer to the BuildVectorSDNode being matched
/// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves?
///
/// There are some patterns where it is beneficial to keep a BUILD_VECTOR
/// node as a BUILD_VECTOR node rather than expanding it. The patterns where
/// the opposite is true (expansion is beneficial) are:
/// - The node builds a vector out of integers that are not 32 or 64-bits
/// - The node builds a vector out of constants
/// - The node is a "load-and-splat"
/// In all other cases, we will choose to keep the BUILD_VECTOR.
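/// \returns true if the BUILD_VECTOR is expected to be matched efficiently
/// and should be kept, false if expanding it is expected to be better (the
/// single caller in LowerBUILD_VECTOR returns Op unchanged only when this
/// predicate is true).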
static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
                                            bool HasDirectMove) {
  EVT VecVT = V->getValueType(0);
  bool RightType = VecVT == MVT::v2f64 || VecVT == MVT::v4f32 ||
    (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32));
  if (!RightType)
    return false;

  bool IsSplat = true;
  bool IsLoad = false;
  SDValue Op0 = V->getOperand(0);

  // This function is called in a block that confirms the node is not a
  // constant splat. So a constant BUILD_VECTOR here means the vector is
  // built out of different constants.
  if (V->isConstant())
    return false;
  for (int i = 0, e = V->getNumOperands(); i < e; ++i) {
    if (V->getOperand(i).isUndef())
      return false;
    // We want to expand nodes that represent load-and-splat even if the
    // loaded value is a floating point truncation or conversion to int.
    if (V->getOperand(i).getOpcode() == ISD::LOAD ||
        (V->getOperand(i).getOpcode() == ISD::FP_ROUND &&
         V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||
        (V->getOperand(i).getOpcode() == ISD::FP_TO_SINT &&
         V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||
        (V->getOperand(i).getOpcode() == ISD::FP_TO_UINT &&
         V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD))
      IsLoad = true;
    // If the operands are different or the input is not a load and has more
    // uses than just this BV node, then it isn't a splat.
    if (V->getOperand(i) != Op0 ||
        (!IsLoad && !V->isOnlyUserOf(V->getOperand(i).getNode())))
      IsSplat = false;
  }
  return !(IsSplat && IsLoad);
}

// If this is a case we can't handle, return null and let the default
// expansion code take care of it.  If we CAN select this case, and if it
// selects to a single instruction, return Op.  Otherwise, if we can codegen
// this case more efficiently than a constant pool load, lower it to the
// sequence of ops that should be used.
SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc dl(Op);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");

  if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) {
    // We first build an i32 vector, load it into a QPX register,
    // then convert it to a floating-point vector and compare it
    // to a zero vector to get the boolean result.
7504 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 7505 int FrameIdx = MFI.CreateStackObject(16, 16, false); 7506 MachinePointerInfo PtrInfo = 7507 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 7508 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 7509 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 7510 7511 assert(BVN->getNumOperands() == 4 && 7512 "BUILD_VECTOR for v4i1 does not have 4 operands"); 7513 7514 bool IsConst = true; 7515 for (unsigned i = 0; i < 4; ++i) { 7516 if (BVN->getOperand(i).isUndef()) continue; 7517 if (!isa<ConstantSDNode>(BVN->getOperand(i))) { 7518 IsConst = false; 7519 break; 7520 } 7521 } 7522 7523 if (IsConst) { 7524 Constant *One = 7525 ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0); 7526 Constant *NegOne = 7527 ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0); 7528 7529 Constant *CV[4]; 7530 for (unsigned i = 0; i < 4; ++i) { 7531 if (BVN->getOperand(i).isUndef()) 7532 CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext())); 7533 else if (isNullConstant(BVN->getOperand(i))) 7534 CV[i] = NegOne; 7535 else 7536 CV[i] = One; 7537 } 7538 7539 Constant *CP = ConstantVector::get(CV); 7540 SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), 7541 16 /* alignment */); 7542 7543 SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; 7544 SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other}); 7545 return DAG.getMemIntrinsicNode( 7546 PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32, 7547 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 7548 } 7549 7550 SmallVector<SDValue, 4> Stores; 7551 for (unsigned i = 0; i < 4; ++i) { 7552 if (BVN->getOperand(i).isUndef()) continue; 7553 7554 unsigned Offset = 4*i; 7555 SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); 7556 Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); 7557 7558 unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize(); 7559 if (StoreSize > 4) { 7560 Stores.push_back( 7561 DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx, 7562 PtrInfo.getWithOffset(Offset), MVT::i32)); 7563 } else { 7564 SDValue StoreValue = BVN->getOperand(i); 7565 if (StoreSize < 4) 7566 StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue); 7567 7568 Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx, 7569 PtrInfo.getWithOffset(Offset))); 7570 } 7571 } 7572 7573 SDValue StoreChain; 7574 if (!Stores.empty()) 7575 StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); 7576 else 7577 StoreChain = DAG.getEntryNode(); 7578 7579 // Now load from v4i32 into the QPX register; this will extend it to 7580 // v4i64 but not yet convert it to a floating point. Nevertheless, this 7581 // is typed as v4f64 because the QPX register integer states are not 7582 // explicitly represented. 7583 7584 SDValue Ops[] = {StoreChain, 7585 DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32), 7586 FIdx}; 7587 SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other}); 7588 7589 SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, 7590 dl, VTs, Ops, MVT::v4i32, PtrInfo); 7591 LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, 7592 DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32), 7593 LoadedVect); 7594 7595 SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64); 7596 7597 return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ); 7598 } 7599 7600 // All other QPX vectors are handled by generic code. 
  if (Subtarget.hasQPX())
    return SDValue();

  // Check if this is a splat of a constant value.
  APInt APSplatBits, APSplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                             HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
      SplatBitSize > 32) {
    // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be
    // lowered to VSX instructions under certain conditions.
    // Without VSX, there is no pattern more efficient than expanding the node.
    if (Subtarget.hasVSX() &&
        haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove()))
      return Op;
    return SDValue();
  }

  unsigned SplatBits = APSplatBits.getZExtValue();
  unsigned SplatUndef = APSplatUndef.getZExtValue();
  unsigned SplatSize = SplatBitSize / 8;

  // First, handle single instruction cases.

  // All zeros?
  if (SplatBits == 0) {
    // Canonicalize all zero vectors to be v4i32.
    if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
      SDValue Z = DAG.getConstant(0, dl, MVT::v4i32);
      Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);
    }
    return Op;
  }

  // We have XXSPLTIB for constant splats one byte wide.
  if (Subtarget.hasP9Vector() && SplatSize == 1) {
    // This is a splat of 1-byte elements with some elements potentially undef.
    // Rather than trying to match undef in the SDAG patterns, ensure that all
    // elements are the same constant.
    if (HasAnyUndefs || ISD::isBuildVectorAllOnes(BVN)) {
      SmallVector<SDValue, 16> Ops(16, DAG.getConstant(SplatBits,
                                                       dl, MVT::i32));
      SDValue NewBV = DAG.getBuildVector(MVT::v16i8, dl, Ops);
      if (Op.getValueType() != MVT::v16i8)
        return DAG.getBitcast(Op.getValueType(), NewBV);
      return NewBV;
    }
    return Op;
  }

  // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
  int32_t SextVal = (int32_t(SplatBits << (32-SplatBitSize)) >>
                     (32-SplatBitSize));
  if (SextVal >= -16 && SextVal <= 15)
    return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);

  // Two instruction sequences.

  // If this value is in the range [-32,30] and is even, use:
  //     VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
  // If this value is in the range [17,31] and is odd, use:
  //     VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
  // If this value is in the range [-31,-17] and is odd, use:
  //     VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
  // Note the last two are three-instruction sequences.
  if (SextVal >= -32 && SextVal <= 31) {
    // To avoid having these optimizations undone by constant folding,
    // we convert to a pseudo that will be expanded later into one of
    // the above forms.
    SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32);
    EVT VT = (SplatSize == 1 ? MVT::v16i8 :
              (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
    SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32);
    SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
    if (VT == Op.getValueType())
      return RetVal;
    else
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal);
  }

  // If this is 0x8000_0000 x 4, turn into vspltisw + vslw.  If it is
  // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000).  This is important
  // for fneg/fabs.
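  // Worked example of the sequence below: vspltisw -1 puts 0xFFFFFFFF in
  // every word; vslw by that same vector shifts each word left by 31 (only
  // the low 5 bits of the per-element shift count are used), giving
  // 0x8000_0000; XOR with the all-ones vector then produces 0x7FFF_FFFF.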
  if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
    // Make -1 and vspltisw -1:
    SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);

    // Make the VSLW intrinsic, computing 0x8000_0000.
    SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
                                   OnesV, DAG, dl);

    // xor by OnesV to invert it.
    Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
  }

  // Check to see if this is a wide variety of vsplti*, binop self cases.
  static const signed char SplatCsts[] = {
    -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
    -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
  };

  for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) {
    // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
    // cases which are ambiguous (e.g. formation of 0x8000_0000).
    int i = SplatCsts[idx];

    // Figure out what shift amount will be used by altivec if shifted by i in
    // this splat size.
    unsigned TypeShiftAmt = i & (SplatBitSize-1);

    // vsplti + shl self.
    if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
        Intrinsic::ppc_altivec_vslw
      };
      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
    }

    // vsplti + srl self.
    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
        Intrinsic::ppc_altivec_vsrw
      };
      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
    }

    // vsplti + sra self.
    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
        Intrinsic::ppc_altivec_vsraw
      };
      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
    }

    // vsplti + rol self.
    if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
                         ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
        Intrinsic::ppc_altivec_vrlw
      };
      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
    }

    // t = vsplti c, result = vsldoi t, t, 1
    if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ?
15 : 1; 7762 return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); 7763 } 7764 // t = vsplti c, result = vsldoi t, t, 2 7765 if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) { 7766 SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); 7767 unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2; 7768 return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); 7769 } 7770 // t = vsplti c, result = vsldoi t, t, 3 7771 if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) { 7772 SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); 7773 unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3; 7774 return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); 7775 } 7776 } 7777 7778 return SDValue(); 7779 } 7780 7781 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 7782 /// the specified operations to build the shuffle. 7783 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 7784 SDValue RHS, SelectionDAG &DAG, 7785 const SDLoc &dl) { 7786 unsigned OpNum = (PFEntry >> 26) & 0x0F; 7787 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 7788 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 7789 7790 enum { 7791 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 7792 OP_VMRGHW, 7793 OP_VMRGLW, 7794 OP_VSPLTISW0, 7795 OP_VSPLTISW1, 7796 OP_VSPLTISW2, 7797 OP_VSPLTISW3, 7798 OP_VSLDOI4, 7799 OP_VSLDOI8, 7800 OP_VSLDOI12 7801 }; 7802 7803 if (OpNum == OP_COPY) { 7804 if (LHSID == (1*9+2)*9+3) return LHS; 7805 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 7806 return RHS; 7807 } 7808 7809 SDValue OpLHS, OpRHS; 7810 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 7811 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 7812 7813 int ShufIdxs[16]; 7814 switch (OpNum) { 7815 default: llvm_unreachable("Unknown i32 permute!"); 7816 case OP_VMRGHW: 7817 ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3; 7818 ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19; 7819 ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7; 7820 ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23; 7821 break; 7822 case OP_VMRGLW: 7823 ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11; 7824 ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27; 7825 ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15; 7826 ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31; 7827 break; 7828 case OP_VSPLTISW0: 7829 for (unsigned i = 0; i != 16; ++i) 7830 ShufIdxs[i] = (i&3)+0; 7831 break; 7832 case OP_VSPLTISW1: 7833 for (unsigned i = 0; i != 16; ++i) 7834 ShufIdxs[i] = (i&3)+4; 7835 break; 7836 case OP_VSPLTISW2: 7837 for (unsigned i = 0; i != 16; ++i) 7838 ShufIdxs[i] = (i&3)+8; 7839 break; 7840 case OP_VSPLTISW3: 7841 for (unsigned i = 0; i != 16; ++i) 7842 ShufIdxs[i] = (i&3)+12; 7843 break; 7844 case OP_VSLDOI4: 7845 return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl); 7846 case OP_VSLDOI8: 7847 return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl); 7848 case OP_VSLDOI12: 7849 return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl); 7850 } 7851 EVT VT = OpLHS.getValueType(); 7852 OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS); 7853 OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS); 7854 SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, 
OpRHS, ShufIdxs); 7855 return DAG.getNode(ISD::BITCAST, dl, VT, T); 7856 } 7857 7858 /// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this 7859 /// is a shuffle we can handle in a single instruction, return it. Otherwise, 7860 /// return the code it can be lowered into. Worst case, it can always be 7861 /// lowered into a vperm. 7862 SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 7863 SelectionDAG &DAG) const { 7864 SDLoc dl(Op); 7865 SDValue V1 = Op.getOperand(0); 7866 SDValue V2 = Op.getOperand(1); 7867 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 7868 EVT VT = Op.getValueType(); 7869 bool isLittleEndian = Subtarget.isLittleEndian(); 7870 7871 unsigned ShiftElts, InsertAtByte; 7872 bool Swap; 7873 if (Subtarget.hasP9Vector() && 7874 PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, 7875 isLittleEndian)) { 7876 if (Swap) 7877 std::swap(V1, V2); 7878 SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 7879 SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2); 7880 if (ShiftElts) { 7881 SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2, 7882 DAG.getConstant(ShiftElts, dl, MVT::i32)); 7883 SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Shl, 7884 DAG.getConstant(InsertAtByte, dl, MVT::i32)); 7885 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); 7886 } 7887 SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Conv2, 7888 DAG.getConstant(InsertAtByte, dl, MVT::i32)); 7889 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); 7890 } 7891 7892 7893 if (Subtarget.hasVSX() && 7894 PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { 7895 if (Swap) 7896 std::swap(V1, V2); 7897 SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 7898 SDValue Conv2 = 7899 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2); 7900 7901 SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2, 7902 DAG.getConstant(ShiftElts, dl, MVT::i32)); 7903 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl); 7904 } 7905 7906 if (Subtarget.hasVSX() && 7907 PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { 7908 if (Swap) 7909 std::swap(V1, V2); 7910 SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); 7911 SDValue Conv2 = 7912 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? 
V1 : V2); 7913 7914 SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2, 7915 DAG.getConstant(ShiftElts, dl, MVT::i32)); 7916 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI); 7917 } 7918 7919 if (Subtarget.hasP9Vector()) { 7920 if (PPC::isXXBRHShuffleMask(SVOp)) { 7921 SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 7922 SDValue ReveHWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v8i16, Conv); 7923 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord); 7924 } else if (PPC::isXXBRWShuffleMask(SVOp)) { 7925 SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 7926 SDValue ReveWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v4i32, Conv); 7927 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord); 7928 } else if (PPC::isXXBRDShuffleMask(SVOp)) { 7929 SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); 7930 SDValue ReveDWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Conv); 7931 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord); 7932 } else if (PPC::isXXBRQShuffleMask(SVOp)) { 7933 SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1); 7934 SDValue ReveQWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v1i128, Conv); 7935 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord); 7936 } 7937 } 7938 7939 if (Subtarget.hasVSX()) { 7940 if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { 7941 int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); 7942 7943 // If the source for the shuffle is a scalar_to_vector that came from a 7944 // 32-bit load, it will have used LXVWSX so we don't need to splat again. 7945 if (Subtarget.hasP9Vector() && 7946 ((isLittleEndian && SplatIdx == 3) || 7947 (!isLittleEndian && SplatIdx == 0))) { 7948 SDValue Src = V1.getOperand(0); 7949 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR && 7950 Src.getOperand(0).getOpcode() == ISD::LOAD && 7951 Src.getOperand(0).hasOneUse()) 7952 return V1; 7953 } 7954 SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 7955 SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv, 7956 DAG.getConstant(SplatIdx, dl, MVT::i32)); 7957 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat); 7958 } 7959 7960 // Left shifts of 8 bytes are actually swaps. Convert accordingly. 7961 if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) { 7962 SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); 7963 SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv); 7964 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap); 7965 } 7966 } 7967 7968 if (Subtarget.hasQPX()) { 7969 if (VT.getVectorNumElements() != 4) 7970 return SDValue(); 7971 7972 if (V2.isUndef()) V2 = V1; 7973 7974 int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp); 7975 if (AlignIdx != -1) { 7976 return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2, 7977 DAG.getConstant(AlignIdx, dl, MVT::i32)); 7978 } else if (SVOp->isSplat()) { 7979 int SplatIdx = SVOp->getSplatIndex(); 7980 if (SplatIdx >= 4) { 7981 std::swap(V1, V2); 7982 SplatIdx -= 4; 7983 } 7984 7985 return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1, 7986 DAG.getConstant(SplatIdx, dl, MVT::i32)); 7987 } 7988 7989 // Lower this into a qvgpci/qvfperm pair. 7990 7991 // Compute the qvgpci literal 7992 unsigned idx = 0; 7993 for (unsigned i = 0; i < 4; ++i) { 7994 int m = SVOp->getMaskElt(i); 7995 unsigned mm = m >= 0 ? 
(unsigned) m : i; 7996 idx |= mm << (3-i)*3; 7997 } 7998 7999 SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64, 8000 DAG.getConstant(idx, dl, MVT::i32)); 8001 return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3); 8002 } 8003 8004 // Cases that are handled by instructions that take permute immediates 8005 // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be 8006 // selected by the instruction selector. 8007 if (V2.isUndef()) { 8008 if (PPC::isSplatShuffleMask(SVOp, 1) || 8009 PPC::isSplatShuffleMask(SVOp, 2) || 8010 PPC::isSplatShuffleMask(SVOp, 4) || 8011 PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) || 8012 PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) || 8013 PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 || 8014 PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) || 8015 PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) || 8016 PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) || 8017 PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) || 8018 PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) || 8019 PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) || 8020 (Subtarget.hasP8Altivec() && ( 8021 PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) || 8022 PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) || 8023 PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) { 8024 return Op; 8025 } 8026 } 8027 8028 // Altivec has a variety of "shuffle immediates" that take two vector inputs 8029 // and produce a fixed permutation. If any of these match, do not lower to 8030 // VPERM. 8031 unsigned int ShuffleKind = isLittleEndian ? 2 : 0; 8032 if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) || 8033 PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) || 8034 PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 || 8035 PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) || 8036 PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) || 8037 PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) || 8038 PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) || 8039 PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) || 8040 PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) || 8041 (Subtarget.hasP8Altivec() && ( 8042 PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) || 8043 PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) || 8044 PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG)))) 8045 return Op; 8046 8047 // Check to see if this is a shuffle of 4-byte values. If so, we can use our 8048 // perfect shuffle table to emit an optimal matching sequence. 8049 ArrayRef<int> PermMask = SVOp->getMask(); 8050 8051 unsigned PFIndexes[4]; 8052 bool isFourElementShuffle = true; 8053 for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number 8054 unsigned EltNo = 8; // Start out undef. 8055 for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. 8056 if (PermMask[i*4+j] < 0) 8057 continue; // Undef, ignore it. 8058 8059 unsigned ByteSource = PermMask[i*4+j]; 8060 if ((ByteSource & 3) != j) { 8061 isFourElementShuffle = false; 8062 break; 8063 } 8064 8065 if (EltNo == 8) { 8066 EltNo = ByteSource/4; 8067 } else if (EltNo != ByteSource/4) { 8068 isFourElementShuffle = false; 8069 break; 8070 } 8071 } 8072 PFIndexes[i] = EltNo; 8073 } 8074 8075 // If this shuffle can be expressed as a shuffle of 4-byte elements, use the 8076 // perfect shuffle vector to determine if it is cost effective to do this as 8077 // discrete instructions, or whether we should use a vperm. 8078 // For now, we skip this for little endian until such time as we have a 8079 // little-endian perfect shuffle table. 
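  // For reference: each PerfectShuffleTable entry is indexed by the four
  // 4-byte element selectors packed base-9 (0-7 select an element from the
  // concatenation of the two inputs, 8 means undef) and encodes the cost in
  // bits 31:30, the operation kind in bits 29:26, and two 13-bit operand IDs
  // that GeneratePerfectShuffle expands recursively (see the decoding at the
  // top of that function).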
8080 if (isFourElementShuffle && !isLittleEndian) { 8081 // Compute the index in the perfect shuffle table. 8082 unsigned PFTableIndex = 8083 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 8084 8085 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 8086 unsigned Cost = (PFEntry >> 30); 8087 8088 // Determining when to avoid vperm is tricky. Many things affect the cost 8089 // of vperm, particularly how many times the perm mask needs to be computed. 8090 // For example, if the perm mask can be hoisted out of a loop or is already 8091 // used (perhaps because there are multiple permutes with the same shuffle 8092 // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of 8093 // the loop requires an extra register. 8094 // 8095 // As a compromise, we only emit discrete instructions if the shuffle can be 8096 // generated in 3 or fewer operations. When we have loop information 8097 // available, if this block is within a loop, we should avoid using vperm 8098 // for 3-operation perms and use a constant pool load instead. 8099 if (Cost < 3) 8100 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8101 } 8102 8103 // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant 8104 // vector that will get spilled to the constant pool. 8105 if (V2.isUndef()) V2 = V1; 8106 8107 // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except 8108 // that it is in input element units, not in bytes. Convert now. 8109 8110 // For little endian, the order of the input vectors is reversed, and 8111 // the permutation mask is complemented with respect to 31. This is 8112 // necessary to produce proper semantics with the big-endian-biased vperm 8113 // instruction. 8114 EVT EltVT = V1.getValueType().getVectorElementType(); 8115 unsigned BytesPerElement = EltVT.getSizeInBits()/8; 8116 8117 SmallVector<SDValue, 16> ResultMask; 8118 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { 8119 unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i]; 8120 8121 for (unsigned j = 0; j != BytesPerElement; ++j) 8122 if (isLittleEndian) 8123 ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j), 8124 dl, MVT::i32)); 8125 else 8126 ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl, 8127 MVT::i32)); 8128 } 8129 8130 SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask); 8131 if (isLittleEndian) 8132 return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), 8133 V2, V1, VPermMask); 8134 else 8135 return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), 8136 V1, V2, VPermMask); 8137 } 8138 8139 /// getVectorCompareInfo - Given an intrinsic, return false if it is not a 8140 /// vector comparison. If it is, return true and fill in Opc/isDot with 8141 /// information about the intrinsic. 8142 static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, 8143 bool &isDot, const PPCSubtarget &Subtarget) { 8144 unsigned IntrinsicID = 8145 cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue(); 8146 CompareOpc = -1; 8147 isDot = false; 8148 switch (IntrinsicID) { 8149 default: 8150 return false; 8151 // Comparison predicates. 
8152 case Intrinsic::ppc_altivec_vcmpbfp_p: 8153 CompareOpc = 966; 8154 isDot = true; 8155 break; 8156 case Intrinsic::ppc_altivec_vcmpeqfp_p: 8157 CompareOpc = 198; 8158 isDot = true; 8159 break; 8160 case Intrinsic::ppc_altivec_vcmpequb_p: 8161 CompareOpc = 6; 8162 isDot = true; 8163 break; 8164 case Intrinsic::ppc_altivec_vcmpequh_p: 8165 CompareOpc = 70; 8166 isDot = true; 8167 break; 8168 case Intrinsic::ppc_altivec_vcmpequw_p: 8169 CompareOpc = 134; 8170 isDot = true; 8171 break; 8172 case Intrinsic::ppc_altivec_vcmpequd_p: 8173 if (Subtarget.hasP8Altivec()) { 8174 CompareOpc = 199; 8175 isDot = true; 8176 } else 8177 return false; 8178 break; 8179 case Intrinsic::ppc_altivec_vcmpneb_p: 8180 case Intrinsic::ppc_altivec_vcmpneh_p: 8181 case Intrinsic::ppc_altivec_vcmpnew_p: 8182 case Intrinsic::ppc_altivec_vcmpnezb_p: 8183 case Intrinsic::ppc_altivec_vcmpnezh_p: 8184 case Intrinsic::ppc_altivec_vcmpnezw_p: 8185 if (Subtarget.hasP9Altivec()) { 8186 switch (IntrinsicID) { 8187 default: 8188 llvm_unreachable("Unknown comparison intrinsic."); 8189 case Intrinsic::ppc_altivec_vcmpneb_p: 8190 CompareOpc = 7; 8191 break; 8192 case Intrinsic::ppc_altivec_vcmpneh_p: 8193 CompareOpc = 71; 8194 break; 8195 case Intrinsic::ppc_altivec_vcmpnew_p: 8196 CompareOpc = 135; 8197 break; 8198 case Intrinsic::ppc_altivec_vcmpnezb_p: 8199 CompareOpc = 263; 8200 break; 8201 case Intrinsic::ppc_altivec_vcmpnezh_p: 8202 CompareOpc = 327; 8203 break; 8204 case Intrinsic::ppc_altivec_vcmpnezw_p: 8205 CompareOpc = 391; 8206 break; 8207 } 8208 isDot = true; 8209 } else 8210 return false; 8211 break; 8212 case Intrinsic::ppc_altivec_vcmpgefp_p: 8213 CompareOpc = 454; 8214 isDot = true; 8215 break; 8216 case Intrinsic::ppc_altivec_vcmpgtfp_p: 8217 CompareOpc = 710; 8218 isDot = true; 8219 break; 8220 case Intrinsic::ppc_altivec_vcmpgtsb_p: 8221 CompareOpc = 774; 8222 isDot = true; 8223 break; 8224 case Intrinsic::ppc_altivec_vcmpgtsh_p: 8225 CompareOpc = 838; 8226 isDot = true; 8227 break; 8228 case Intrinsic::ppc_altivec_vcmpgtsw_p: 8229 CompareOpc = 902; 8230 isDot = true; 8231 break; 8232 case Intrinsic::ppc_altivec_vcmpgtsd_p: 8233 if (Subtarget.hasP8Altivec()) { 8234 CompareOpc = 967; 8235 isDot = true; 8236 } else 8237 return false; 8238 break; 8239 case Intrinsic::ppc_altivec_vcmpgtub_p: 8240 CompareOpc = 518; 8241 isDot = true; 8242 break; 8243 case Intrinsic::ppc_altivec_vcmpgtuh_p: 8244 CompareOpc = 582; 8245 isDot = true; 8246 break; 8247 case Intrinsic::ppc_altivec_vcmpgtuw_p: 8248 CompareOpc = 646; 8249 isDot = true; 8250 break; 8251 case Intrinsic::ppc_altivec_vcmpgtud_p: 8252 if (Subtarget.hasP8Altivec()) { 8253 CompareOpc = 711; 8254 isDot = true; 8255 } else 8256 return false; 8257 break; 8258 8259 // VSX predicate comparisons use the same infrastructure 8260 case Intrinsic::ppc_vsx_xvcmpeqdp_p: 8261 case Intrinsic::ppc_vsx_xvcmpgedp_p: 8262 case Intrinsic::ppc_vsx_xvcmpgtdp_p: 8263 case Intrinsic::ppc_vsx_xvcmpeqsp_p: 8264 case Intrinsic::ppc_vsx_xvcmpgesp_p: 8265 case Intrinsic::ppc_vsx_xvcmpgtsp_p: 8266 if (Subtarget.hasVSX()) { 8267 switch (IntrinsicID) { 8268 case Intrinsic::ppc_vsx_xvcmpeqdp_p: 8269 CompareOpc = 99; 8270 break; 8271 case Intrinsic::ppc_vsx_xvcmpgedp_p: 8272 CompareOpc = 115; 8273 break; 8274 case Intrinsic::ppc_vsx_xvcmpgtdp_p: 8275 CompareOpc = 107; 8276 break; 8277 case Intrinsic::ppc_vsx_xvcmpeqsp_p: 8278 CompareOpc = 67; 8279 break; 8280 case Intrinsic::ppc_vsx_xvcmpgesp_p: 8281 CompareOpc = 83; 8282 break; 8283 case Intrinsic::ppc_vsx_xvcmpgtsp_p: 8284 CompareOpc = 75; 
8285 break; 8286 } 8287 isDot = true; 8288 } else 8289 return false; 8290 break; 8291 8292 // Normal Comparisons. 8293 case Intrinsic::ppc_altivec_vcmpbfp: 8294 CompareOpc = 966; 8295 break; 8296 case Intrinsic::ppc_altivec_vcmpeqfp: 8297 CompareOpc = 198; 8298 break; 8299 case Intrinsic::ppc_altivec_vcmpequb: 8300 CompareOpc = 6; 8301 break; 8302 case Intrinsic::ppc_altivec_vcmpequh: 8303 CompareOpc = 70; 8304 break; 8305 case Intrinsic::ppc_altivec_vcmpequw: 8306 CompareOpc = 134; 8307 break; 8308 case Intrinsic::ppc_altivec_vcmpequd: 8309 if (Subtarget.hasP8Altivec()) 8310 CompareOpc = 199; 8311 else 8312 return false; 8313 break; 8314 case Intrinsic::ppc_altivec_vcmpneb: 8315 case Intrinsic::ppc_altivec_vcmpneh: 8316 case Intrinsic::ppc_altivec_vcmpnew: 8317 case Intrinsic::ppc_altivec_vcmpnezb: 8318 case Intrinsic::ppc_altivec_vcmpnezh: 8319 case Intrinsic::ppc_altivec_vcmpnezw: 8320 if (Subtarget.hasP9Altivec()) 8321 switch (IntrinsicID) { 8322 default: 8323 llvm_unreachable("Unknown comparison intrinsic."); 8324 case Intrinsic::ppc_altivec_vcmpneb: 8325 CompareOpc = 7; 8326 break; 8327 case Intrinsic::ppc_altivec_vcmpneh: 8328 CompareOpc = 71; 8329 break; 8330 case Intrinsic::ppc_altivec_vcmpnew: 8331 CompareOpc = 135; 8332 break; 8333 case Intrinsic::ppc_altivec_vcmpnezb: 8334 CompareOpc = 263; 8335 break; 8336 case Intrinsic::ppc_altivec_vcmpnezh: 8337 CompareOpc = 327; 8338 break; 8339 case Intrinsic::ppc_altivec_vcmpnezw: 8340 CompareOpc = 391; 8341 break; 8342 } 8343 else 8344 return false; 8345 break; 8346 case Intrinsic::ppc_altivec_vcmpgefp: 8347 CompareOpc = 454; 8348 break; 8349 case Intrinsic::ppc_altivec_vcmpgtfp: 8350 CompareOpc = 710; 8351 break; 8352 case Intrinsic::ppc_altivec_vcmpgtsb: 8353 CompareOpc = 774; 8354 break; 8355 case Intrinsic::ppc_altivec_vcmpgtsh: 8356 CompareOpc = 838; 8357 break; 8358 case Intrinsic::ppc_altivec_vcmpgtsw: 8359 CompareOpc = 902; 8360 break; 8361 case Intrinsic::ppc_altivec_vcmpgtsd: 8362 if (Subtarget.hasP8Altivec()) 8363 CompareOpc = 967; 8364 else 8365 return false; 8366 break; 8367 case Intrinsic::ppc_altivec_vcmpgtub: 8368 CompareOpc = 518; 8369 break; 8370 case Intrinsic::ppc_altivec_vcmpgtuh: 8371 CompareOpc = 582; 8372 break; 8373 case Intrinsic::ppc_altivec_vcmpgtuw: 8374 CompareOpc = 646; 8375 break; 8376 case Intrinsic::ppc_altivec_vcmpgtud: 8377 if (Subtarget.hasP8Altivec()) 8378 CompareOpc = 711; 8379 else 8380 return false; 8381 break; 8382 } 8383 return true; 8384 } 8385 8386 /// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom 8387 /// lower, do it, otherwise return null. 8388 SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 8389 SelectionDAG &DAG) const { 8390 unsigned IntrinsicID = 8391 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8392 8393 if (IntrinsicID == Intrinsic::thread_pointer) { 8394 // Reads the thread pointer register, used for __builtin_thread_pointer. 8395 if (Subtarget.isPPC64()) 8396 return DAG.getRegister(PPC::X13, MVT::i64); 8397 return DAG.getRegister(PPC::R2, MVT::i32); 8398 } 8399 8400 // If this is a lowered altivec predicate compare, CompareOpc is set to the 8401 // opcode number of the comparison. 8402 SDLoc dl(Op); 8403 int CompareOpc; 8404 bool isDot; 8405 if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget)) 8406 return SDValue(); // Don't custom lower most intrinsics. 8407 8408 // If this is a non-dot comparison, make the VCMP node and we are done. 
8409 if (!isDot) { 8410 SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(), 8411 Op.getOperand(1), Op.getOperand(2), 8412 DAG.getConstant(CompareOpc, dl, MVT::i32)); 8413 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp); 8414 } 8415 8416 // Create the PPCISD altivec 'dot' comparison node. 8417 SDValue Ops[] = { 8418 Op.getOperand(2), // LHS 8419 Op.getOperand(3), // RHS 8420 DAG.getConstant(CompareOpc, dl, MVT::i32) 8421 }; 8422 EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue }; 8423 SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); 8424 8425 // Now that we have the comparison, emit a copy from the CR to a GPR. 8426 // This is flagged to the above dot comparison. 8427 SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32, 8428 DAG.getRegister(PPC::CR6, MVT::i32), 8429 CompNode.getValue(1)); 8430 8431 // Unpack the result based on how the target uses it. 8432 unsigned BitNo; // Bit # of CR6. 8433 bool InvertBit; // Invert result? 8434 switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) { 8435 default: // Can't happen, don't crash on invalid number though. 8436 case 0: // Return the value of the EQ bit of CR6. 8437 BitNo = 0; InvertBit = false; 8438 break; 8439 case 1: // Return the inverted value of the EQ bit of CR6. 8440 BitNo = 0; InvertBit = true; 8441 break; 8442 case 2: // Return the value of the LT bit of CR6. 8443 BitNo = 2; InvertBit = false; 8444 break; 8445 case 3: // Return the inverted value of the LT bit of CR6. 8446 BitNo = 2; InvertBit = true; 8447 break; 8448 } 8449 8450 // Shift the bit into the low position. 8451 Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags, 8452 DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32)); 8453 // Isolate the bit. 8454 Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags, 8455 DAG.getConstant(1, dl, MVT::i32)); 8456 8457 // If we are supposed to, toggle the bit. 8458 if (InvertBit) 8459 Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags, 8460 DAG.getConstant(1, dl, MVT::i32)); 8461 return Flags; 8462 } 8463 8464 SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, 8465 SelectionDAG &DAG) const { 8466 // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to 8467 // the beginning of the argument list. 8468 int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1; 8469 SDLoc DL(Op); 8470 switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) { 8471 case Intrinsic::ppc_cfence: { 8472 assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument."); 8473 assert(Subtarget.isPPC64() && "Only 64-bit is supported for now."); 8474 return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other, 8475 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, 8476 Op.getOperand(ArgStart + 1)), 8477 Op.getOperand(0)), 8478 0); 8479 } 8480 default: 8481 break; 8482 } 8483 return SDValue(); 8484 } 8485 8486 SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const { 8487 // Check for a DIV with the same operands as this REM. 
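  // If one exists, return SDValue() so the REM is expanded into a divide,
  // multiply and subtract, presumably allowing the existing divide to be
  // reused rather than emitting both a divide and a modulo instruction.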
  for (auto UI : Op.getOperand(1)->uses()) {
    if ((Op.getOpcode() == ISD::SREM && UI->getOpcode() == ISD::SDIV) ||
        (Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV))
      if (UI->getOperand(0) == Op.getOperand(0) &&
          UI->getOperand(1) == Op.getOperand(1))
        return SDValue();
  }
  return Op;
}

SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // For v2i64 (VSX), we can pattern match the v2i32 case (using fp <-> int
  // instructions), but for smaller types, we need to first extend up to v2i32
  // before going any farther.
  if (Op.getValueType() == MVT::v2i64) {
    EVT ExtVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
    if (ExtVT != MVT::v2i32) {
      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0));
      Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Op,
                       DAG.getValueType(EVT::getVectorVT(*DAG.getContext(),
                                        ExtVT.getVectorElementType(), 4)));
      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Op);
      Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v2i64, Op,
                       DAG.getValueType(MVT::v2i32));
    }

    return Op;
  }

  return SDValue();
}

SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // Create a stack slot that is 16-byte aligned.
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  int FrameIdx = MFI.CreateStackObject(16, 16, false);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

  // Store the input value into Value#0 of the stack slot.
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
                               MachinePointerInfo());
  // Load it out.
  return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
}

SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
         "Should only be called for ISD::INSERT_VECTOR_ELT");
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));
  // We have legal lowering for constant indices but not for variable ones.
  if (C)
    return Op;
  return SDValue();
}

SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDNode *N = Op.getNode();

  assert(N->getOperand(0).getValueType() == MVT::v4i1 &&
         "Unknown extract_vector_elt type");

  SDValue Value = N->getOperand(0);

  // The first part of this is like the store lowering except that we don't
  // need to track the chain.

  // The values are now known to be -1 (false) or 1 (true). To convert this
  // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
  // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
  Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);

  // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
  // understand how to form the extending load.
  SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);

  Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);

  // Now convert to an integer and store.
8574 Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, 8575 DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), 8576 Value); 8577 8578 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 8579 int FrameIdx = MFI.CreateStackObject(16, 16, false); 8580 MachinePointerInfo PtrInfo = 8581 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 8582 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 8583 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 8584 8585 SDValue StoreChain = DAG.getEntryNode(); 8586 SDValue Ops[] = {StoreChain, 8587 DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), 8588 Value, FIdx}; 8589 SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); 8590 8591 StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, 8592 dl, VTs, Ops, MVT::v4i32, PtrInfo); 8593 8594 // Extract the value requested. 8595 unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 8596 SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); 8597 Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); 8598 8599 SDValue IntVal = 8600 DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset)); 8601 8602 if (!Subtarget.useCRBits()) 8603 return IntVal; 8604 8605 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal); 8606 } 8607 8608 /// Lowering for QPX v4i1 loads 8609 SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, 8610 SelectionDAG &DAG) const { 8611 SDLoc dl(Op); 8612 LoadSDNode *LN = cast<LoadSDNode>(Op.getNode()); 8613 SDValue LoadChain = LN->getChain(); 8614 SDValue BasePtr = LN->getBasePtr(); 8615 8616 if (Op.getValueType() == MVT::v4f64 || 8617 Op.getValueType() == MVT::v4f32) { 8618 EVT MemVT = LN->getMemoryVT(); 8619 unsigned Alignment = LN->getAlignment(); 8620 8621 // If this load is properly aligned, then it is legal. 
8622 if (Alignment >= MemVT.getStoreSize()) 8623 return Op; 8624 8625 EVT ScalarVT = Op.getValueType().getScalarType(), 8626 ScalarMemVT = MemVT.getScalarType(); 8627 unsigned Stride = ScalarMemVT.getStoreSize(); 8628 8629 SDValue Vals[4], LoadChains[4]; 8630 for (unsigned Idx = 0; Idx < 4; ++Idx) { 8631 SDValue Load; 8632 if (ScalarVT != ScalarMemVT) 8633 Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain, 8634 BasePtr, 8635 LN->getPointerInfo().getWithOffset(Idx * Stride), 8636 ScalarMemVT, MinAlign(Alignment, Idx * Stride), 8637 LN->getMemOperand()->getFlags(), LN->getAAInfo()); 8638 else 8639 Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr, 8640 LN->getPointerInfo().getWithOffset(Idx * Stride), 8641 MinAlign(Alignment, Idx * Stride), 8642 LN->getMemOperand()->getFlags(), LN->getAAInfo()); 8643 8644 if (Idx == 0 && LN->isIndexed()) { 8645 assert(LN->getAddressingMode() == ISD::PRE_INC && 8646 "Unknown addressing mode on vector load"); 8647 Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(), 8648 LN->getAddressingMode()); 8649 } 8650 8651 Vals[Idx] = Load; 8652 LoadChains[Idx] = Load.getValue(1); 8653 8654 BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, 8655 DAG.getConstant(Stride, dl, 8656 BasePtr.getValueType())); 8657 } 8658 8659 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); 8660 SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals); 8661 8662 if (LN->isIndexed()) { 8663 SDValue RetOps[] = { Value, Vals[0].getValue(1), TF }; 8664 return DAG.getMergeValues(RetOps, dl); 8665 } 8666 8667 SDValue RetOps[] = { Value, TF }; 8668 return DAG.getMergeValues(RetOps, dl); 8669 } 8670 8671 assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower"); 8672 assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported"); 8673 8674 // To lower v4i1 from a byte array, we load the byte elements of the 8675 // vector and then reuse the BUILD_VECTOR logic. 8676 8677 SDValue VectElmts[4], VectElmtChains[4]; 8678 for (unsigned i = 0; i < 4; ++i) { 8679 SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); 8680 Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); 8681 8682 VectElmts[i] = DAG.getExtLoad( 8683 ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx, 8684 LN->getPointerInfo().getWithOffset(i), MVT::i8, 8685 /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo()); 8686 VectElmtChains[i] = VectElmts[i].getValue(1); 8687 } 8688 8689 LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains); 8690 SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts); 8691 8692 SDValue RVals[] = { Value, LoadChain }; 8693 return DAG.getMergeValues(RVals, dl); 8694 } 8695 8696 /// Lowering for QPX v4i1 stores 8697 SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, 8698 SelectionDAG &DAG) const { 8699 SDLoc dl(Op); 8700 StoreSDNode *SN = cast<StoreSDNode>(Op.getNode()); 8701 SDValue StoreChain = SN->getChain(); 8702 SDValue BasePtr = SN->getBasePtr(); 8703 SDValue Value = SN->getValue(); 8704 8705 if (Value.getValueType() == MVT::v4f64 || 8706 Value.getValueType() == MVT::v4f32) { 8707 EVT MemVT = SN->getMemoryVT(); 8708 unsigned Alignment = SN->getAlignment(); 8709 8710 // If this store is properly aligned, then it is legal. 
8711 if (Alignment >= MemVT.getStoreSize()) 8712 return Op; 8713 8714 EVT ScalarVT = Value.getValueType().getScalarType(), 8715 ScalarMemVT = MemVT.getScalarType(); 8716 unsigned Stride = ScalarMemVT.getStoreSize(); 8717 8718 SDValue Stores[4]; 8719 for (unsigned Idx = 0; Idx < 4; ++Idx) { 8720 SDValue Ex = DAG.getNode( 8721 ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, 8722 DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout()))); 8723 SDValue Store; 8724 if (ScalarVT != ScalarMemVT) 8725 Store = 8726 DAG.getTruncStore(StoreChain, dl, Ex, BasePtr, 8727 SN->getPointerInfo().getWithOffset(Idx * Stride), 8728 ScalarMemVT, MinAlign(Alignment, Idx * Stride), 8729 SN->getMemOperand()->getFlags(), SN->getAAInfo()); 8730 else 8731 Store = DAG.getStore(StoreChain, dl, Ex, BasePtr, 8732 SN->getPointerInfo().getWithOffset(Idx * Stride), 8733 MinAlign(Alignment, Idx * Stride), 8734 SN->getMemOperand()->getFlags(), SN->getAAInfo()); 8735 8736 if (Idx == 0 && SN->isIndexed()) { 8737 assert(SN->getAddressingMode() == ISD::PRE_INC && 8738 "Unknown addressing mode on vector store"); 8739 Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(), 8740 SN->getAddressingMode()); 8741 } 8742 8743 BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, 8744 DAG.getConstant(Stride, dl, 8745 BasePtr.getValueType())); 8746 Stores[Idx] = Store; 8747 } 8748 8749 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); 8750 8751 if (SN->isIndexed()) { 8752 SDValue RetOps[] = { TF, Stores[0].getValue(1) }; 8753 return DAG.getMergeValues(RetOps, dl); 8754 } 8755 8756 return TF; 8757 } 8758 8759 assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported"); 8760 assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower"); 8761 8762 // The values are now known to be -1 (false) or 1 (true). To convert this 8763 // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). 8764 // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 8765 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); 8766 8767 // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to 8768 // understand how to form the extending load. 8769 SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); 8770 8771 Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); 8772 8773 // Now convert to an integer and store. 8774 Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, 8775 DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), 8776 Value); 8777 8778 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 8779 int FrameIdx = MFI.CreateStackObject(16, 16, false); 8780 MachinePointerInfo PtrInfo = 8781 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 8782 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 8783 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 8784 8785 SDValue Ops[] = {StoreChain, 8786 DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), 8787 Value, FIdx}; 8788 SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); 8789 8790 StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, 8791 dl, VTs, Ops, MVT::v4i32, PtrInfo); 8792 8793 // Move data into the byte array. 
8794 SDValue Loads[4], LoadChains[4]; 8795 for (unsigned i = 0; i < 4; ++i) { 8796 unsigned Offset = 4*i; 8797 SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); 8798 Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); 8799 8800 Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx, 8801 PtrInfo.getWithOffset(Offset)); 8802 LoadChains[i] = Loads[i].getValue(1); 8803 } 8804 8805 StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); 8806 8807 SDValue Stores[4]; 8808 for (unsigned i = 0; i < 4; ++i) { 8809 SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); 8810 Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); 8811 8812 Stores[i] = DAG.getTruncStore( 8813 StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i), 8814 MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(), 8815 SN->getAAInfo()); 8816 } 8817 8818 StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); 8819 8820 return StoreChain; 8821 } 8822 8823 SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { 8824 SDLoc dl(Op); 8825 if (Op.getValueType() == MVT::v4i32) { 8826 SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); 8827 8828 SDValue Zero = BuildSplatI( 0, 1, MVT::v4i32, DAG, dl); 8829 SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt. 8830 8831 SDValue RHSSwap = // = vrlw RHS, 16 8832 BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl); 8833 8834 // Shrinkify inputs to v8i16. 8835 LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS); 8836 RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS); 8837 RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap); 8838 8839 // Low parts multiplied together, generating 32-bit results (we ignore the 8840 // top parts). 8841 SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh, 8842 LHS, RHS, DAG, dl, MVT::v4i32); 8843 8844 SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm, 8845 LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32); 8846 // Shift the high parts up 16 bits. 8847 HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd, 8848 Neg16, DAG, dl); 8849 return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd); 8850 } else if (Op.getValueType() == MVT::v8i16) { 8851 SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); 8852 8853 SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl); 8854 8855 return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm, 8856 LHS, RHS, Zero, DAG, dl); 8857 } else if (Op.getValueType() == MVT::v16i8) { 8858 SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); 8859 bool isLittleEndian = Subtarget.isLittleEndian(); 8860 8861 // Multiply the even 8-bit parts, producing 16-bit sums. 8862 SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub, 8863 LHS, RHS, DAG, dl, MVT::v8i16); 8864 EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts); 8865 8866 // Multiply the odd 8-bit parts, producing 16-bit sums. 8867 SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub, 8868 LHS, RHS, DAG, dl, MVT::v8i16); 8869 OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts); 8870 8871 // Merge the results together. Because vmuleub and vmuloub are 8872 // instructions with a big-endian bias, we must reverse the 8873 // element numbering and reverse the meaning of "odd" and "even" 8874 // when generating little endian code. 
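    // For example, for the first pair of result bytes (i == 0) the big-endian
    // mask selects elements 1 and 17 (the low-order bytes of the first even
    // and odd 16-bit products), while the little-endian mask selects elements
    // 0 and 16 from the swapped (OddParts, EvenParts) operand order.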
8875 int Ops[16]; 8876 for (unsigned i = 0; i != 8; ++i) { 8877 if (isLittleEndian) { 8878 Ops[i*2 ] = 2*i; 8879 Ops[i*2+1] = 2*i+16; 8880 } else { 8881 Ops[i*2 ] = 2*i+1; 8882 Ops[i*2+1] = 2*i+1+16; 8883 } 8884 } 8885 if (isLittleEndian) 8886 return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops); 8887 else 8888 return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops); 8889 } else { 8890 llvm_unreachable("Unknown mul to lower!"); 8891 } 8892 } 8893 8894 /// LowerOperation - Provide custom lowering hooks for some operations. 8895 /// 8896 SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 8897 switch (Op.getOpcode()) { 8898 default: llvm_unreachable("Wasn't expecting to be able to lower this!"); 8899 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 8900 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 8901 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 8902 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 8903 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 8904 case ISD::SETCC: return LowerSETCC(Op, DAG); 8905 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); 8906 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); 8907 case ISD::VASTART: 8908 return LowerVASTART(Op, DAG); 8909 8910 case ISD::VAARG: 8911 return LowerVAARG(Op, DAG); 8912 8913 case ISD::VACOPY: 8914 return LowerVACOPY(Op, DAG); 8915 8916 case ISD::STACKRESTORE: 8917 return LowerSTACKRESTORE(Op, DAG); 8918 8919 case ISD::DYNAMIC_STACKALLOC: 8920 return LowerDYNAMIC_STACKALLOC(Op, DAG); 8921 8922 case ISD::GET_DYNAMIC_AREA_OFFSET: 8923 return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG); 8924 8925 case ISD::EH_DWARF_CFA: 8926 return LowerEH_DWARF_CFA(Op, DAG); 8927 8928 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); 8929 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); 8930 8931 case ISD::LOAD: return LowerLOAD(Op, DAG); 8932 case ISD::STORE: return LowerSTORE(Op, DAG); 8933 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); 8934 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 8935 case ISD::FP_TO_UINT: 8936 case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, 8937 SDLoc(Op)); 8938 case ISD::UINT_TO_FP: 8939 case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 8940 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 8941 8942 // Lower 64-bit shifts. 8943 case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG); 8944 case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG); 8945 case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG); 8946 8947 // Vector-related lowering. 8948 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 8949 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 8950 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 8951 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 8952 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); 8953 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 8954 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 8955 case ISD::MUL: return LowerMUL(Op, DAG); 8956 8957 // For counter-based loop handling. 8958 case ISD::INTRINSIC_W_CHAIN: return SDValue(); 8959 8960 // Frame & Return address. 
8961 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 8962 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 8963 8964 case ISD::INTRINSIC_VOID: 8965 return LowerINTRINSIC_VOID(Op, DAG); 8966 case ISD::SREM: 8967 case ISD::UREM: 8968 return LowerREM(Op, DAG); 8969 } 8970 } 8971 8972 void PPCTargetLowering::ReplaceNodeResults(SDNode *N, 8973 SmallVectorImpl<SDValue>&Results, 8974 SelectionDAG &DAG) const { 8975 SDLoc dl(N); 8976 switch (N->getOpcode()) { 8977 default: 8978 llvm_unreachable("Do not know how to custom type legalize this operation!"); 8979 case ISD::READCYCLECOUNTER: { 8980 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 8981 SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0)); 8982 8983 Results.push_back(RTB); 8984 Results.push_back(RTB.getValue(1)); 8985 Results.push_back(RTB.getValue(2)); 8986 break; 8987 } 8988 case ISD::INTRINSIC_W_CHAIN: { 8989 if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 8990 Intrinsic::ppc_is_decremented_ctr_nonzero) 8991 break; 8992 8993 assert(N->getValueType(0) == MVT::i1 && 8994 "Unexpected result type for CTR decrement intrinsic"); 8995 EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), 8996 N->getValueType(0)); 8997 SDVTList VTs = DAG.getVTList(SVT, MVT::Other); 8998 SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0), 8999 N->getOperand(1)); 9000 9001 Results.push_back(NewInt); 9002 Results.push_back(NewInt.getValue(1)); 9003 break; 9004 } 9005 case ISD::VAARG: { 9006 if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64()) 9007 return; 9008 9009 EVT VT = N->getValueType(0); 9010 9011 if (VT == MVT::i64) { 9012 SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG); 9013 9014 Results.push_back(NewNode); 9015 Results.push_back(NewNode.getValue(1)); 9016 } 9017 return; 9018 } 9019 case ISD::FP_ROUND_INREG: { 9020 assert(N->getValueType(0) == MVT::ppcf128); 9021 assert(N->getOperand(0).getValueType() == MVT::ppcf128); 9022 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, 9023 MVT::f64, N->getOperand(0), 9024 DAG.getIntPtrConstant(0, dl)); 9025 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, 9026 MVT::f64, N->getOperand(0), 9027 DAG.getIntPtrConstant(1, dl)); 9028 9029 // Add the two halves of the long double in round-to-zero mode. 9030 SDValue FPreg = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi); 9031 9032 // We know the low half is about to be thrown away, so just use something 9033 // convenient. 9034 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128, 9035 FPreg, FPreg)); 9036 return; 9037 } 9038 case ISD::FP_TO_SINT: 9039 case ISD::FP_TO_UINT: 9040 // LowerFP_TO_INT() can only handle f32 and f64. 
9041 if (N->getOperand(0).getValueType() == MVT::ppcf128) 9042 return; 9043 Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl)); 9044 return; 9045 } 9046 } 9047 9048 //===----------------------------------------------------------------------===// 9049 // Other Lowering Code 9050 //===----------------------------------------------------------------------===// 9051 9052 static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) { 9053 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 9054 Function *Func = Intrinsic::getDeclaration(M, Id); 9055 return Builder.CreateCall(Func, {}); 9056 } 9057 9058 // The mappings for emitLeading/TrailingFence is taken from 9059 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html 9060 Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, 9061 Instruction *Inst, 9062 AtomicOrdering Ord) const { 9063 if (Ord == AtomicOrdering::SequentiallyConsistent) 9064 return callIntrinsic(Builder, Intrinsic::ppc_sync); 9065 if (isReleaseOrStronger(Ord)) 9066 return callIntrinsic(Builder, Intrinsic::ppc_lwsync); 9067 return nullptr; 9068 } 9069 9070 Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, 9071 Instruction *Inst, 9072 AtomicOrdering Ord) const { 9073 if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) { 9074 // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and 9075 // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html 9076 // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. 9077 if (isa<LoadInst>(Inst) && Subtarget.isPPC64()) 9078 return Builder.CreateCall( 9079 Intrinsic::getDeclaration( 9080 Builder.GetInsertBlock()->getParent()->getParent(), 9081 Intrinsic::ppc_cfence, {Inst->getType()}), 9082 {Inst}); 9083 // FIXME: Can use isync for rmw operation. 9084 return callIntrinsic(Builder, Intrinsic::ppc_lwsync); 9085 } 9086 return nullptr; 9087 } 9088 9089 MachineBasicBlock * 9090 PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, 9091 unsigned AtomicSize, 9092 unsigned BinOpcode, 9093 unsigned CmpOpcode, 9094 unsigned CmpPred) const { 9095 // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. 9096 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 9097 9098 auto LoadMnemonic = PPC::LDARX; 9099 auto StoreMnemonic = PPC::STDCX; 9100 switch (AtomicSize) { 9101 default: 9102 llvm_unreachable("Unexpected size of atomic entity"); 9103 case 1: 9104 LoadMnemonic = PPC::LBARX; 9105 StoreMnemonic = PPC::STBCX; 9106 assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4"); 9107 break; 9108 case 2: 9109 LoadMnemonic = PPC::LHARX; 9110 StoreMnemonic = PPC::STHCX; 9111 assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4"); 9112 break; 9113 case 4: 9114 LoadMnemonic = PPC::LWARX; 9115 StoreMnemonic = PPC::STWCX; 9116 break; 9117 case 8: 9118 LoadMnemonic = PPC::LDARX; 9119 StoreMnemonic = PPC::STDCX; 9120 break; 9121 } 9122 9123 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9124 MachineFunction *F = BB->getParent(); 9125 MachineFunction::iterator It = ++BB->getIterator(); 9126 9127 unsigned dest = MI.getOperand(0).getReg(); 9128 unsigned ptrA = MI.getOperand(1).getReg(); 9129 unsigned ptrB = MI.getOperand(2).getReg(); 9130 unsigned incr = MI.getOperand(3).getReg(); 9131 DebugLoc dl = MI.getDebugLoc(); 9132 9133 MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); 9134 MachineBasicBlock *loop2MBB = 9135 CmpOpcode ? 
F->CreateMachineBasicBlock(LLVM_BB) : nullptr; 9136 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); 9137 F->insert(It, loopMBB); 9138 if (CmpOpcode) 9139 F->insert(It, loop2MBB); 9140 F->insert(It, exitMBB); 9141 exitMBB->splice(exitMBB->begin(), BB, 9142 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9143 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 9144 9145 MachineRegisterInfo &RegInfo = F->getRegInfo(); 9146 unsigned TmpReg = (!BinOpcode) ? incr : 9147 RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass 9148 : &PPC::GPRCRegClass); 9149 9150 // thisMBB: 9151 // ... 9152 // fallthrough --> loopMBB 9153 BB->addSuccessor(loopMBB); 9154 9155 // loopMBB: 9156 // l[wd]arx dest, ptr 9157 // add r0, dest, incr 9158 // st[wd]cx. r0, ptr 9159 // bne- loopMBB 9160 // fallthrough --> exitMBB 9161 9162 // For max/min... 9163 // loopMBB: 9164 // l[wd]arx dest, ptr 9165 // cmpl?[wd] incr, dest 9166 // bgt exitMBB 9167 // loop2MBB: 9168 // st[wd]cx. dest, ptr 9169 // bne- loopMBB 9170 // fallthrough --> exitMBB 9171 9172 BB = loopMBB; 9173 BuildMI(BB, dl, TII->get(LoadMnemonic), dest) 9174 .addReg(ptrA).addReg(ptrB); 9175 if (BinOpcode) 9176 BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest); 9177 if (CmpOpcode) { 9178 // Signed comparisons of byte or halfword values must be sign-extended. 9179 if (CmpOpcode == PPC::CMPW && AtomicSize < 4) { 9180 unsigned ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); 9181 BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH), 9182 ExtReg).addReg(dest); 9183 BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) 9184 .addReg(incr).addReg(ExtReg); 9185 } else 9186 BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) 9187 .addReg(incr).addReg(dest); 9188 9189 BuildMI(BB, dl, TII->get(PPC::BCC)) 9190 .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB); 9191 BB->addSuccessor(loop2MBB); 9192 BB->addSuccessor(exitMBB); 9193 BB = loop2MBB; 9194 } 9195 BuildMI(BB, dl, TII->get(StoreMnemonic)) 9196 .addReg(TmpReg).addReg(ptrA).addReg(ptrB); 9197 BuildMI(BB, dl, TII->get(PPC::BCC)) 9198 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); 9199 BB->addSuccessor(loopMBB); 9200 BB->addSuccessor(exitMBB); 9201 9202 // exitMBB: 9203 // ... 9204 BB = exitMBB; 9205 return BB; 9206 } 9207 9208 MachineBasicBlock * 9209 PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr &MI, 9210 MachineBasicBlock *BB, 9211 bool is8bit, // operation 9212 unsigned BinOpcode, 9213 unsigned CmpOpcode, 9214 unsigned CmpPred) const { 9215 // If we support part-word atomic mnemonics, just use them 9216 if (Subtarget.hasPartwordAtomics()) 9217 return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, 9218 CmpOpcode, CmpPred); 9219 9220 // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. 9221 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 9222 // In 64 bit mode we have to use 64 bits for addresses, even though the 9223 // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address 9224 // registers without caring whether they're 32 or 64, but here we're 9225 // doing actual arithmetic on the addresses. 9226 bool is64bit = Subtarget.isPPC64(); 9227 bool isLittleEndian = Subtarget.isLittleEndian(); 9228 unsigned ZeroReg = is64bit ? 
PPC::ZERO8 : PPC::ZERO; 9229 9230 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9231 MachineFunction *F = BB->getParent(); 9232 MachineFunction::iterator It = ++BB->getIterator(); 9233 9234 unsigned dest = MI.getOperand(0).getReg(); 9235 unsigned ptrA = MI.getOperand(1).getReg(); 9236 unsigned ptrB = MI.getOperand(2).getReg(); 9237 unsigned incr = MI.getOperand(3).getReg(); 9238 DebugLoc dl = MI.getDebugLoc(); 9239 9240 MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); 9241 MachineBasicBlock *loop2MBB = 9242 CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr; 9243 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); 9244 F->insert(It, loopMBB); 9245 if (CmpOpcode) 9246 F->insert(It, loop2MBB); 9247 F->insert(It, exitMBB); 9248 exitMBB->splice(exitMBB->begin(), BB, 9249 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9250 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 9251 9252 MachineRegisterInfo &RegInfo = F->getRegInfo(); 9253 const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass 9254 : &PPC::GPRCRegClass; 9255 unsigned PtrReg = RegInfo.createVirtualRegister(RC); 9256 unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); 9257 unsigned ShiftReg = 9258 isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC); 9259 unsigned Incr2Reg = RegInfo.createVirtualRegister(RC); 9260 unsigned MaskReg = RegInfo.createVirtualRegister(RC); 9261 unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); 9262 unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); 9263 unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); 9264 unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC); 9265 unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); 9266 unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); 9267 unsigned Ptr1Reg; 9268 unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC); 9269 9270 // thisMBB: 9271 // ... 9272 // fallthrough --> loopMBB 9273 BB->addSuccessor(loopMBB); 9274 9275 // The 4-byte load must be aligned, while a char or short may be 9276 // anywhere in the word. Hence all this nasty bookkeeping code. 9277 // add ptr1, ptrA, ptrB [copy if ptrA==0] 9278 // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] 9279 // xori shift, shift1, 24 [16] 9280 // rlwinm ptr, ptr1, 0, 0, 29 9281 // slw incr2, incr, shift 9282 // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] 9283 // slw mask, mask2, shift 9284 // loopMBB: 9285 // lwarx tmpDest, ptr 9286 // add tmp, tmpDest, incr2 9287 // andc tmp2, tmpDest, mask 9288 // and tmp3, tmp, mask 9289 // or tmp4, tmp3, tmp2 9290 // stwcx. tmp4, ptr 9291 // bne- loopMBB 9292 // fallthrough --> exitMBB 9293 // srw dest, tmpDest, shift 9294 if (ptrA != ZeroReg) { 9295 Ptr1Reg = RegInfo.createVirtualRegister(RC); 9296 BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) 9297 .addReg(ptrA).addReg(ptrB); 9298 } else { 9299 Ptr1Reg = ptrB; 9300 } 9301 BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) 9302 .addImm(3).addImm(27).addImm(is8bit ? 28 : 27); 9303 if (!isLittleEndian) 9304 BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) 9305 .addReg(Shift1Reg).addImm(is8bit ? 
24 : 16); 9306 if (is64bit) 9307 BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) 9308 .addReg(Ptr1Reg).addImm(0).addImm(61); 9309 else 9310 BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) 9311 .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); 9312 BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg) 9313 .addReg(incr).addReg(ShiftReg); 9314 if (is8bit) 9315 BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); 9316 else { 9317 BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); 9318 BuildMI(BB, dl, TII->get(PPC::ORI),Mask2Reg).addReg(Mask3Reg).addImm(65535); 9319 } 9320 BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) 9321 .addReg(Mask2Reg).addReg(ShiftReg); 9322 9323 BB = loopMBB; 9324 BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) 9325 .addReg(ZeroReg).addReg(PtrReg); 9326 if (BinOpcode) 9327 BuildMI(BB, dl, TII->get(BinOpcode), TmpReg) 9328 .addReg(Incr2Reg).addReg(TmpDestReg); 9329 BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg) 9330 .addReg(TmpDestReg).addReg(MaskReg); 9331 BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg) 9332 .addReg(TmpReg).addReg(MaskReg); 9333 if (CmpOpcode) { 9334 // For unsigned comparisons, we can directly compare the shifted values. 9335 // For signed comparisons we shift and sign extend. 9336 unsigned SReg = RegInfo.createVirtualRegister(RC); 9337 BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), SReg) 9338 .addReg(TmpDestReg).addReg(MaskReg); 9339 unsigned ValueReg = SReg; 9340 unsigned CmpReg = Incr2Reg; 9341 if (CmpOpcode == PPC::CMPW) { 9342 ValueReg = RegInfo.createVirtualRegister(RC); 9343 BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg) 9344 .addReg(SReg).addReg(ShiftReg); 9345 unsigned ValueSReg = RegInfo.createVirtualRegister(RC); 9346 BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg) 9347 .addReg(ValueReg); 9348 ValueReg = ValueSReg; 9349 CmpReg = incr; 9350 } 9351 BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) 9352 .addReg(CmpReg).addReg(ValueReg); 9353 BuildMI(BB, dl, TII->get(PPC::BCC)) 9354 .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB); 9355 BB->addSuccessor(loop2MBB); 9356 BB->addSuccessor(exitMBB); 9357 BB = loop2MBB; 9358 } 9359 BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg) 9360 .addReg(Tmp3Reg).addReg(Tmp2Reg); 9361 BuildMI(BB, dl, TII->get(PPC::STWCX)) 9362 .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg); 9363 BuildMI(BB, dl, TII->get(PPC::BCC)) 9364 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); 9365 BB->addSuccessor(loopMBB); 9366 BB->addSuccessor(exitMBB); 9367 9368 // exitMBB: 9369 // ... 
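  // The srw inserted at the head of exitMBB below recovers the original
  // (pre-operation) byte or halfword from the word loaded by lwarx; that is
  // the value the atomic RMW pseudo produces.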
9370 BB = exitMBB; 9371 BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg) 9372 .addReg(ShiftReg); 9373 return BB; 9374 } 9375 9376 llvm::MachineBasicBlock * 9377 PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, 9378 MachineBasicBlock *MBB) const { 9379 DebugLoc DL = MI.getDebugLoc(); 9380 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 9381 const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); 9382 9383 MachineFunction *MF = MBB->getParent(); 9384 MachineRegisterInfo &MRI = MF->getRegInfo(); 9385 9386 const BasicBlock *BB = MBB->getBasicBlock(); 9387 MachineFunction::iterator I = ++MBB->getIterator(); 9388 9389 // Memory Reference 9390 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); 9391 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); 9392 9393 unsigned DstReg = MI.getOperand(0).getReg(); 9394 const TargetRegisterClass *RC = MRI.getRegClass(DstReg); 9395 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); 9396 unsigned mainDstReg = MRI.createVirtualRegister(RC); 9397 unsigned restoreDstReg = MRI.createVirtualRegister(RC); 9398 9399 MVT PVT = getPointerTy(MF->getDataLayout()); 9400 assert((PVT == MVT::i64 || PVT == MVT::i32) && 9401 "Invalid Pointer Size!"); 9402 // For v = setjmp(buf), we generate 9403 // 9404 // thisMBB: 9405 // SjLjSetup mainMBB 9406 // bl mainMBB 9407 // v_restore = 1 9408 // b sinkMBB 9409 // 9410 // mainMBB: 9411 // buf[LabelOffset] = LR 9412 // v_main = 0 9413 // 9414 // sinkMBB: 9415 // v = phi(main, restore) 9416 // 9417 9418 MachineBasicBlock *thisMBB = MBB; 9419 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 9420 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 9421 MF->insert(I, mainMBB); 9422 MF->insert(I, sinkMBB); 9423 9424 MachineInstrBuilder MIB; 9425 9426 // Transfer the remainder of BB and its successor edges to sinkMBB. 9427 sinkMBB->splice(sinkMBB->begin(), MBB, 9428 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 9429 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 9430 9431 // Note that the structure of the jmp_buf used here is not compatible 9432 // with that used by libc, and is not designed to be. Specifically, it 9433 // stores only those 'reserved' registers that LLVM does not otherwise 9434 // understand how to spill. Also, by convention, by the time this 9435 // intrinsic is called, Clang has already stored the frame address in the 9436 // first slot of the buffer and stack address in the third. Following the 9437 // X86 target code, we'll store the jump address in the second slot. We also 9438 // need to save the TOC pointer (R2) to handle jumps between shared 9439 // libraries, and that will be stored in the fourth slot. The thread 9440 // identifier (R13) is not affected. 9441 9442 // thisMBB: 9443 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 9444 const int64_t TOCOffset = 3 * PVT.getStoreSize(); 9445 const int64_t BPOffset = 4 * PVT.getStoreSize(); 9446 9447 // Prepare IP either in reg. 9448 const TargetRegisterClass *PtrRC = getRegClassFor(PVT); 9449 unsigned LabelReg = MRI.createVirtualRegister(PtrRC); 9450 unsigned BufReg = MI.getOperand(1).getReg(); 9451 9452 if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) { 9453 setUsesTOCBasePtr(*MBB->getParent()); 9454 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) 9455 .addReg(PPC::X2) 9456 .addImm(TOCOffset) 9457 .addReg(BufReg); 9458 MIB.setMemRefs(MMOBegin, MMOEnd); 9459 } 9460 9461 // Naked functions never have a base pointer, and so we use r1. 
For all 9462 // other functions, this decision must be delayed until during PEI. 9463 unsigned BaseReg; 9464 if (MF->getFunction()->hasFnAttribute(Attribute::Naked)) 9465 BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1; 9466 else 9467 BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP; 9468 9469 MIB = BuildMI(*thisMBB, MI, DL, 9470 TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW)) 9471 .addReg(BaseReg) 9472 .addImm(BPOffset) 9473 .addReg(BufReg); 9474 MIB.setMemRefs(MMOBegin, MMOEnd); 9475 9476 // Setup 9477 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); 9478 MIB.addRegMask(TRI->getNoPreservedMask()); 9479 9480 BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); 9481 9482 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup)) 9483 .addMBB(mainMBB); 9484 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB); 9485 9486 thisMBB->addSuccessor(mainMBB, BranchProbability::getZero()); 9487 thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne()); 9488 9489 // mainMBB: 9490 // mainDstReg = 0 9491 MIB = 9492 BuildMI(mainMBB, DL, 9493 TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); 9494 9495 // Store IP 9496 if (Subtarget.isPPC64()) { 9497 MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD)) 9498 .addReg(LabelReg) 9499 .addImm(LabelOffset) 9500 .addReg(BufReg); 9501 } else { 9502 MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW)) 9503 .addReg(LabelReg) 9504 .addImm(LabelOffset) 9505 .addReg(BufReg); 9506 } 9507 9508 MIB.setMemRefs(MMOBegin, MMOEnd); 9509 9510 BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0); 9511 mainMBB->addSuccessor(sinkMBB); 9512 9513 // sinkMBB: 9514 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 9515 TII->get(PPC::PHI), DstReg) 9516 .addReg(mainDstReg).addMBB(mainMBB) 9517 .addReg(restoreDstReg).addMBB(thisMBB); 9518 9519 MI.eraseFromParent(); 9520 return sinkMBB; 9521 } 9522 9523 MachineBasicBlock * 9524 PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, 9525 MachineBasicBlock *MBB) const { 9526 DebugLoc DL = MI.getDebugLoc(); 9527 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 9528 9529 MachineFunction *MF = MBB->getParent(); 9530 MachineRegisterInfo &MRI = MF->getRegInfo(); 9531 9532 // Memory Reference 9533 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); 9534 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); 9535 9536 MVT PVT = getPointerTy(MF->getDataLayout()); 9537 assert((PVT == MVT::i64 || PVT == MVT::i32) && 9538 "Invalid Pointer Size!"); 9539 9540 const TargetRegisterClass *RC = 9541 (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; 9542 unsigned Tmp = MRI.createVirtualRegister(RC); 9543 // Since FP is only updated here but NOT referenced, it's treated as GPR. 9544 unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; 9545 unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1; 9546 unsigned BP = 9547 (PVT == MVT::i64) 9548 ? PPC::X30 9549 : (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29 9550 : PPC::R30); 9551 9552 MachineInstrBuilder MIB; 9553 9554 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 9555 const int64_t SPOffset = 2 * PVT.getStoreSize(); 9556 const int64_t TOCOffset = 3 * PVT.getStoreSize(); 9557 const int64_t BPOffset = 4 * PVT.getStoreSize(); 9558 9559 unsigned BufReg = MI.getOperand(0).getReg(); 9560 9561 // Reload FP (the jumped-to function may not have had a 9562 // frame pointer, and if so, then its r31 will be restored 9563 // as necessary). 
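  // The jmp_buf slot layout matches what emitEHSjLjSetJmp and the clang
  // builtin wrote: slot 0 holds the frame address, slot 1 the jump address,
  // slot 2 the stack address, slot 3 the TOC pointer (r2), and slot 4 the
  // base pointer.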
9564 if (PVT == MVT::i64) { 9565 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP) 9566 .addImm(0) 9567 .addReg(BufReg); 9568 } else { 9569 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP) 9570 .addImm(0) 9571 .addReg(BufReg); 9572 } 9573 MIB.setMemRefs(MMOBegin, MMOEnd); 9574 9575 // Reload IP 9576 if (PVT == MVT::i64) { 9577 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp) 9578 .addImm(LabelOffset) 9579 .addReg(BufReg); 9580 } else { 9581 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp) 9582 .addImm(LabelOffset) 9583 .addReg(BufReg); 9584 } 9585 MIB.setMemRefs(MMOBegin, MMOEnd); 9586 9587 // Reload SP 9588 if (PVT == MVT::i64) { 9589 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP) 9590 .addImm(SPOffset) 9591 .addReg(BufReg); 9592 } else { 9593 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP) 9594 .addImm(SPOffset) 9595 .addReg(BufReg); 9596 } 9597 MIB.setMemRefs(MMOBegin, MMOEnd); 9598 9599 // Reload BP 9600 if (PVT == MVT::i64) { 9601 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP) 9602 .addImm(BPOffset) 9603 .addReg(BufReg); 9604 } else { 9605 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP) 9606 .addImm(BPOffset) 9607 .addReg(BufReg); 9608 } 9609 MIB.setMemRefs(MMOBegin, MMOEnd); 9610 9611 // Reload TOC 9612 if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) { 9613 setUsesTOCBasePtr(*MBB->getParent()); 9614 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2) 9615 .addImm(TOCOffset) 9616 .addReg(BufReg); 9617 9618 MIB.setMemRefs(MMOBegin, MMOEnd); 9619 } 9620 9621 // Jump 9622 BuildMI(*MBB, MI, DL, 9623 TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp); 9624 BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR)); 9625 9626 MI.eraseFromParent(); 9627 return MBB; 9628 } 9629 9630 MachineBasicBlock * 9631 PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 9632 MachineBasicBlock *BB) const { 9633 if (MI.getOpcode() == TargetOpcode::STACKMAP || 9634 MI.getOpcode() == TargetOpcode::PATCHPOINT) { 9635 if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() && 9636 MI.getOpcode() == TargetOpcode::PATCHPOINT) { 9637 // Call lowering should have added an r2 operand to indicate a dependence 9638 // on the TOC base pointer value. It can't however, because there is no 9639 // way to mark the dependence as implicit there, and so the stackmap code 9640 // will confuse it with a regular operand. Instead, add the dependence 9641 // here. 9642 setUsesTOCBasePtr(*BB->getParent()); 9643 MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true)); 9644 } 9645 9646 return emitPatchPoint(MI, BB); 9647 } 9648 9649 if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 || 9650 MI.getOpcode() == PPC::EH_SjLj_SetJmp64) { 9651 return emitEHSjLjSetJmp(MI, BB); 9652 } else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 || 9653 MI.getOpcode() == PPC::EH_SjLj_LongJmp64) { 9654 return emitEHSjLjLongJmp(MI, BB); 9655 } 9656 9657 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 9658 9659 // To "insert" these instructions we actually have to insert their 9660 // control-flow patterns. 
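  // Most of the pseudos handled below expand into a small CFG of new blocks:
  // a diamond for the select pseudos and a loop for ReadTB and the atomic
  // operations. The remaining ones (e.g. FADDrtz and the ANDIo bit pseudos)
  // expand in place without creating new blocks.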
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  MachineFunction *F = BB->getParent();

  if (Subtarget.hasISEL() &&
      (MI.getOpcode() == PPC::SELECT_CC_I4 ||
       MI.getOpcode() == PPC::SELECT_CC_I8 ||
       MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8)) {
    SmallVector<MachineOperand, 2> Cond;
    if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
        MI.getOpcode() == PPC::SELECT_CC_I8)
      Cond.push_back(MI.getOperand(4));
    else
      Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
    Cond.push_back(MI.getOperand(1));

    DebugLoc dl = MI.getDebugLoc();
    TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond,
                      MI.getOperand(2).getReg(), MI.getOperand(3).getReg());
  } else if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
             MI.getOpcode() == PPC::SELECT_CC_I8 ||
             MI.getOpcode() == PPC::SELECT_CC_F4 ||
             MI.getOpcode() == PPC::SELECT_CC_F8 ||
             MI.getOpcode() == PPC::SELECT_CC_QFRC ||
             MI.getOpcode() == PPC::SELECT_CC_QSRC ||
             MI.getOpcode() == PPC::SELECT_CC_QBRC ||
             MI.getOpcode() == PPC::SELECT_CC_VRRC ||
             MI.getOpcode() == PPC::SELECT_CC_VSFRC ||
             MI.getOpcode() == PPC::SELECT_CC_VSSRC ||
             MI.getOpcode() == PPC::SELECT_CC_VSRC ||
             MI.getOpcode() == PPC::SELECT_I4 ||
             MI.getOpcode() == PPC::SELECT_I8 ||
             MI.getOpcode() == PPC::SELECT_F4 ||
             MI.getOpcode() == PPC::SELECT_F8 ||
             MI.getOpcode() == PPC::SELECT_QFRC ||
             MI.getOpcode() == PPC::SELECT_QSRC ||
             MI.getOpcode() == PPC::SELECT_QBRC ||
             MI.getOpcode() == PPC::SELECT_VRRC ||
             MI.getOpcode() == PPC::SELECT_VSFRC ||
             MI.getOpcode() == PPC::SELECT_VSSRC ||
             MI.getOpcode() == PPC::SELECT_VSRC) {
    // The incoming instruction knows the destination vreg to set, the
    // condition code register to branch on, the true/false values to
    // select between, and a branch opcode to use.

    // thisMBB:
    // ...
    //  TrueVal = ...
    //  cmpTY ccX, r1, r2
    //  bCC copy1MBB
    //  fallthrough --> copy0MBB
    MachineBasicBlock *thisMBB = BB;
    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
    DebugLoc dl = MI.getDebugLoc();
    F->insert(It, copy0MBB);
    F->insert(It, sinkMBB);

    // Transfer the remainder of BB and its successor edges to sinkMBB.
    sinkMBB->splice(sinkMBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

    // Next, add the true and fallthrough blocks as its successors.
9725 BB->addSuccessor(copy0MBB); 9726 BB->addSuccessor(sinkMBB); 9727 9728 if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 || 9729 MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || 9730 MI.getOpcode() == PPC::SELECT_QFRC || 9731 MI.getOpcode() == PPC::SELECT_QSRC || 9732 MI.getOpcode() == PPC::SELECT_QBRC || 9733 MI.getOpcode() == PPC::SELECT_VRRC || 9734 MI.getOpcode() == PPC::SELECT_VSFRC || 9735 MI.getOpcode() == PPC::SELECT_VSSRC || 9736 MI.getOpcode() == PPC::SELECT_VSRC) { 9737 BuildMI(BB, dl, TII->get(PPC::BC)) 9738 .addReg(MI.getOperand(1).getReg()) 9739 .addMBB(sinkMBB); 9740 } else { 9741 unsigned SelectPred = MI.getOperand(4).getImm(); 9742 BuildMI(BB, dl, TII->get(PPC::BCC)) 9743 .addImm(SelectPred) 9744 .addReg(MI.getOperand(1).getReg()) 9745 .addMBB(sinkMBB); 9746 } 9747 9748 // copy0MBB: 9749 // %FalseValue = ... 9750 // # fallthrough to sinkMBB 9751 BB = copy0MBB; 9752 9753 // Update machine-CFG edges 9754 BB->addSuccessor(sinkMBB); 9755 9756 // sinkMBB: 9757 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 9758 // ... 9759 BB = sinkMBB; 9760 BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg()) 9761 .addReg(MI.getOperand(3).getReg()) 9762 .addMBB(copy0MBB) 9763 .addReg(MI.getOperand(2).getReg()) 9764 .addMBB(thisMBB); 9765 } else if (MI.getOpcode() == PPC::ReadTB) { 9766 // To read the 64-bit time-base register on a 32-bit target, we read the 9767 // two halves. Should the counter have wrapped while it was being read, we 9768 // need to try again. 9769 // ... 9770 // readLoop: 9771 // mfspr Rx,TBU # load from TBU 9772 // mfspr Ry,TB # load from TB 9773 // mfspr Rz,TBU # load from TBU 9774 // cmpw crX,Rx,Rz # check if 'old'='new' 9775 // bne readLoop # branch if they're not equal 9776 // ... 9777 9778 MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB); 9779 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 9780 DebugLoc dl = MI.getDebugLoc(); 9781 F->insert(It, readMBB); 9782 F->insert(It, sinkMBB); 9783 9784 // Transfer the remainder of BB and its successor edges to sinkMBB. 
9785 sinkMBB->splice(sinkMBB->begin(), BB, 9786 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9787 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 9788 9789 BB->addSuccessor(readMBB); 9790 BB = readMBB; 9791 9792 MachineRegisterInfo &RegInfo = F->getRegInfo(); 9793 unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); 9794 unsigned LoReg = MI.getOperand(0).getReg(); 9795 unsigned HiReg = MI.getOperand(1).getReg(); 9796 9797 BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269); 9798 BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268); 9799 BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269); 9800 9801 unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); 9802 9803 BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg) 9804 .addReg(HiReg).addReg(ReadAgainReg); 9805 BuildMI(BB, dl, TII->get(PPC::BCC)) 9806 .addImm(PPC::PRED_NE).addReg(CmpReg).addMBB(readMBB); 9807 9808 BB->addSuccessor(readMBB); 9809 BB->addSuccessor(sinkMBB); 9810 } else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8) 9811 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4); 9812 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16) 9813 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4); 9814 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32) 9815 BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4); 9816 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64) 9817 BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8); 9818 9819 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8) 9820 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND); 9821 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16) 9822 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND); 9823 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32) 9824 BB = EmitAtomicBinary(MI, BB, 4, PPC::AND); 9825 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64) 9826 BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8); 9827 9828 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8) 9829 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR); 9830 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16) 9831 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR); 9832 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32) 9833 BB = EmitAtomicBinary(MI, BB, 4, PPC::OR); 9834 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64) 9835 BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8); 9836 9837 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8) 9838 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR); 9839 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16) 9840 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR); 9841 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32) 9842 BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR); 9843 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64) 9844 BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8); 9845 9846 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8) 9847 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND); 9848 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16) 9849 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND); 9850 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32) 9851 BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND); 9852 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64) 9853 BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8); 9854 9855 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8) 9856 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF); 9857 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16) 9858 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF); 9859 else if (MI.getOpcode() == 
PPC::ATOMIC_LOAD_SUB_I32) 9860 BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF); 9861 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64) 9862 BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8); 9863 9864 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8) 9865 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GE); 9866 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16) 9867 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GE); 9868 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32) 9869 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GE); 9870 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64) 9871 BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GE); 9872 9873 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8) 9874 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LE); 9875 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16) 9876 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LE); 9877 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32) 9878 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LE); 9879 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64) 9880 BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LE); 9881 9882 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8) 9883 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GE); 9884 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16) 9885 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GE); 9886 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32) 9887 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GE); 9888 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64) 9889 BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GE); 9890 9891 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8) 9892 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LE); 9893 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16) 9894 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LE); 9895 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32) 9896 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LE); 9897 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64) 9898 BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LE); 9899 9900 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8) 9901 BB = EmitPartwordAtomicBinary(MI, BB, true, 0); 9902 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16) 9903 BB = EmitPartwordAtomicBinary(MI, BB, false, 0); 9904 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32) 9905 BB = EmitAtomicBinary(MI, BB, 4, 0); 9906 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64) 9907 BB = EmitAtomicBinary(MI, BB, 8, 0); 9908 else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 || 9909 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 || 9910 (Subtarget.hasPartwordAtomics() && 9911 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) || 9912 (Subtarget.hasPartwordAtomics() && 9913 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) { 9914 bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64; 9915 9916 auto LoadMnemonic = PPC::LDARX; 9917 auto StoreMnemonic = PPC::STDCX; 9918 switch (MI.getOpcode()) { 9919 default: 9920 llvm_unreachable("Compare and swap of unknown size"); 9921 case PPC::ATOMIC_CMP_SWAP_I8: 9922 LoadMnemonic = PPC::LBARX; 9923 StoreMnemonic = PPC::STBCX; 9924 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); 9925 break; 9926 case PPC::ATOMIC_CMP_SWAP_I16: 9927 LoadMnemonic = PPC::LHARX; 9928 StoreMnemonic = PPC::STHCX; 
9929 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); 9930 break; 9931 case PPC::ATOMIC_CMP_SWAP_I32: 9932 LoadMnemonic = PPC::LWARX; 9933 StoreMnemonic = PPC::STWCX; 9934 break; 9935 case PPC::ATOMIC_CMP_SWAP_I64: 9936 LoadMnemonic = PPC::LDARX; 9937 StoreMnemonic = PPC::STDCX; 9938 break; 9939 } 9940 unsigned dest = MI.getOperand(0).getReg(); 9941 unsigned ptrA = MI.getOperand(1).getReg(); 9942 unsigned ptrB = MI.getOperand(2).getReg(); 9943 unsigned oldval = MI.getOperand(3).getReg(); 9944 unsigned newval = MI.getOperand(4).getReg(); 9945 DebugLoc dl = MI.getDebugLoc(); 9946 9947 MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); 9948 MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); 9949 MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); 9950 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); 9951 F->insert(It, loop1MBB); 9952 F->insert(It, loop2MBB); 9953 F->insert(It, midMBB); 9954 F->insert(It, exitMBB); 9955 exitMBB->splice(exitMBB->begin(), BB, 9956 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9957 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 9958 9959 // thisMBB: 9960 // ... 9961 // fallthrough --> loopMBB 9962 BB->addSuccessor(loop1MBB); 9963 9964 // loop1MBB: 9965 // l[bhwd]arx dest, ptr 9966 // cmp[wd] dest, oldval 9967 // bne- midMBB 9968 // loop2MBB: 9969 // st[bhwd]cx. newval, ptr 9970 // bne- loopMBB 9971 // b exitBB 9972 // midMBB: 9973 // st[bhwd]cx. dest, ptr 9974 // exitBB: 9975 BB = loop1MBB; 9976 BuildMI(BB, dl, TII->get(LoadMnemonic), dest) 9977 .addReg(ptrA).addReg(ptrB); 9978 BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0) 9979 .addReg(oldval).addReg(dest); 9980 BuildMI(BB, dl, TII->get(PPC::BCC)) 9981 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); 9982 BB->addSuccessor(loop2MBB); 9983 BB->addSuccessor(midMBB); 9984 9985 BB = loop2MBB; 9986 BuildMI(BB, dl, TII->get(StoreMnemonic)) 9987 .addReg(newval).addReg(ptrA).addReg(ptrB); 9988 BuildMI(BB, dl, TII->get(PPC::BCC)) 9989 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); 9990 BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); 9991 BB->addSuccessor(loop1MBB); 9992 BB->addSuccessor(exitMBB); 9993 9994 BB = midMBB; 9995 BuildMI(BB, dl, TII->get(StoreMnemonic)) 9996 .addReg(dest).addReg(ptrA).addReg(ptrB); 9997 BB->addSuccessor(exitMBB); 9998 9999 // exitMBB: 10000 // ... 10001 BB = exitMBB; 10002 } else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 || 10003 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) { 10004 // We must use 64-bit registers for addresses when targeting 64-bit, 10005 // since we're actually doing arithmetic on them. Other registers 10006 // can be 32-bit. 
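    // The expansion below parallels EmitPartwordAtomicBinary: load the word
    // containing the addressed byte or halfword with lwarx, mask out that
    // lane, compare it against the shifted expected value, and on a match
    // merge in the shifted new value and try to commit it with stwcx.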
10007 bool is64bit = Subtarget.isPPC64(); 10008 bool isLittleEndian = Subtarget.isLittleEndian(); 10009 bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; 10010 10011 unsigned dest = MI.getOperand(0).getReg(); 10012 unsigned ptrA = MI.getOperand(1).getReg(); 10013 unsigned ptrB = MI.getOperand(2).getReg(); 10014 unsigned oldval = MI.getOperand(3).getReg(); 10015 unsigned newval = MI.getOperand(4).getReg(); 10016 DebugLoc dl = MI.getDebugLoc(); 10017 10018 MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); 10019 MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); 10020 MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); 10021 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); 10022 F->insert(It, loop1MBB); 10023 F->insert(It, loop2MBB); 10024 F->insert(It, midMBB); 10025 F->insert(It, exitMBB); 10026 exitMBB->splice(exitMBB->begin(), BB, 10027 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10028 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 10029 10030 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10031 const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass 10032 : &PPC::GPRCRegClass; 10033 unsigned PtrReg = RegInfo.createVirtualRegister(RC); 10034 unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); 10035 unsigned ShiftReg = 10036 isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC); 10037 unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC); 10038 unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC); 10039 unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC); 10040 unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC); 10041 unsigned MaskReg = RegInfo.createVirtualRegister(RC); 10042 unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); 10043 unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); 10044 unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); 10045 unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); 10046 unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); 10047 unsigned Ptr1Reg; 10048 unsigned TmpReg = RegInfo.createVirtualRegister(RC); 10049 unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; 10050 // thisMBB: 10051 // ... 10052 // fallthrough --> loopMBB 10053 BB->addSuccessor(loop1MBB); 10054 10055 // The 4-byte load must be aligned, while a char or short may be 10056 // anywhere in the word. Hence all this nasty bookkeeping code. 10057 // add ptr1, ptrA, ptrB [copy if ptrA==0] 10058 // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] 10059 // xori shift, shift1, 24 [16] 10060 // rlwinm ptr, ptr1, 0, 0, 29 10061 // slw newval2, newval, shift 10062 // slw oldval2, oldval,shift 10063 // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] 10064 // slw mask, mask2, shift 10065 // and newval3, newval2, mask 10066 // and oldval3, oldval2, mask 10067 // loop1MBB: 10068 // lwarx tmpDest, ptr 10069 // and tmp, tmpDest, mask 10070 // cmpw tmp, oldval3 10071 // bne- midMBB 10072 // loop2MBB: 10073 // andc tmp2, tmpDest, mask 10074 // or tmp4, tmp2, newval3 10075 // stwcx. tmp4, ptr 10076 // bne- loop1MBB 10077 // b exitBB 10078 // midMBB: 10079 // stwcx. tmpDest, ptr 10080 // exitBB: 10081 // srw dest, tmpDest, shift 10082 if (ptrA != ZeroReg) { 10083 Ptr1Reg = RegInfo.createVirtualRegister(RC); 10084 BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) 10085 .addReg(ptrA).addReg(ptrB); 10086 } else { 10087 Ptr1Reg = ptrB; 10088 } 10089 BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) 10090 .addImm(3).addImm(27).addImm(is8bit ? 
28 : 27); 10091 if (!isLittleEndian) 10092 BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) 10093 .addReg(Shift1Reg).addImm(is8bit ? 24 : 16); 10094 if (is64bit) 10095 BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) 10096 .addReg(Ptr1Reg).addImm(0).addImm(61); 10097 else 10098 BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) 10099 .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); 10100 BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg) 10101 .addReg(newval).addReg(ShiftReg); 10102 BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg) 10103 .addReg(oldval).addReg(ShiftReg); 10104 if (is8bit) 10105 BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); 10106 else { 10107 BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); 10108 BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg) 10109 .addReg(Mask3Reg).addImm(65535); 10110 } 10111 BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) 10112 .addReg(Mask2Reg).addReg(ShiftReg); 10113 BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg) 10114 .addReg(NewVal2Reg).addReg(MaskReg); 10115 BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg) 10116 .addReg(OldVal2Reg).addReg(MaskReg); 10117 10118 BB = loop1MBB; 10119 BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) 10120 .addReg(ZeroReg).addReg(PtrReg); 10121 BuildMI(BB, dl, TII->get(PPC::AND),TmpReg) 10122 .addReg(TmpDestReg).addReg(MaskReg); 10123 BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0) 10124 .addReg(TmpReg).addReg(OldVal3Reg); 10125 BuildMI(BB, dl, TII->get(PPC::BCC)) 10126 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); 10127 BB->addSuccessor(loop2MBB); 10128 BB->addSuccessor(midMBB); 10129 10130 BB = loop2MBB; 10131 BuildMI(BB, dl, TII->get(PPC::ANDC),Tmp2Reg) 10132 .addReg(TmpDestReg).addReg(MaskReg); 10133 BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg) 10134 .addReg(Tmp2Reg).addReg(NewVal3Reg); 10135 BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg) 10136 .addReg(ZeroReg).addReg(PtrReg); 10137 BuildMI(BB, dl, TII->get(PPC::BCC)) 10138 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); 10139 BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); 10140 BB->addSuccessor(loop1MBB); 10141 BB->addSuccessor(exitMBB); 10142 10143 BB = midMBB; 10144 BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg) 10145 .addReg(ZeroReg).addReg(PtrReg); 10146 BB->addSuccessor(exitMBB); 10147 10148 // exitMBB: 10149 // ... 10150 BB = exitMBB; 10151 BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg) 10152 .addReg(ShiftReg); 10153 } else if (MI.getOpcode() == PPC::FADDrtz) { 10154 // This pseudo performs an FADD with rounding mode temporarily forced 10155 // to round-to-zero. We emit this via custom inserter since the FPSCR 10156 // is not modeled at the SelectionDAG level. 10157 unsigned Dest = MI.getOperand(0).getReg(); 10158 unsigned Src1 = MI.getOperand(1).getReg(); 10159 unsigned Src2 = MI.getOperand(2).getReg(); 10160 DebugLoc dl = MI.getDebugLoc(); 10161 10162 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10163 unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); 10164 10165 // Save FPSCR value. 10166 BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg); 10167 10168 // Set rounding mode to round-to-zero. 10169 BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31); 10170 BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30); 10171 10172 // Perform addition. 10173 BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2); 10174 10175 // Restore FPSCR value. 
10176 BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg); 10177 } else if (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || 10178 MI.getOpcode() == PPC::ANDIo_1_GT_BIT || 10179 MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || 10180 MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) { 10181 unsigned Opcode = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || 10182 MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) 10183 ? PPC::ANDIo8 10184 : PPC::ANDIo; 10185 bool isEQ = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || 10186 MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8); 10187 10188 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10189 unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ? 10190 &PPC::GPRCRegClass : 10191 &PPC::G8RCRegClass); 10192 10193 DebugLoc dl = MI.getDebugLoc(); 10194 BuildMI(*BB, MI, dl, TII->get(Opcode), Dest) 10195 .addReg(MI.getOperand(1).getReg()) 10196 .addImm(1); 10197 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), 10198 MI.getOperand(0).getReg()) 10199 .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT); 10200 } else if (MI.getOpcode() == PPC::TCHECK_RET) { 10201 DebugLoc Dl = MI.getDebugLoc(); 10202 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10203 unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); 10204 BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg); 10205 return BB; 10206 } else { 10207 llvm_unreachable("Unexpected instr type to insert"); 10208 } 10209 10210 MI.eraseFromParent(); // The pseudo instruction is gone now. 10211 return BB; 10212 } 10213 10214 //===----------------------------------------------------------------------===// 10215 // Target Optimization Hooks 10216 //===----------------------------------------------------------------------===// 10217 10218 static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) { 10219 // For the estimates, convergence is quadratic, so we essentially double the 10220 // number of digits correct after every iteration. For both FRE and FRSQRTE, 10221 // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(), 10222 // this is 2^-14. IEEE float has 23 digits and double has 52 digits. 10223 int RefinementSteps = Subtarget.hasRecipPrec() ? 
    1 : 3;
  if (VT.getScalarType() == MVT::f64)
    RefinementSteps++;
  return RefinementSteps;
}

SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                           int Enabled, int &RefinementSteps,
                                           bool &UseOneConstNR,
                                           bool Reciprocal) const {
  EVT VT = Operand.getValueType();
  if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
      (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);

    UseOneConstNR = true;
    return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
  }
  return SDValue();
}

SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
                                            int Enabled,
                                            int &RefinementSteps) const {
  EVT VT = Operand.getValueType();
  if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
      (VT == MVT::f64 && Subtarget.hasFRE()) ||
      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
    return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
  }
  return SDValue();
}

unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
  // Note: This functionality is used only when unsafe-fp-math is enabled, and
  // on cores with reciprocal estimates (which are used when unsafe-fp-math is
  // enabled for division), this functionality is redundant with the default
  // combiner logic (once the division -> reciprocal/multiply transformation
  // has taken place). As a result, this matters more for older cores than for
  // newer ones.

  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
  // reciprocal if there are two or more FDIVs (for embedded cores with only
  // one FP pipeline) or three or more FDIVs (for generic OOO cores).
  switch (Subtarget.getDarwinDirective()) {
  default:
    return 3;
  case PPC::DIR_440:
  case PPC::DIR_A2:
  case PPC::DIR_E500mc:
  case PPC::DIR_E5500:
    return 2;
  }
}

// isConsecutiveLSLoc needs to work even if all adds have not yet been
// collapsed, and so we need to look through chains of them.
static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
                                      int64_t& Offset, SelectionDAG &DAG) {
  if (DAG.isBaseWithConstantOffset(Loc)) {
    Base = Loc.getOperand(0);
    Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();

    // The base might itself be a base plus an offset, and if so, accumulate
    // that as well.
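    // For example, for (add (add X, 16), 8) this accumulates Base == X and
    // Offset == 24.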
10298 getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG); 10299 } 10300 } 10301 10302 static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base, 10303 unsigned Bytes, int Dist, 10304 SelectionDAG &DAG) { 10305 if (VT.getSizeInBits() / 8 != Bytes) 10306 return false; 10307 10308 SDValue BaseLoc = Base->getBasePtr(); 10309 if (Loc.getOpcode() == ISD::FrameIndex) { 10310 if (BaseLoc.getOpcode() != ISD::FrameIndex) 10311 return false; 10312 const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 10313 int FI = cast<FrameIndexSDNode>(Loc)->getIndex(); 10314 int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex(); 10315 int FS = MFI.getObjectSize(FI); 10316 int BFS = MFI.getObjectSize(BFI); 10317 if (FS != BFS || FS != (int)Bytes) return false; 10318 return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes); 10319 } 10320 10321 SDValue Base1 = Loc, Base2 = BaseLoc; 10322 int64_t Offset1 = 0, Offset2 = 0; 10323 getBaseWithConstantOffset(Loc, Base1, Offset1, DAG); 10324 getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG); 10325 if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes)) 10326 return true; 10327 10328 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10329 const GlobalValue *GV1 = nullptr; 10330 const GlobalValue *GV2 = nullptr; 10331 Offset1 = 0; 10332 Offset2 = 0; 10333 bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1); 10334 bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); 10335 if (isGA1 && isGA2 && GV1 == GV2) 10336 return Offset1 == (Offset2 + Dist*Bytes); 10337 return false; 10338 } 10339 10340 // Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does 10341 // not enforce equality of the chain operands. 10342 static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, 10343 unsigned Bytes, int Dist, 10344 SelectionDAG &DAG) { 10345 if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) { 10346 EVT VT = LS->getMemoryVT(); 10347 SDValue Loc = LS->getBasePtr(); 10348 return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG); 10349 } 10350 10351 if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { 10352 EVT VT; 10353 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 10354 default: return false; 10355 case Intrinsic::ppc_qpx_qvlfd: 10356 case Intrinsic::ppc_qpx_qvlfda: 10357 VT = MVT::v4f64; 10358 break; 10359 case Intrinsic::ppc_qpx_qvlfs: 10360 case Intrinsic::ppc_qpx_qvlfsa: 10361 VT = MVT::v4f32; 10362 break; 10363 case Intrinsic::ppc_qpx_qvlfcd: 10364 case Intrinsic::ppc_qpx_qvlfcda: 10365 VT = MVT::v2f64; 10366 break; 10367 case Intrinsic::ppc_qpx_qvlfcs: 10368 case Intrinsic::ppc_qpx_qvlfcsa: 10369 VT = MVT::v2f32; 10370 break; 10371 case Intrinsic::ppc_qpx_qvlfiwa: 10372 case Intrinsic::ppc_qpx_qvlfiwz: 10373 case Intrinsic::ppc_altivec_lvx: 10374 case Intrinsic::ppc_altivec_lvxl: 10375 case Intrinsic::ppc_vsx_lxvw4x: 10376 case Intrinsic::ppc_vsx_lxvw4x_be: 10377 VT = MVT::v4i32; 10378 break; 10379 case Intrinsic::ppc_vsx_lxvd2x: 10380 case Intrinsic::ppc_vsx_lxvd2x_be: 10381 VT = MVT::v2f64; 10382 break; 10383 case Intrinsic::ppc_altivec_lvebx: 10384 VT = MVT::i8; 10385 break; 10386 case Intrinsic::ppc_altivec_lvehx: 10387 VT = MVT::i16; 10388 break; 10389 case Intrinsic::ppc_altivec_lvewx: 10390 VT = MVT::i32; 10391 break; 10392 } 10393 10394 return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG); 10395 } 10396 10397 if (N->getOpcode() == ISD::INTRINSIC_VOID) { 10398 EVT VT; 10399 switch 
(cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 10400 default: return false; 10401 case Intrinsic::ppc_qpx_qvstfd: 10402 case Intrinsic::ppc_qpx_qvstfda: 10403 VT = MVT::v4f64; 10404 break; 10405 case Intrinsic::ppc_qpx_qvstfs: 10406 case Intrinsic::ppc_qpx_qvstfsa: 10407 VT = MVT::v4f32; 10408 break; 10409 case Intrinsic::ppc_qpx_qvstfcd: 10410 case Intrinsic::ppc_qpx_qvstfcda: 10411 VT = MVT::v2f64; 10412 break; 10413 case Intrinsic::ppc_qpx_qvstfcs: 10414 case Intrinsic::ppc_qpx_qvstfcsa: 10415 VT = MVT::v2f32; 10416 break; 10417 case Intrinsic::ppc_qpx_qvstfiw: 10418 case Intrinsic::ppc_qpx_qvstfiwa: 10419 case Intrinsic::ppc_altivec_stvx: 10420 case Intrinsic::ppc_altivec_stvxl: 10421 case Intrinsic::ppc_vsx_stxvw4x: 10422 VT = MVT::v4i32; 10423 break; 10424 case Intrinsic::ppc_vsx_stxvd2x: 10425 VT = MVT::v2f64; 10426 break; 10427 case Intrinsic::ppc_vsx_stxvw4x_be: 10428 VT = MVT::v4i32; 10429 break; 10430 case Intrinsic::ppc_vsx_stxvd2x_be: 10431 VT = MVT::v2f64; 10432 break; 10433 case Intrinsic::ppc_altivec_stvebx: 10434 VT = MVT::i8; 10435 break; 10436 case Intrinsic::ppc_altivec_stvehx: 10437 VT = MVT::i16; 10438 break; 10439 case Intrinsic::ppc_altivec_stvewx: 10440 VT = MVT::i32; 10441 break; 10442 } 10443 10444 return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG); 10445 } 10446 10447 return false; 10448 } 10449 10450 // Return true is there is a nearyby consecutive load to the one provided 10451 // (regardless of alignment). We search up and down the chain, looking though 10452 // token factors and other loads (but nothing else). As a result, a true result 10453 // indicates that it is safe to create a new consecutive load adjacent to the 10454 // load provided. 10455 static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { 10456 SDValue Chain = LD->getChain(); 10457 EVT VT = LD->getMemoryVT(); 10458 10459 SmallSet<SDNode *, 16> LoadRoots; 10460 SmallVector<SDNode *, 8> Queue(1, Chain.getNode()); 10461 SmallSet<SDNode *, 16> Visited; 10462 10463 // First, search up the chain, branching to follow all token-factor operands. 10464 // If we find a consecutive load, then we're done, otherwise, record all 10465 // nodes just above the top-level loads and token factors. 10466 while (!Queue.empty()) { 10467 SDNode *ChainNext = Queue.pop_back_val(); 10468 if (!Visited.insert(ChainNext).second) 10469 continue; 10470 10471 if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) { 10472 if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) 10473 return true; 10474 10475 if (!Visited.count(ChainLD->getChain().getNode())) 10476 Queue.push_back(ChainLD->getChain().getNode()); 10477 } else if (ChainNext->getOpcode() == ISD::TokenFactor) { 10478 for (const SDUse &O : ChainNext->ops()) 10479 if (!Visited.count(O.getNode())) 10480 Queue.push_back(O.getNode()); 10481 } else 10482 LoadRoots.insert(ChainNext); 10483 } 10484 10485 // Second, search down the chain, starting from the top-level nodes recorded 10486 // in the first phase. These top-level nodes are the nodes just above all 10487 // loads and token factors. Starting with their uses, recursively look though 10488 // all loads (just the chain uses) and token factors to find a consecutive 10489 // load. 
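  // The Visited set is reused for this second walk (it is cleared just
  // below), so each node between the chain roots and the original load is
  // examined at most once.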
10490 Visited.clear(); 10491 Queue.clear(); 10492 10493 for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(), 10494 IE = LoadRoots.end(); I != IE; ++I) { 10495 Queue.push_back(*I); 10496 10497 while (!Queue.empty()) { 10498 SDNode *LoadRoot = Queue.pop_back_val(); 10499 if (!Visited.insert(LoadRoot).second) 10500 continue; 10501 10502 if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot)) 10503 if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) 10504 return true; 10505 10506 for (SDNode::use_iterator UI = LoadRoot->use_begin(), 10507 UE = LoadRoot->use_end(); UI != UE; ++UI) 10508 if (((isa<MemSDNode>(*UI) && 10509 cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) || 10510 UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI)) 10511 Queue.push_back(*UI); 10512 } 10513 } 10514 10515 return false; 10516 } 10517 10518 /// This function is called when we have proved that a SETCC node can be replaced 10519 /// by subtraction (and other supporting instructions) so that the result of 10520 /// comparison is kept in a GPR instead of CR. This function is purely for 10521 /// codegen purposes and has some flags to guide the codegen process. 10522 static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement, 10523 bool Swap, SDLoc &DL, SelectionDAG &DAG) { 10524 assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); 10525 10526 // Zero extend the operands to the largest legal integer. Originally, they 10527 // must be of a strictly smaller size. 10528 auto Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0), 10529 DAG.getConstant(Size, DL, MVT::i32)); 10530 auto Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1), 10531 DAG.getConstant(Size, DL, MVT::i32)); 10532 10533 // Swap if needed. Depends on the condition code. 10534 if (Swap) 10535 std::swap(Op0, Op1); 10536 10537 // Subtract extended integers. 10538 auto SubNode = DAG.getNode(ISD::SUB, DL, MVT::i64, Op0, Op1); 10539 10540 // Move the sign bit to the least significant position and zero out the rest. 10541 // Now the least significant bit carries the result of original comparison. 10542 auto Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, SubNode, 10543 DAG.getConstant(Size - 1, DL, MVT::i32)); 10544 auto Final = Shifted; 10545 10546 // Complement the result if needed. Based on the condition code. 10547 if (Complement) 10548 Final = DAG.getNode(ISD::XOR, DL, MVT::i64, Shifted, 10549 DAG.getConstant(1, DL, MVT::i64)); 10550 10551 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final); 10552 } 10553 10554 SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N, 10555 DAGCombinerInfo &DCI) const { 10556 assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); 10557 10558 SelectionDAG &DAG = DCI.DAG; 10559 SDLoc DL(N); 10560 10561 // Size of integers being compared has a critical role in the following 10562 // analysis, so we prefer to do this when all types are legal. 
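  // For example, with i64 as the largest legal integer type, an i32
  //   (setult %a, %b)
  // whose uses are all zero-extensions becomes (illustrative)
  //   (trunc (srl (sub (zext %a), (zext %b)), 63))
  // so the comparison result is computed in a GPR rather than a CR field.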
10563 if (!DCI.isAfterLegalizeVectorOps()) 10564 return SDValue(); 10565 10566 // If all users of SETCC extend its value to a legal integer type 10567 // then we replace SETCC with a subtraction 10568 for (SDNode::use_iterator UI = N->use_begin(), 10569 UE = N->use_end(); UI != UE; ++UI) { 10570 if (UI->getOpcode() != ISD::ZERO_EXTEND) 10571 return SDValue(); 10572 } 10573 10574 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 10575 auto OpSize = N->getOperand(0).getValueSizeInBits(); 10576 10577 unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits(); 10578 10579 if (OpSize < Size) { 10580 switch (CC) { 10581 default: break; 10582 case ISD::SETULT: 10583 return generateEquivalentSub(N, Size, false, false, DL, DAG); 10584 case ISD::SETULE: 10585 return generateEquivalentSub(N, Size, true, true, DL, DAG); 10586 case ISD::SETUGT: 10587 return generateEquivalentSub(N, Size, false, true, DL, DAG); 10588 case ISD::SETUGE: 10589 return generateEquivalentSub(N, Size, true, false, DL, DAG); 10590 } 10591 } 10592 10593 return SDValue(); 10594 } 10595 10596 SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, 10597 DAGCombinerInfo &DCI) const { 10598 SelectionDAG &DAG = DCI.DAG; 10599 SDLoc dl(N); 10600 10601 assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits"); 10602 // If we're tracking CR bits, we need to be careful that we don't have: 10603 // trunc(binary-ops(zext(x), zext(y))) 10604 // or 10605 // trunc(binary-ops(binary-ops(zext(x), zext(y)), ...) 10606 // such that we're unnecessarily moving things into GPRs when it would be 10607 // better to keep them in CR bits. 10608 10609 // Note that trunc here can be an actual i1 trunc, or can be the effective 10610 // truncation that comes from a setcc or select_cc. 10611 if (N->getOpcode() == ISD::TRUNCATE && 10612 N->getValueType(0) != MVT::i1) 10613 return SDValue(); 10614 10615 if (N->getOperand(0).getValueType() != MVT::i32 && 10616 N->getOperand(0).getValueType() != MVT::i64) 10617 return SDValue(); 10618 10619 if (N->getOpcode() == ISD::SETCC || 10620 N->getOpcode() == ISD::SELECT_CC) { 10621 // If we're looking at a comparison, then we need to make sure that the 10622 // high bits (all except for the first) don't matter the result. 10623 ISD::CondCode CC = 10624 cast<CondCodeSDNode>(N->getOperand( 10625 N->getOpcode() == ISD::SETCC ? 2 : 4))->get(); 10626 unsigned OpBits = N->getOperand(0).getValueSizeInBits(); 10627 10628 if (ISD::isSignedIntSetCC(CC)) { 10629 if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits || 10630 DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits) 10631 return SDValue(); 10632 } else if (ISD::isUnsignedIntSetCC(CC)) { 10633 if (!DAG.MaskedValueIsZero(N->getOperand(0), 10634 APInt::getHighBitsSet(OpBits, OpBits-1)) || 10635 !DAG.MaskedValueIsZero(N->getOperand(1), 10636 APInt::getHighBitsSet(OpBits, OpBits-1))) 10637 return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI) 10638 : SDValue()); 10639 } else { 10640 // This is neither a signed nor an unsigned comparison, just make sure 10641 // that the high bits are equal. 10642 KnownBits Op1Known, Op2Known; 10643 DAG.computeKnownBits(N->getOperand(0), Op1Known); 10644 DAG.computeKnownBits(N->getOperand(1), Op2Known); 10645 10646 // We don't really care about what is known about the first bit (if 10647 // anything), so clear it in all masks prior to comparing them. 
10648 Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0); 10649 Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0); 10650 10651 if (Op1Known.Zero != Op2Known.Zero || Op1Known.One != Op2Known.One) 10652 return SDValue(); 10653 } 10654 } 10655 10656 // We now know that the higher-order bits are irrelevant, we just need to 10657 // make sure that all of the intermediate operations are bit operations, and 10658 // all inputs are extensions. 10659 if (N->getOperand(0).getOpcode() != ISD::AND && 10660 N->getOperand(0).getOpcode() != ISD::OR && 10661 N->getOperand(0).getOpcode() != ISD::XOR && 10662 N->getOperand(0).getOpcode() != ISD::SELECT && 10663 N->getOperand(0).getOpcode() != ISD::SELECT_CC && 10664 N->getOperand(0).getOpcode() != ISD::TRUNCATE && 10665 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND && 10666 N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && 10667 N->getOperand(0).getOpcode() != ISD::ANY_EXTEND) 10668 return SDValue(); 10669 10670 if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) && 10671 N->getOperand(1).getOpcode() != ISD::AND && 10672 N->getOperand(1).getOpcode() != ISD::OR && 10673 N->getOperand(1).getOpcode() != ISD::XOR && 10674 N->getOperand(1).getOpcode() != ISD::SELECT && 10675 N->getOperand(1).getOpcode() != ISD::SELECT_CC && 10676 N->getOperand(1).getOpcode() != ISD::TRUNCATE && 10677 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND && 10678 N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND && 10679 N->getOperand(1).getOpcode() != ISD::ANY_EXTEND) 10680 return SDValue(); 10681 10682 SmallVector<SDValue, 4> Inputs; 10683 SmallVector<SDValue, 8> BinOps, PromOps; 10684 SmallPtrSet<SDNode *, 16> Visited; 10685 10686 for (unsigned i = 0; i < 2; ++i) { 10687 if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND || 10688 N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND || 10689 N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) && 10690 N->getOperand(i).getOperand(0).getValueType() == MVT::i1) || 10691 isa<ConstantSDNode>(N->getOperand(i))) 10692 Inputs.push_back(N->getOperand(i)); 10693 else 10694 BinOps.push_back(N->getOperand(i)); 10695 10696 if (N->getOpcode() == ISD::TRUNCATE) 10697 break; 10698 } 10699 10700 // Visit all inputs, collect all binary operations (and, or, xor and 10701 // select) that are all fed by extensions. 10702 while (!BinOps.empty()) { 10703 SDValue BinOp = BinOps.back(); 10704 BinOps.pop_back(); 10705 10706 if (!Visited.insert(BinOp.getNode()).second) 10707 continue; 10708 10709 PromOps.push_back(BinOp); 10710 10711 for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { 10712 // The condition of the select is not promoted. 
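      // (For SELECT_CC, only the true/false values in operands 2 and 3 are
      // promoted; the compared operands and the condition code are left
      // untouched.)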
10713 if (BinOp.getOpcode() == ISD::SELECT && i == 0) 10714 continue; 10715 if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) 10716 continue; 10717 10718 if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || 10719 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || 10720 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) && 10721 BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) || 10722 isa<ConstantSDNode>(BinOp.getOperand(i))) { 10723 Inputs.push_back(BinOp.getOperand(i)); 10724 } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || 10725 BinOp.getOperand(i).getOpcode() == ISD::OR || 10726 BinOp.getOperand(i).getOpcode() == ISD::XOR || 10727 BinOp.getOperand(i).getOpcode() == ISD::SELECT || 10728 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC || 10729 BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || 10730 BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || 10731 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || 10732 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) { 10733 BinOps.push_back(BinOp.getOperand(i)); 10734 } else { 10735 // We have an input that is not an extension or another binary 10736 // operation; we'll abort this transformation. 10737 return SDValue(); 10738 } 10739 } 10740 } 10741 10742 // Make sure that this is a self-contained cluster of operations (which 10743 // is not quite the same thing as saying that everything has only one 10744 // use). 10745 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 10746 if (isa<ConstantSDNode>(Inputs[i])) 10747 continue; 10748 10749 for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), 10750 UE = Inputs[i].getNode()->use_end(); 10751 UI != UE; ++UI) { 10752 SDNode *User = *UI; 10753 if (User != N && !Visited.count(User)) 10754 return SDValue(); 10755 10756 // Make sure that we're not going to promote the non-output-value 10757 // operand(s) or SELECT or SELECT_CC. 10758 // FIXME: Although we could sometimes handle this, and it does occur in 10759 // practice that one of the condition inputs to the select is also one of 10760 // the outputs, we currently can't deal with this. 10761 if (User->getOpcode() == ISD::SELECT) { 10762 if (User->getOperand(0) == Inputs[i]) 10763 return SDValue(); 10764 } else if (User->getOpcode() == ISD::SELECT_CC) { 10765 if (User->getOperand(0) == Inputs[i] || 10766 User->getOperand(1) == Inputs[i]) 10767 return SDValue(); 10768 } 10769 } 10770 } 10771 10772 for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { 10773 for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), 10774 UE = PromOps[i].getNode()->use_end(); 10775 UI != UE; ++UI) { 10776 SDNode *User = *UI; 10777 if (User != N && !Visited.count(User)) 10778 return SDValue(); 10779 10780 // Make sure that we're not going to promote the non-output-value 10781 // operand(s) or SELECT or SELECT_CC. 10782 // FIXME: Although we could sometimes handle this, and it does occur in 10783 // practice that one of the condition inputs to the select is also one of 10784 // the outputs, we currently can't deal with this. 10785 if (User->getOpcode() == ISD::SELECT) { 10786 if (User->getOperand(0) == PromOps[i]) 10787 return SDValue(); 10788 } else if (User->getOpcode() == ISD::SELECT_CC) { 10789 if (User->getOperand(0) == PromOps[i] || 10790 User->getOperand(1) == PromOps[i]) 10791 return SDValue(); 10792 } 10793 } 10794 } 10795 10796 // Replace all inputs with the extension operand. 
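  // Each non-constant input is an extension of an i1 value, so dropping the
  // extension here rewires its users to consume the i1 value directly.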
10797 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 10798 // Constants may have users outside the cluster of to-be-promoted nodes, 10799 // and so we need to replace those as we do the promotions. 10800 if (isa<ConstantSDNode>(Inputs[i])) 10801 continue; 10802 else 10803 DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); 10804 } 10805 10806 std::list<HandleSDNode> PromOpHandles; 10807 for (auto &PromOp : PromOps) 10808 PromOpHandles.emplace_back(PromOp); 10809 10810 // Replace all operations (these are all the same, but have a different 10811 // (i1) return type). DAG.getNode will validate that the types of 10812 // a binary operator match, so go through the list in reverse so that 10813 // we've likely promoted both operands first. Any intermediate truncations or 10814 // extensions disappear. 10815 while (!PromOpHandles.empty()) { 10816 SDValue PromOp = PromOpHandles.back().getValue(); 10817 PromOpHandles.pop_back(); 10818 10819 if (PromOp.getOpcode() == ISD::TRUNCATE || 10820 PromOp.getOpcode() == ISD::SIGN_EXTEND || 10821 PromOp.getOpcode() == ISD::ZERO_EXTEND || 10822 PromOp.getOpcode() == ISD::ANY_EXTEND) { 10823 if (!isa<ConstantSDNode>(PromOp.getOperand(0)) && 10824 PromOp.getOperand(0).getValueType() != MVT::i1) { 10825 // The operand is not yet ready (see comment below). 10826 PromOpHandles.emplace_front(PromOp); 10827 continue; 10828 } 10829 10830 SDValue RepValue = PromOp.getOperand(0); 10831 if (isa<ConstantSDNode>(RepValue)) 10832 RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue); 10833 10834 DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue); 10835 continue; 10836 } 10837 10838 unsigned C; 10839 switch (PromOp.getOpcode()) { 10840 default: C = 0; break; 10841 case ISD::SELECT: C = 1; break; 10842 case ISD::SELECT_CC: C = 2; break; 10843 } 10844 10845 if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) && 10846 PromOp.getOperand(C).getValueType() != MVT::i1) || 10847 (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) && 10848 PromOp.getOperand(C+1).getValueType() != MVT::i1)) { 10849 // The to-be-promoted operands of this node have not yet been 10850 // promoted (this should be rare because we're going through the 10851 // list backward, but if one of the operands has several users in 10852 // this cluster of to-be-promoted nodes, it is possible). 10853 PromOpHandles.emplace_front(PromOp); 10854 continue; 10855 } 10856 10857 SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(), 10858 PromOp.getNode()->op_end()); 10859 10860 // If there are any constant inputs, make sure they're replaced now. 10861 for (unsigned i = 0; i < 2; ++i) 10862 if (isa<ConstantSDNode>(Ops[C+i])) 10863 Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]); 10864 10865 DAG.ReplaceAllUsesOfValueWith(PromOp, 10866 DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops)); 10867 } 10868 10869 // Now we're left with the initial truncation itself. 10870 if (N->getOpcode() == ISD::TRUNCATE) 10871 return N->getOperand(0); 10872 10873 // Otherwise, this is a comparison. The operands to be compared have just 10874 // changed type (to i1), but everything else is the same. 10875 return SDValue(N, 0); 10876 } 10877 10878 SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, 10879 DAGCombinerInfo &DCI) const { 10880 SelectionDAG &DAG = DCI.DAG; 10881 SDLoc dl(N); 10882 10883 // If we're tracking CR bits, we need to be careful that we don't have: 10884 // zext(binary-ops(trunc(x), trunc(y))) 10885 // or 10886 // zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) 
10887 // such that we're unnecessarily moving things into CR bits that can more 10888 // efficiently stay in GPRs. Note that if we're not certain that the high 10889 // bits are set as required by the final extension, we still may need to do 10890 // some masking to get the proper behavior. 10891 10892 // This same functionality is important on PPC64 when dealing with 10893 // 32-to-64-bit extensions; these occur often when 32-bit values are used as 10894 // the return values of functions. Because it is so similar, it is handled 10895 // here as well. 10896 10897 if (N->getValueType(0) != MVT::i32 && 10898 N->getValueType(0) != MVT::i64) 10899 return SDValue(); 10900 10901 if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) || 10902 (N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64()))) 10903 return SDValue(); 10904 10905 if (N->getOperand(0).getOpcode() != ISD::AND && 10906 N->getOperand(0).getOpcode() != ISD::OR && 10907 N->getOperand(0).getOpcode() != ISD::XOR && 10908 N->getOperand(0).getOpcode() != ISD::SELECT && 10909 N->getOperand(0).getOpcode() != ISD::SELECT_CC) 10910 return SDValue(); 10911 10912 SmallVector<SDValue, 4> Inputs; 10913 SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps; 10914 SmallPtrSet<SDNode *, 16> Visited; 10915 10916 // Visit all inputs, collect all binary operations (and, or, xor and 10917 // select) that are all fed by truncations. 10918 while (!BinOps.empty()) { 10919 SDValue BinOp = BinOps.back(); 10920 BinOps.pop_back(); 10921 10922 if (!Visited.insert(BinOp.getNode()).second) 10923 continue; 10924 10925 PromOps.push_back(BinOp); 10926 10927 for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { 10928 // The condition of the select is not promoted. 10929 if (BinOp.getOpcode() == ISD::SELECT && i == 0) 10930 continue; 10931 if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) 10932 continue; 10933 10934 if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || 10935 isa<ConstantSDNode>(BinOp.getOperand(i))) { 10936 Inputs.push_back(BinOp.getOperand(i)); 10937 } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || 10938 BinOp.getOperand(i).getOpcode() == ISD::OR || 10939 BinOp.getOperand(i).getOpcode() == ISD::XOR || 10940 BinOp.getOperand(i).getOpcode() == ISD::SELECT || 10941 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) { 10942 BinOps.push_back(BinOp.getOperand(i)); 10943 } else { 10944 // We have an input that is not a truncation or another binary 10945 // operation; we'll abort this transformation. 10946 return SDValue(); 10947 } 10948 } 10949 } 10950 10951 // The operands of a select that must be truncated when the select is 10952 // promoted because the operand is actually part of the to-be-promoted set. 10953 DenseMap<SDNode *, EVT> SelectTruncOp[2]; 10954 10955 // Make sure that this is a self-contained cluster of operations (which 10956 // is not quite the same thing as saying that everything has only one 10957 // use). 10958 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 10959 if (isa<ConstantSDNode>(Inputs[i])) 10960 continue; 10961 10962 for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), 10963 UE = Inputs[i].getNode()->use_end(); 10964 UI != UE; ++UI) { 10965 SDNode *User = *UI; 10966 if (User != N && !Visited.count(User)) 10967 return SDValue(); 10968 10969 // If we're going to promote the non-output-value operand(s) or SELECT or 10970 // SELECT_CC, record them for truncation. 
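      // Unlike the i1 combine above, such a use does not block the
      // transformation; the recorded operand is simply truncated back to its
      // original type when the select is rebuilt below.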
10971 if (User->getOpcode() == ISD::SELECT) { 10972 if (User->getOperand(0) == Inputs[i]) 10973 SelectTruncOp[0].insert(std::make_pair(User, 10974 User->getOperand(0).getValueType())); 10975 } else if (User->getOpcode() == ISD::SELECT_CC) { 10976 if (User->getOperand(0) == Inputs[i]) 10977 SelectTruncOp[0].insert(std::make_pair(User, 10978 User->getOperand(0).getValueType())); 10979 if (User->getOperand(1) == Inputs[i]) 10980 SelectTruncOp[1].insert(std::make_pair(User, 10981 User->getOperand(1).getValueType())); 10982 } 10983 } 10984 } 10985 10986 for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { 10987 for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), 10988 UE = PromOps[i].getNode()->use_end(); 10989 UI != UE; ++UI) { 10990 SDNode *User = *UI; 10991 if (User != N && !Visited.count(User)) 10992 return SDValue(); 10993 10994 // If we're going to promote the non-output-value operand(s) or SELECT or 10995 // SELECT_CC, record them for truncation. 10996 if (User->getOpcode() == ISD::SELECT) { 10997 if (User->getOperand(0) == PromOps[i]) 10998 SelectTruncOp[0].insert(std::make_pair(User, 10999 User->getOperand(0).getValueType())); 11000 } else if (User->getOpcode() == ISD::SELECT_CC) { 11001 if (User->getOperand(0) == PromOps[i]) 11002 SelectTruncOp[0].insert(std::make_pair(User, 11003 User->getOperand(0).getValueType())); 11004 if (User->getOperand(1) == PromOps[i]) 11005 SelectTruncOp[1].insert(std::make_pair(User, 11006 User->getOperand(1).getValueType())); 11007 } 11008 } 11009 } 11010 11011 unsigned PromBits = N->getOperand(0).getValueSizeInBits(); 11012 bool ReallyNeedsExt = false; 11013 if (N->getOpcode() != ISD::ANY_EXTEND) { 11014 // If all of the inputs are not already sign/zero extended, then 11015 // we'll still need to do that at the end. 11016 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 11017 if (isa<ConstantSDNode>(Inputs[i])) 11018 continue; 11019 11020 unsigned OpBits = 11021 Inputs[i].getOperand(0).getValueSizeInBits(); 11022 assert(PromBits < OpBits && "Truncation not to a smaller bit count?"); 11023 11024 if ((N->getOpcode() == ISD::ZERO_EXTEND && 11025 !DAG.MaskedValueIsZero(Inputs[i].getOperand(0), 11026 APInt::getHighBitsSet(OpBits, 11027 OpBits-PromBits))) || 11028 (N->getOpcode() == ISD::SIGN_EXTEND && 11029 DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) < 11030 (OpBits-(PromBits-1)))) { 11031 ReallyNeedsExt = true; 11032 break; 11033 } 11034 } 11035 } 11036 11037 // Replace all inputs, either with the truncation operand, or a 11038 // truncation or extension to the final output type. 11039 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 11040 // Constant inputs need to be replaced with the to-be-promoted nodes that 11041 // use them because they might have users outside of the cluster of 11042 // promoted nodes. 
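    // They are instead extended or truncated in place when each promoted
    // operation is rebuilt below.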
11043 if (isa<ConstantSDNode>(Inputs[i])) 11044 continue; 11045 11046 SDValue InSrc = Inputs[i].getOperand(0); 11047 if (Inputs[i].getValueType() == N->getValueType(0)) 11048 DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc); 11049 else if (N->getOpcode() == ISD::SIGN_EXTEND) 11050 DAG.ReplaceAllUsesOfValueWith(Inputs[i], 11051 DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0))); 11052 else if (N->getOpcode() == ISD::ZERO_EXTEND) 11053 DAG.ReplaceAllUsesOfValueWith(Inputs[i], 11054 DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0))); 11055 else 11056 DAG.ReplaceAllUsesOfValueWith(Inputs[i], 11057 DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0))); 11058 } 11059 11060 std::list<HandleSDNode> PromOpHandles; 11061 for (auto &PromOp : PromOps) 11062 PromOpHandles.emplace_back(PromOp); 11063 11064 // Replace all operations (these are all the same, but have a different 11065 // (promoted) return type). DAG.getNode will validate that the types of 11066 // a binary operator match, so go through the list in reverse so that 11067 // we've likely promoted both operands first. 11068 while (!PromOpHandles.empty()) { 11069 SDValue PromOp = PromOpHandles.back().getValue(); 11070 PromOpHandles.pop_back(); 11071 11072 unsigned C; 11073 switch (PromOp.getOpcode()) { 11074 default: C = 0; break; 11075 case ISD::SELECT: C = 1; break; 11076 case ISD::SELECT_CC: C = 2; break; 11077 } 11078 11079 if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) && 11080 PromOp.getOperand(C).getValueType() != N->getValueType(0)) || 11081 (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) && 11082 PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) { 11083 // The to-be-promoted operands of this node have not yet been 11084 // promoted (this should be rare because we're going through the 11085 // list backward, but if one of the operands has several users in 11086 // this cluster of to-be-promoted nodes, it is possible). 11087 PromOpHandles.emplace_front(PromOp); 11088 continue; 11089 } 11090 11091 // For SELECT and SELECT_CC nodes, we do a similar check for any 11092 // to-be-promoted comparison inputs. 11093 if (PromOp.getOpcode() == ISD::SELECT || 11094 PromOp.getOpcode() == ISD::SELECT_CC) { 11095 if ((SelectTruncOp[0].count(PromOp.getNode()) && 11096 PromOp.getOperand(0).getValueType() != N->getValueType(0)) || 11097 (SelectTruncOp[1].count(PromOp.getNode()) && 11098 PromOp.getOperand(1).getValueType() != N->getValueType(0))) { 11099 PromOpHandles.emplace_front(PromOp); 11100 continue; 11101 } 11102 } 11103 11104 SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(), 11105 PromOp.getNode()->op_end()); 11106 11107 // If this node has constant inputs, then they'll need to be promoted here. 11108 for (unsigned i = 0; i < 2; ++i) { 11109 if (!isa<ConstantSDNode>(Ops[C+i])) 11110 continue; 11111 if (Ops[C+i].getValueType() == N->getValueType(0)) 11112 continue; 11113 11114 if (N->getOpcode() == ISD::SIGN_EXTEND) 11115 Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); 11116 else if (N->getOpcode() == ISD::ZERO_EXTEND) 11117 Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); 11118 else 11119 Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); 11120 } 11121 11122 // If we've promoted the comparison inputs of a SELECT or SELECT_CC, 11123 // truncate them again to the original value type. 
11124 if (PromOp.getOpcode() == ISD::SELECT || 11125 PromOp.getOpcode() == ISD::SELECT_CC) { 11126 auto SI0 = SelectTruncOp[0].find(PromOp.getNode()); 11127 if (SI0 != SelectTruncOp[0].end()) 11128 Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]); 11129 auto SI1 = SelectTruncOp[1].find(PromOp.getNode()); 11130 if (SI1 != SelectTruncOp[1].end()) 11131 Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]); 11132 } 11133 11134 DAG.ReplaceAllUsesOfValueWith(PromOp, 11135 DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops)); 11136 } 11137 11138 // Now we're left with the initial extension itself. 11139 if (!ReallyNeedsExt) 11140 return N->getOperand(0); 11141 11142 // To zero extend, just mask off everything except for the first bit (in the 11143 // i1 case). 11144 if (N->getOpcode() == ISD::ZERO_EXTEND) 11145 return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0), 11146 DAG.getConstant(APInt::getLowBitsSet( 11147 N->getValueSizeInBits(0), PromBits), 11148 dl, N->getValueType(0))); 11149 11150 assert(N->getOpcode() == ISD::SIGN_EXTEND && 11151 "Invalid extension type"); 11152 EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout()); 11153 SDValue ShiftCst = 11154 DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy); 11155 return DAG.getNode( 11156 ISD::SRA, dl, N->getValueType(0), 11157 DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst), 11158 ShiftCst); 11159 } 11160 11161 /// \brief Reduces the number of fp-to-int conversion when building a vector. 11162 /// 11163 /// If this vector is built out of floating to integer conversions, 11164 /// transform it to a vector built out of floating point values followed by a 11165 /// single floating to integer conversion of the vector. 11166 /// Namely (build_vector (fptosi $A), (fptosi $B), ...) 11167 /// becomes (fptosi (build_vector ($A, $B, ...))) 11168 SDValue PPCTargetLowering:: 11169 combineElementTruncationToVectorTruncation(SDNode *N, 11170 DAGCombinerInfo &DCI) const { 11171 assert(N->getOpcode() == ISD::BUILD_VECTOR && 11172 "Should be called with a BUILD_VECTOR node"); 11173 11174 SelectionDAG &DAG = DCI.DAG; 11175 SDLoc dl(N); 11176 11177 SDValue FirstInput = N->getOperand(0); 11178 assert(FirstInput.getOpcode() == PPCISD::MFVSR && 11179 "The input operand must be an fp-to-int conversion."); 11180 11181 // This combine happens after legalization so the fp_to_[su]i nodes are 11182 // already converted to PPCSISD nodes. 11183 unsigned FirstConversion = FirstInput.getOperand(0).getOpcode(); 11184 if (FirstConversion == PPCISD::FCTIDZ || 11185 FirstConversion == PPCISD::FCTIDUZ || 11186 FirstConversion == PPCISD::FCTIWZ || 11187 FirstConversion == PPCISD::FCTIWUZ) { 11188 bool IsSplat = true; 11189 bool Is32Bit = FirstConversion == PPCISD::FCTIWZ || 11190 FirstConversion == PPCISD::FCTIWUZ; 11191 EVT SrcVT = FirstInput.getOperand(0).getValueType(); 11192 SmallVector<SDValue, 4> Ops; 11193 EVT TargetVT = N->getValueType(0); 11194 for (int i = 0, e = N->getNumOperands(); i < e; ++i) { 11195 if (N->getOperand(i).getOpcode() != PPCISD::MFVSR) 11196 return SDValue(); 11197 unsigned NextConversion = N->getOperand(i).getOperand(0).getOpcode(); 11198 if (NextConversion != FirstConversion) 11199 return SDValue(); 11200 if (N->getOperand(i) != FirstInput) 11201 IsSplat = false; 11202 } 11203 11204 // If this is a splat, we leave it as-is since there will be only a single 11205 // fp-to-int conversion followed by a splat of the integer. 
This is better 11206 // for 32-bit and smaller ints and neutral for 64-bit ints. 11207 if (IsSplat) 11208 return SDValue(); 11209 11210 // Now that we know we have the right type of node, get its operands 11211 for (int i = 0, e = N->getNumOperands(); i < e; ++i) { 11212 SDValue In = N->getOperand(i).getOperand(0); 11213 // For 32-bit values, we need to add an FP_ROUND node. 11214 if (Is32Bit) { 11215 if (In.isUndef()) 11216 Ops.push_back(DAG.getUNDEF(SrcVT)); 11217 else { 11218 SDValue Trunc = DAG.getNode(ISD::FP_ROUND, dl, 11219 MVT::f32, In.getOperand(0), 11220 DAG.getIntPtrConstant(1, dl)); 11221 Ops.push_back(Trunc); 11222 } 11223 } else 11224 Ops.push_back(In.isUndef() ? DAG.getUNDEF(SrcVT) : In.getOperand(0)); 11225 } 11226 11227 unsigned Opcode; 11228 if (FirstConversion == PPCISD::FCTIDZ || 11229 FirstConversion == PPCISD::FCTIWZ) 11230 Opcode = ISD::FP_TO_SINT; 11231 else 11232 Opcode = ISD::FP_TO_UINT; 11233 11234 EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32; 11235 SDValue BV = DAG.getBuildVector(NewVT, dl, Ops); 11236 return DAG.getNode(Opcode, dl, TargetVT, BV); 11237 } 11238 return SDValue(); 11239 } 11240 11241 /// \brief Reduce the number of loads when building a vector. 11242 /// 11243 /// Building a vector out of multiple loads can be converted to a load 11244 /// of the vector type if the loads are consecutive. If the loads are 11245 /// consecutive but in descending order, a shuffle is added at the end 11246 /// to reorder the vector. 11247 static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) { 11248 assert(N->getOpcode() == ISD::BUILD_VECTOR && 11249 "Should be called with a BUILD_VECTOR node"); 11250 11251 SDLoc dl(N); 11252 bool InputsAreConsecutiveLoads = true; 11253 bool InputsAreReverseConsecutive = true; 11254 unsigned ElemSize = N->getValueType(0).getScalarSizeInBits() / 8; 11255 SDValue FirstInput = N->getOperand(0); 11256 bool IsRoundOfExtLoad = false; 11257 11258 if (FirstInput.getOpcode() == ISD::FP_ROUND && 11259 FirstInput.getOperand(0).getOpcode() == ISD::LOAD) { 11260 LoadSDNode *LD = dyn_cast<LoadSDNode>(FirstInput.getOperand(0)); 11261 IsRoundOfExtLoad = LD->getExtensionType() == ISD::EXTLOAD; 11262 } 11263 // Not a build vector of (possibly fp_rounded) loads. 11264 if (!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) 11265 return SDValue(); 11266 11267 for (int i = 1, e = N->getNumOperands(); i < e; ++i) { 11268 // If any inputs are fp_round(extload), they all must be. 11269 if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND) 11270 return SDValue(); 11271 11272 SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(i).getOperand(0) : 11273 N->getOperand(i); 11274 if (NextInput.getOpcode() != ISD::LOAD) 11275 return SDValue(); 11276 11277 SDValue PreviousInput = 11278 IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1); 11279 LoadSDNode *LD1 = dyn_cast<LoadSDNode>(PreviousInput); 11280 LoadSDNode *LD2 = dyn_cast<LoadSDNode>(NextInput); 11281 11282 // If any inputs are fp_round(extload), they all must be. 11283 if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD) 11284 return SDValue(); 11285 11286 if (!isConsecutiveLS(LD2, LD1, ElemSize, 1, DAG)) 11287 InputsAreConsecutiveLoads = false; 11288 if (!isConsecutiveLS(LD1, LD2, ElemSize, 1, DAG)) 11289 InputsAreReverseConsecutive = false; 11290 11291 // Exit early if the loads are neither consecutive nor reverse consecutive. 
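    // (Descending addresses are still profitable: in that case the wide load
    // is done from the lowest address and a reversing shuffle is added
    // afterwards.)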
11292 if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive) 11293 return SDValue(); 11294 } 11295 11296 assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) && 11297 "The loads cannot be both consecutive and reverse consecutive."); 11298 11299 SDValue FirstLoadOp = 11300 IsRoundOfExtLoad ? FirstInput.getOperand(0) : FirstInput; 11301 SDValue LastLoadOp = 11302 IsRoundOfExtLoad ? N->getOperand(N->getNumOperands()-1).getOperand(0) : 11303 N->getOperand(N->getNumOperands()-1); 11304 11305 LoadSDNode *LD1 = dyn_cast<LoadSDNode>(FirstLoadOp); 11306 LoadSDNode *LDL = dyn_cast<LoadSDNode>(LastLoadOp); 11307 if (InputsAreConsecutiveLoads) { 11308 assert(LD1 && "Input needs to be a LoadSDNode."); 11309 return DAG.getLoad(N->getValueType(0), dl, LD1->getChain(), 11310 LD1->getBasePtr(), LD1->getPointerInfo(), 11311 LD1->getAlignment()); 11312 } 11313 if (InputsAreReverseConsecutive) { 11314 assert(LDL && "Input needs to be a LoadSDNode."); 11315 SDValue Load = DAG.getLoad(N->getValueType(0), dl, LDL->getChain(), 11316 LDL->getBasePtr(), LDL->getPointerInfo(), 11317 LDL->getAlignment()); 11318 SmallVector<int, 16> Ops; 11319 for (int i = N->getNumOperands() - 1; i >= 0; i--) 11320 Ops.push_back(i); 11321 11322 return DAG.getVectorShuffle(N->getValueType(0), dl, Load, 11323 DAG.getUNDEF(N->getValueType(0)), Ops); 11324 } 11325 return SDValue(); 11326 } 11327 11328 // This function adds the required vector_shuffle needed to get 11329 // the elements of the vector extract in the correct position 11330 // as specified by the CorrectElems encoding. 11331 static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG, 11332 SDValue Input, uint64_t Elems, 11333 uint64_t CorrectElems) { 11334 SDLoc dl(N); 11335 11336 unsigned NumElems = Input.getValueType().getVectorNumElements(); 11337 SmallVector<int, 16> ShuffleMask(NumElems, -1); 11338 11339 // Knowing the element indices being extracted from the original 11340 // vector and the order in which they're being inserted, just put 11341 // them at element indices required for the instruction. 11342 for (unsigned i = 0; i < N->getNumOperands(); i++) { 11343 if (DAG.getDataLayout().isLittleEndian()) 11344 ShuffleMask[CorrectElems & 0xF] = Elems & 0xF; 11345 else 11346 ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4; 11347 CorrectElems = CorrectElems >> 8; 11348 Elems = Elems >> 8; 11349 } 11350 11351 SDValue Shuffle = 11352 DAG.getVectorShuffle(Input.getValueType(), dl, Input, 11353 DAG.getUNDEF(Input.getValueType()), ShuffleMask); 11354 11355 EVT Ty = N->getValueType(0); 11356 SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle); 11357 return BV; 11358 } 11359 11360 // Look for build vector patterns where input operands come from sign 11361 // extended vector_extract elements of specific indices. If the correct indices 11362 // aren't used, add a vector shuffle to fix up the indices and create a new 11363 // PPCISD:SExtVElems node which selects the vector sign extend instructions 11364 // during instruction selection. 11365 static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) { 11366 // This array encodes the indices that the vector sign extend instructions 11367 // extract from when extending from one type to another for both BE and LE. 11368 // The right nibble of each byte corresponds to the LE incides. 11369 // and the left nibble of each byte corresponds to the BE incides. 
11370 // For example: 0x3074B8FC byte->word 11371 // For LE: the allowed indices are: 0x0,0x4,0x8,0xC 11372 // For BE: the allowed indices are: 0x3,0x7,0xB,0xF 11373 // For example: 0x000070F8 byte->double word 11374 // For LE: the allowed indices are: 0x0,0x8 11375 // For BE: the allowed indices are: 0x7,0xF 11376 uint64_t TargetElems[] = { 11377 0x3074B8FC, // b->w 11378 0x000070F8, // b->d 11379 0x10325476, // h->w 11380 0x00003074, // h->d 11381 0x00001032, // w->d 11382 }; 11383 11384 uint64_t Elems = 0; 11385 int Index; 11386 SDValue Input; 11387 11388 auto isSExtOfVecExtract = [&](SDValue Op) -> bool { 11389 if (!Op) 11390 return false; 11391 if (Op.getOpcode() != ISD::SIGN_EXTEND) 11392 return false; 11393 11394 SDValue Extract = Op.getOperand(0); 11395 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 11396 return false; 11397 11398 ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1)); 11399 if (!ExtOp) 11400 return false; 11401 11402 Index = ExtOp->getZExtValue(); 11403 if (Input && Input != Extract.getOperand(0)) 11404 return false; 11405 11406 if (!Input) 11407 Input = Extract.getOperand(0); 11408 11409 Elems = Elems << 8; 11410 Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4; 11411 Elems |= Index; 11412 11413 return true; 11414 }; 11415 11416 // If the build vector operands aren't sign extended vector extracts, 11417 // of the same input vector, then return. 11418 for (unsigned i = 0; i < N->getNumOperands(); i++) { 11419 if (!isSExtOfVecExtract(N->getOperand(i))) { 11420 return SDValue(); 11421 } 11422 } 11423 11424 // If the vector extract indicies are not correct, add the appropriate 11425 // vector_shuffle. 11426 int TgtElemArrayIdx; 11427 int InputSize = Input.getValueType().getScalarSizeInBits(); 11428 int OutputSize = N->getValueType(0).getScalarSizeInBits(); 11429 if (InputSize + OutputSize == 40) 11430 TgtElemArrayIdx = 0; 11431 else if (InputSize + OutputSize == 72) 11432 TgtElemArrayIdx = 1; 11433 else if (InputSize + OutputSize == 48) 11434 TgtElemArrayIdx = 2; 11435 else if (InputSize + OutputSize == 80) 11436 TgtElemArrayIdx = 3; 11437 else if (InputSize + OutputSize == 96) 11438 TgtElemArrayIdx = 4; 11439 else 11440 return SDValue(); 11441 11442 uint64_t CorrectElems = TargetElems[TgtElemArrayIdx]; 11443 CorrectElems = DAG.getDataLayout().isLittleEndian() 11444 ? CorrectElems & 0x0F0F0F0F0F0F0F0F 11445 : CorrectElems & 0xF0F0F0F0F0F0F0F0; 11446 if (Elems != CorrectElems) { 11447 return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems); 11448 } 11449 11450 // Regular lowering will catch cases where a shuffle is not needed. 11451 return SDValue(); 11452 } 11453 11454 SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, 11455 DAGCombinerInfo &DCI) const { 11456 assert(N->getOpcode() == ISD::BUILD_VECTOR && 11457 "Should be called with a BUILD_VECTOR node"); 11458 11459 SelectionDAG &DAG = DCI.DAG; 11460 SDLoc dl(N); 11461 11462 if (!Subtarget.hasVSX()) 11463 return SDValue(); 11464 11465 // The target independent DAG combiner will leave a build_vector of 11466 // float-to-int conversions intact. We can generate MUCH better code for 11467 // a float-to-int conversion of a vector of floats. 11468 SDValue FirstInput = N->getOperand(0); 11469 if (FirstInput.getOpcode() == PPCISD::MFVSR) { 11470 SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI); 11471 if (Reduced) 11472 return Reduced; 11473 } 11474 11475 // If we're building a vector out of consecutive loads, just load that 11476 // vector type. 
11477 SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG); 11478 if (Reduced) 11479 return Reduced; 11480 11481 // If we're building a vector out of extended elements from another vector 11482 // we have P9 vector integer extend instructions. 11483 if (Subtarget.hasP9Altivec()) { 11484 Reduced = combineBVOfVecSExt(N, DAG); 11485 if (Reduced) 11486 return Reduced; 11487 } 11488 11489 11490 if (N->getValueType(0) != MVT::v2f64) 11491 return SDValue(); 11492 11493 // Looking for: 11494 // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1)) 11495 if (FirstInput.getOpcode() != ISD::SINT_TO_FP && 11496 FirstInput.getOpcode() != ISD::UINT_TO_FP) 11497 return SDValue(); 11498 if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP && 11499 N->getOperand(1).getOpcode() != ISD::UINT_TO_FP) 11500 return SDValue(); 11501 if (FirstInput.getOpcode() != N->getOperand(1).getOpcode()) 11502 return SDValue(); 11503 11504 SDValue Ext1 = FirstInput.getOperand(0); 11505 SDValue Ext2 = N->getOperand(1).getOperand(0); 11506 if(Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 11507 Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 11508 return SDValue(); 11509 11510 ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1)); 11511 ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1)); 11512 if (!Ext1Op || !Ext2Op) 11513 return SDValue(); 11514 if (Ext1.getValueType() != MVT::i32 || 11515 Ext2.getValueType() != MVT::i32) 11516 if (Ext1.getOperand(0) != Ext2.getOperand(0)) 11517 return SDValue(); 11518 11519 int FirstElem = Ext1Op->getZExtValue(); 11520 int SecondElem = Ext2Op->getZExtValue(); 11521 int SubvecIdx; 11522 if (FirstElem == 0 && SecondElem == 1) 11523 SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0; 11524 else if (FirstElem == 2 && SecondElem == 3) 11525 SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1; 11526 else 11527 return SDValue(); 11528 11529 SDValue SrcVec = Ext1.getOperand(0); 11530 auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ? 11531 PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP; 11532 return DAG.getNode(NodeType, dl, MVT::v2f64, 11533 SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl)); 11534 } 11535 11536 SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, 11537 DAGCombinerInfo &DCI) const { 11538 assert((N->getOpcode() == ISD::SINT_TO_FP || 11539 N->getOpcode() == ISD::UINT_TO_FP) && 11540 "Need an int -> FP conversion node here"); 11541 11542 if (useSoftFloat() || !Subtarget.has64BitSupport()) 11543 return SDValue(); 11544 11545 SelectionDAG &DAG = DCI.DAG; 11546 SDLoc dl(N); 11547 SDValue Op(N, 0); 11548 11549 SDValue FirstOperand(Op.getOperand(0)); 11550 bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD && 11551 (FirstOperand.getValueType() == MVT::i8 || 11552 FirstOperand.getValueType() == MVT::i16); 11553 if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) { 11554 bool Signed = N->getOpcode() == ISD::SINT_TO_FP; 11555 bool DstDouble = Op.getValueType() == MVT::f64; 11556 unsigned ConvOp = Signed ? 11557 (DstDouble ? PPCISD::FCFID : PPCISD::FCFIDS) : 11558 (DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS); 11559 SDValue WidthConst = 11560 DAG.getIntPtrConstant(FirstOperand.getValueType() == MVT::i8 ? 
1 : 2, 11561 dl, false); 11562 LoadSDNode *LDN = cast<LoadSDNode>(FirstOperand.getNode()); 11563 SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst }; 11564 SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl, 11565 DAG.getVTList(MVT::f64, MVT::Other), 11566 Ops, MVT::i8, LDN->getMemOperand()); 11567 11568 // For signed conversion, we need to sign-extend the value in the VSR 11569 if (Signed) { 11570 SDValue ExtOps[] = { Ld, WidthConst }; 11571 SDValue Ext = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps); 11572 return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ext); 11573 } else 11574 return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld); 11575 } 11576 11577 // Don't handle ppc_fp128 here or i1 conversions. 11578 if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) 11579 return SDValue(); 11580 if (Op.getOperand(0).getValueType() == MVT::i1) 11581 return SDValue(); 11582 11583 // For i32 intermediate values, unfortunately, the conversion functions 11584 // leave the upper 32 bits of the value are undefined. Within the set of 11585 // scalar instructions, we have no method for zero- or sign-extending the 11586 // value. Thus, we cannot handle i32 intermediate values here. 11587 if (Op.getOperand(0).getValueType() == MVT::i32) 11588 return SDValue(); 11589 11590 assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && 11591 "UINT_TO_FP is supported only with FPCVT"); 11592 11593 // If we have FCFIDS, then use it when converting to single-precision. 11594 // Otherwise, convert to double-precision and then round. 11595 unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 11596 ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS 11597 : PPCISD::FCFIDS) 11598 : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU 11599 : PPCISD::FCFID); 11600 MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 11601 ? MVT::f32 11602 : MVT::f64; 11603 11604 // If we're converting from a float, to an int, and back to a float again, 11605 // then we don't need the store/load pair at all. 11606 if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT && 11607 Subtarget.hasFPCVT()) || 11608 (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) { 11609 SDValue Src = Op.getOperand(0).getOperand(0); 11610 if (Src.getValueType() == MVT::f32) { 11611 Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); 11612 DCI.AddToWorklist(Src.getNode()); 11613 } else if (Src.getValueType() != MVT::f64) { 11614 // Make sure that we don't pick up a ppc_fp128 source value. 11615 return SDValue(); 11616 } 11617 11618 unsigned FCTOp = 11619 Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ : 11620 PPCISD::FCTIDUZ; 11621 11622 SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src); 11623 SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp); 11624 11625 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) { 11626 FP = DAG.getNode(ISD::FP_ROUND, dl, 11627 MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); 11628 DCI.AddToWorklist(FP.getNode()); 11629 } 11630 11631 return FP; 11632 } 11633 11634 return SDValue(); 11635 } 11636 11637 // expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for 11638 // builtins) into loads with swaps. 
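// For example, when swaps are required, a little-endian (v2i64 (load $ptr))
// becomes (bitcast (v2f64 (XXSWAPD (LXVD2X $ptr)))), chains omitted
// (illustrative); a later pass may remove swap pairs that cancel out.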
11639 SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N, 11640 DAGCombinerInfo &DCI) const { 11641 SelectionDAG &DAG = DCI.DAG; 11642 SDLoc dl(N); 11643 SDValue Chain; 11644 SDValue Base; 11645 MachineMemOperand *MMO; 11646 11647 switch (N->getOpcode()) { 11648 default: 11649 llvm_unreachable("Unexpected opcode for little endian VSX load"); 11650 case ISD::LOAD: { 11651 LoadSDNode *LD = cast<LoadSDNode>(N); 11652 Chain = LD->getChain(); 11653 Base = LD->getBasePtr(); 11654 MMO = LD->getMemOperand(); 11655 // If the MMO suggests this isn't a load of a full vector, leave 11656 // things alone. For a built-in, we have to make the change for 11657 // correctness, so if there is a size problem that will be a bug. 11658 if (MMO->getSize() < 16) 11659 return SDValue(); 11660 break; 11661 } 11662 case ISD::INTRINSIC_W_CHAIN: { 11663 MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N); 11664 Chain = Intrin->getChain(); 11665 // Similarly to the store case below, Intrin->getBasePtr() doesn't get 11666 // us what we want. Get operand 2 instead. 11667 Base = Intrin->getOperand(2); 11668 MMO = Intrin->getMemOperand(); 11669 break; 11670 } 11671 } 11672 11673 MVT VecTy = N->getValueType(0).getSimpleVT(); 11674 11675 // Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is 11676 // aligned and the type is a vector with elements up to 4 bytes 11677 if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16) 11678 && VecTy.getScalarSizeInBits() <= 32 ) { 11679 return SDValue(); 11680 } 11681 11682 SDValue LoadOps[] = { Chain, Base }; 11683 SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl, 11684 DAG.getVTList(MVT::v2f64, MVT::Other), 11685 LoadOps, MVT::v2f64, MMO); 11686 11687 DCI.AddToWorklist(Load.getNode()); 11688 Chain = Load.getValue(1); 11689 SDValue Swap = DAG.getNode( 11690 PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load); 11691 DCI.AddToWorklist(Swap.getNode()); 11692 11693 // Add a bitcast if the resulting load type doesn't match v2f64. 11694 if (VecTy != MVT::v2f64) { 11695 SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap); 11696 DCI.AddToWorklist(N.getNode()); 11697 // Package {bitcast value, swap's chain} to match Load's shape. 11698 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other), 11699 N, Swap.getValue(1)); 11700 } 11701 11702 return Swap; 11703 } 11704 11705 // expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for 11706 // builtins) into stores with swaps. 11707 SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N, 11708 DAGCombinerInfo &DCI) const { 11709 SelectionDAG &DAG = DCI.DAG; 11710 SDLoc dl(N); 11711 SDValue Chain; 11712 SDValue Base; 11713 unsigned SrcOpnd; 11714 MachineMemOperand *MMO; 11715 11716 switch (N->getOpcode()) { 11717 default: 11718 llvm_unreachable("Unexpected opcode for little endian VSX store"); 11719 case ISD::STORE: { 11720 StoreSDNode *ST = cast<StoreSDNode>(N); 11721 Chain = ST->getChain(); 11722 Base = ST->getBasePtr(); 11723 MMO = ST->getMemOperand(); 11724 SrcOpnd = 1; 11725 // If the MMO suggests this isn't a store of a full vector, leave 11726 // things alone. For a built-in, we have to make the change for 11727 // correctness, so if there is a size problem that will be a bug. 11728 if (MMO->getSize() < 16) 11729 return SDValue(); 11730 break; 11731 } 11732 case ISD::INTRINSIC_VOID: { 11733 MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N); 11734 Chain = Intrin->getChain(); 11735 // Intrin->getBasePtr() oddly does not get what we want. 
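    // For these store intrinsics the address is operand 3, and the value
    // being stored is operand 2.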
11736 Base = Intrin->getOperand(3); 11737 MMO = Intrin->getMemOperand(); 11738 SrcOpnd = 2; 11739 break; 11740 } 11741 } 11742 11743 SDValue Src = N->getOperand(SrcOpnd); 11744 MVT VecTy = Src.getValueType().getSimpleVT(); 11745 11746 // Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the load is 11747 // aligned and the type is a vector with elements up to 4 bytes 11748 if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16) 11749 && VecTy.getScalarSizeInBits() <= 32 ) { 11750 return SDValue(); 11751 } 11752 11753 // All stores are done as v2f64 and possible bit cast. 11754 if (VecTy != MVT::v2f64) { 11755 Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src); 11756 DCI.AddToWorklist(Src.getNode()); 11757 } 11758 11759 SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl, 11760 DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src); 11761 DCI.AddToWorklist(Swap.getNode()); 11762 Chain = Swap.getValue(1); 11763 SDValue StoreOps[] = { Chain, Swap, Base }; 11764 SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl, 11765 DAG.getVTList(MVT::Other), 11766 StoreOps, VecTy, MMO); 11767 DCI.AddToWorklist(Store.getNode()); 11768 return Store; 11769 } 11770 11771 SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, 11772 DAGCombinerInfo &DCI) const { 11773 SelectionDAG &DAG = DCI.DAG; 11774 SDLoc dl(N); 11775 switch (N->getOpcode()) { 11776 default: break; 11777 case ISD::SHL: 11778 return combineSHL(N, DCI); 11779 case ISD::SRA: 11780 return combineSRA(N, DCI); 11781 case ISD::SRL: 11782 return combineSRL(N, DCI); 11783 case PPCISD::SHL: 11784 if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. 11785 return N->getOperand(0); 11786 break; 11787 case PPCISD::SRL: 11788 if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0. 11789 return N->getOperand(0); 11790 break; 11791 case PPCISD::SRA: 11792 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 11793 if (C->isNullValue() || // 0 >>s V -> 0. 11794 C->isAllOnesValue()) // -1 >>s V -> -1. 11795 return N->getOperand(0); 11796 } 11797 break; 11798 case ISD::SIGN_EXTEND: 11799 case ISD::ZERO_EXTEND: 11800 case ISD::ANY_EXTEND: 11801 return DAGCombineExtBoolTrunc(N, DCI); 11802 case ISD::TRUNCATE: 11803 case ISD::SETCC: 11804 case ISD::SELECT_CC: 11805 return DAGCombineTruncBoolExt(N, DCI); 11806 case ISD::SINT_TO_FP: 11807 case ISD::UINT_TO_FP: 11808 return combineFPToIntToFP(N, DCI); 11809 case ISD::STORE: { 11810 EVT Op1VT = N->getOperand(1).getValueType(); 11811 bool ValidTypeForStoreFltAsInt = (Op1VT == MVT::i32) || 11812 (Subtarget.hasP9Vector() && (Op1VT == MVT::i8 || Op1VT == MVT::i16)); 11813 11814 // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)). 
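    // When the subtarget has Power9 vector support, stores of i8 and i16
    // results are combined as well, using STXSIX with an explicit width
    // operand (see below).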
    if (Subtarget.hasSTFIWX() && !cast<StoreSDNode>(N)->isTruncatingStore() &&
        N->getOperand(1).getOpcode() == ISD::FP_TO_SINT &&
        ValidTypeForStoreFltAsInt &&
        N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) {
      SDValue Val = N->getOperand(1).getOperand(0);
      if (Val.getValueType() == MVT::f32) {
        Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
        DCI.AddToWorklist(Val.getNode());
      }
      Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val);
      DCI.AddToWorklist(Val.getNode());

      if (Op1VT == MVT::i32) {
        SDValue Ops[] = {
          N->getOperand(0), Val, N->getOperand(2),
          DAG.getValueType(N->getOperand(1).getValueType())
        };

        Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
                DAG.getVTList(MVT::Other), Ops,
                cast<StoreSDNode>(N)->getMemoryVT(),
                cast<StoreSDNode>(N)->getMemOperand());
      } else {
        unsigned WidthInBytes =
          N->getOperand(1).getValueType() == MVT::i8 ? 1 : 2;
        SDValue WidthConst = DAG.getIntPtrConstant(WidthInBytes, dl, false);

        SDValue Ops[] = {
          N->getOperand(0), Val, N->getOperand(2), WidthConst,
          DAG.getValueType(N->getOperand(1).getValueType())
        };
        Val = DAG.getMemIntrinsicNode(PPCISD::STXSIX, dl,
                                      DAG.getVTList(MVT::Other), Ops,
                                      cast<StoreSDNode>(N)->getMemoryVT(),
                                      cast<StoreSDNode>(N)->getMemOperand());
      }

      DCI.AddToWorklist(Val.getNode());
      return Val;
    }

    // Turn STORE (BSWAP) -> sthbrx/stwbrx.
    if (cast<StoreSDNode>(N)->isUnindexed() &&
        N->getOperand(1).getOpcode() == ISD::BSWAP &&
        N->getOperand(1).getNode()->hasOneUse() &&
        (N->getOperand(1).getValueType() == MVT::i32 ||
         N->getOperand(1).getValueType() == MVT::i16 ||
         (Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
          N->getOperand(1).getValueType() == MVT::i64))) {
      SDValue BSwapOp = N->getOperand(1).getOperand(0);
      // Do an any-extend to 32-bits if this is a half-word input.
      if (BSwapOp.getValueType() == MVT::i16)
        BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);

      // If the type of the BSWAP operand is wider than the stored memory
      // width, it needs to be shifted to the right before the STBRX.
      EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
      if (Op1VT.bitsGT(mVT)) {
        int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
        BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp,
                              DAG.getConstant(Shift, dl, MVT::i32));
        // Need to truncate if this is a bswap of i64 stored as i32/i16.
        if (Op1VT == MVT::i64)
          BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp);
      }

      SDValue Ops[] = {
        N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT)
      };
      return
        DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
                                Ops, cast<StoreSDNode>(N)->getMemoryVT(),
                                cast<StoreSDNode>(N)->getMemOperand());
    }

    // For little endian, VSX stores require generating xxswapd/stxvd2x.
    // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
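    // For example (a sketch, on a pre-ISA-3.0 little-endian target):
    //   t2: ch = store t1 (v4i32), ptr
    // is expanded by expandVSXStoreForLE into roughly
    //   t3: v2f64    = bitcast t1
    //   t4: v2f64,ch = PPCISD::XXSWAPD t3
    //   t5: ch       = PPCISD::STXVD2X t4, ptr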
11892 EVT VT = N->getOperand(1).getValueType(); 11893 if (VT.isSimple()) { 11894 MVT StoreVT = VT.getSimpleVT(); 11895 if (Subtarget.needsSwapsForVSXMemOps() && 11896 (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 || 11897 StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32)) 11898 return expandVSXStoreForLE(N, DCI); 11899 } 11900 break; 11901 } 11902 case ISD::LOAD: { 11903 LoadSDNode *LD = cast<LoadSDNode>(N); 11904 EVT VT = LD->getValueType(0); 11905 11906 // For little endian, VSX loads require generating lxvd2x/xxswapd. 11907 // Not needed on ISA 3.0 based CPUs since we have a non-permuting load. 11908 if (VT.isSimple()) { 11909 MVT LoadVT = VT.getSimpleVT(); 11910 if (Subtarget.needsSwapsForVSXMemOps() && 11911 (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 || 11912 LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32)) 11913 return expandVSXLoadForLE(N, DCI); 11914 } 11915 11916 // We sometimes end up with a 64-bit integer load, from which we extract 11917 // two single-precision floating-point numbers. This happens with 11918 // std::complex<float>, and other similar structures, because of the way we 11919 // canonicalize structure copies. However, if we lack direct moves, 11920 // then the final bitcasts from the extracted integer values to the 11921 // floating-point numbers turn into store/load pairs. Even with direct moves, 11922 // just loading the two floating-point numbers is likely better. 11923 auto ReplaceTwoFloatLoad = [&]() { 11924 if (VT != MVT::i64) 11925 return false; 11926 11927 if (LD->getExtensionType() != ISD::NON_EXTLOAD || 11928 LD->isVolatile()) 11929 return false; 11930 11931 // We're looking for a sequence like this: 11932 // t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64 11933 // t16: i64 = srl t13, Constant:i32<32> 11934 // t17: i32 = truncate t16 11935 // t18: f32 = bitcast t17 11936 // t19: i32 = truncate t13 11937 // t20: f32 = bitcast t19 11938 11939 if (!LD->hasNUsesOfValue(2, 0)) 11940 return false; 11941 11942 auto UI = LD->use_begin(); 11943 while (UI.getUse().getResNo() != 0) ++UI; 11944 SDNode *Trunc = *UI++; 11945 while (UI.getUse().getResNo() != 0) ++UI; 11946 SDNode *RightShift = *UI; 11947 if (Trunc->getOpcode() != ISD::TRUNCATE) 11948 std::swap(Trunc, RightShift); 11949 11950 if (Trunc->getOpcode() != ISD::TRUNCATE || 11951 Trunc->getValueType(0) != MVT::i32 || 11952 !Trunc->hasOneUse()) 11953 return false; 11954 if (RightShift->getOpcode() != ISD::SRL || 11955 !isa<ConstantSDNode>(RightShift->getOperand(1)) || 11956 RightShift->getConstantOperandVal(1) != 32 || 11957 !RightShift->hasOneUse()) 11958 return false; 11959 11960 SDNode *Trunc2 = *RightShift->use_begin(); 11961 if (Trunc2->getOpcode() != ISD::TRUNCATE || 11962 Trunc2->getValueType(0) != MVT::i32 || 11963 !Trunc2->hasOneUse()) 11964 return false; 11965 11966 SDNode *Bitcast = *Trunc->use_begin(); 11967 SDNode *Bitcast2 = *Trunc2->use_begin(); 11968 11969 if (Bitcast->getOpcode() != ISD::BITCAST || 11970 Bitcast->getValueType(0) != MVT::f32) 11971 return false; 11972 if (Bitcast2->getOpcode() != ISD::BITCAST || 11973 Bitcast2->getValueType(0) != MVT::f32) 11974 return false; 11975 11976 if (Subtarget.isLittleEndian()) 11977 std::swap(Bitcast, Bitcast2); 11978 11979 // Bitcast has the second float (in memory-layout order) and Bitcast2 11980 // has the first one. 
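      // Roughly, the rewrite below then replaces the i64 load with two f32
      // loads,
      //   t21: f32,ch = load t0, t6
      //   t22: f32,ch = load t21:1, (add t6, 4)
      // and routes them to the two bitcast users in memory-layout order, so
      // no integer-register round trip remains.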
11981 11982 SDValue BasePtr = LD->getBasePtr(); 11983 if (LD->isIndexed()) { 11984 assert(LD->getAddressingMode() == ISD::PRE_INC && 11985 "Non-pre-inc AM on PPC?"); 11986 BasePtr = 11987 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, 11988 LD->getOffset()); 11989 } 11990 11991 auto MMOFlags = 11992 LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile; 11993 SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr, 11994 LD->getPointerInfo(), LD->getAlignment(), 11995 MMOFlags, LD->getAAInfo()); 11996 SDValue AddPtr = 11997 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), 11998 BasePtr, DAG.getIntPtrConstant(4, dl)); 11999 SDValue FloatLoad2 = DAG.getLoad( 12000 MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr, 12001 LD->getPointerInfo().getWithOffset(4), 12002 MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo()); 12003 12004 if (LD->isIndexed()) { 12005 // Note that DAGCombine should re-form any pre-increment load(s) from 12006 // what is produced here if that makes sense. 12007 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr); 12008 } 12009 12010 DCI.CombineTo(Bitcast2, FloatLoad); 12011 DCI.CombineTo(Bitcast, FloatLoad2); 12012 12013 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1), 12014 SDValue(FloatLoad2.getNode(), 1)); 12015 return true; 12016 }; 12017 12018 if (ReplaceTwoFloatLoad()) 12019 return SDValue(N, 0); 12020 12021 EVT MemVT = LD->getMemoryVT(); 12022 Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); 12023 unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); 12024 Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext()); 12025 unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy); 12026 if (LD->isUnindexed() && VT.isVector() && 12027 ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) && 12028 // P8 and later hardware should just use LOAD. 12029 !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 || 12030 VT == MVT::v4i32 || VT == MVT::v4f32)) || 12031 (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) && 12032 LD->getAlignment() >= ScalarABIAlignment)) && 12033 LD->getAlignment() < ABIAlignment) { 12034 // This is a type-legal unaligned Altivec or QPX load. 12035 SDValue Chain = LD->getChain(); 12036 SDValue Ptr = LD->getBasePtr(); 12037 bool isLittleEndian = Subtarget.isLittleEndian(); 12038 12039 // This implements the loading of unaligned vectors as described in 12040 // the venerable Apple Velocity Engine overview. Specifically: 12041 // https://developer.apple.com/hardwaredrivers/ve/alignment.html 12042 // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html 12043 // 12044 // The general idea is to expand a sequence of one or more unaligned 12045 // loads into an alignment-based permutation-control instruction (lvsl 12046 // or lvsr), a series of regular vector loads (which always truncate 12047 // their input address to an aligned address), and a series of 12048 // permutations. The results of these permutations are the requested 12049 // loaded values. The trick is that the last "extra" load is not taken 12050 // from the address you might suspect (sizeof(vector) bytes after the 12051 // last requested load), but rather sizeof(vector) - 1 bytes after the 12052 // last requested vector. The point of this is to avoid a page fault if 12053 // the base address happened to be aligned. 
This works because if the 12054 // base address is aligned, then adding less than a full vector length 12055 // will cause the last vector in the sequence to be (re)loaded. 12056 // Otherwise, the next vector will be fetched as you might suspect was 12057 // necessary. 12058 12059 // We might be able to reuse the permutation generation from 12060 // a different base address offset from this one by an aligned amount. 12061 // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this 12062 // optimization later. 12063 Intrinsic::ID Intr, IntrLD, IntrPerm; 12064 MVT PermCntlTy, PermTy, LDTy; 12065 if (Subtarget.hasAltivec()) { 12066 Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr : 12067 Intrinsic::ppc_altivec_lvsl; 12068 IntrLD = Intrinsic::ppc_altivec_lvx; 12069 IntrPerm = Intrinsic::ppc_altivec_vperm; 12070 PermCntlTy = MVT::v16i8; 12071 PermTy = MVT::v4i32; 12072 LDTy = MVT::v4i32; 12073 } else { 12074 Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld : 12075 Intrinsic::ppc_qpx_qvlpcls; 12076 IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd : 12077 Intrinsic::ppc_qpx_qvlfs; 12078 IntrPerm = Intrinsic::ppc_qpx_qvfperm; 12079 PermCntlTy = MVT::v4f64; 12080 PermTy = MVT::v4f64; 12081 LDTy = MemVT.getSimpleVT(); 12082 } 12083 12084 SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy); 12085 12086 // Create the new MMO for the new base load. It is like the original MMO, 12087 // but represents an area in memory almost twice the vector size centered 12088 // on the original address. If the address is unaligned, we might start 12089 // reading up to (sizeof(vector)-1) bytes below the address of the 12090 // original unaligned load. 12091 MachineFunction &MF = DAG.getMachineFunction(); 12092 MachineMemOperand *BaseMMO = 12093 MF.getMachineMemOperand(LD->getMemOperand(), 12094 -(long)MemVT.getStoreSize()+1, 12095 2*MemVT.getStoreSize()-1); 12096 12097 // Create the new base load. 12098 SDValue LDXIntID = 12099 DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout())); 12100 SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr }; 12101 SDValue BaseLoad = 12102 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, 12103 DAG.getVTList(PermTy, MVT::Other), 12104 BaseLoadOps, LDTy, BaseMMO); 12105 12106 // Note that the value of IncOffset (which is provided to the next 12107 // load's pointer info offset value, and thus used to calculate the 12108 // alignment), and the value of IncValue (which is actually used to 12109 // increment the pointer value) are different! This is because we 12110 // require the next load to appear to be aligned, even though it 12111 // is actually offset from the base pointer by a lesser amount. 12112 int IncOffset = VT.getSizeInBits() / 8; 12113 int IncValue = IncOffset; 12114 12115 // Walk (both up and down) the chain looking for another load at the real 12116 // (aligned) offset (the alignment of the other load does not matter in 12117 // this case). If found, then do not use the offset reduction trick, as 12118 // that will prevent the loads from being later combined (as they would 12119 // otherwise be duplicates). 
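      // For example, for a 16-byte v4i32 load with no consecutive load found,
      // IncValue drops from 16 to 15, so the extra lvx reads from Ptr+15;
      // lvx truncates that address to a 16-byte boundary, so an aligned base
      // pointer never reaches into the next (possibly unmapped) page.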
12120 if (!findConsecutiveLoad(LD, DAG)) 12121 --IncValue; 12122 12123 SDValue Increment = 12124 DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout())); 12125 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 12126 12127 MachineMemOperand *ExtraMMO = 12128 MF.getMachineMemOperand(LD->getMemOperand(), 12129 1, 2*MemVT.getStoreSize()-1); 12130 SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr }; 12131 SDValue ExtraLoad = 12132 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, 12133 DAG.getVTList(PermTy, MVT::Other), 12134 ExtraLoadOps, LDTy, ExtraMMO); 12135 12136 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 12137 BaseLoad.getValue(1), ExtraLoad.getValue(1)); 12138 12139 // Because vperm has a big-endian bias, we must reverse the order 12140 // of the input vectors and complement the permute control vector 12141 // when generating little endian code. We have already handled the 12142 // latter by using lvsr instead of lvsl, so just reverse BaseLoad 12143 // and ExtraLoad here. 12144 SDValue Perm; 12145 if (isLittleEndian) 12146 Perm = BuildIntrinsicOp(IntrPerm, 12147 ExtraLoad, BaseLoad, PermCntl, DAG, dl); 12148 else 12149 Perm = BuildIntrinsicOp(IntrPerm, 12150 BaseLoad, ExtraLoad, PermCntl, DAG, dl); 12151 12152 if (VT != PermTy) 12153 Perm = Subtarget.hasAltivec() ? 12154 DAG.getNode(ISD::BITCAST, dl, VT, Perm) : 12155 DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX 12156 DAG.getTargetConstant(1, dl, MVT::i64)); 12157 // second argument is 1 because this rounding 12158 // is always exact. 12159 12160 // The output of the permutation is our loaded result, the TokenFactor is 12161 // our new chain. 12162 DCI.CombineTo(N, Perm, TF); 12163 return SDValue(N, 0); 12164 } 12165 } 12166 break; 12167 case ISD::INTRINSIC_WO_CHAIN: { 12168 bool isLittleEndian = Subtarget.isLittleEndian(); 12169 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 12170 Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr 12171 : Intrinsic::ppc_altivec_lvsl); 12172 if ((IID == Intr || 12173 IID == Intrinsic::ppc_qpx_qvlpcld || 12174 IID == Intrinsic::ppc_qpx_qvlpcls) && 12175 N->getOperand(1)->getOpcode() == ISD::ADD) { 12176 SDValue Add = N->getOperand(1); 12177 12178 int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ? 12179 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */; 12180 12181 if (DAG.MaskedValueIsZero(Add->getOperand(1), 12182 APInt::getAllOnesValue(Bits /* alignment */) 12183 .zext(Add.getScalarValueSizeInBits()))) { 12184 SDNode *BasePtr = Add->getOperand(0).getNode(); 12185 for (SDNode::use_iterator UI = BasePtr->use_begin(), 12186 UE = BasePtr->use_end(); 12187 UI != UE; ++UI) { 12188 if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && 12189 cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) { 12190 // We've found another LVSL/LVSR, and this address is an aligned 12191 // multiple of that one. The results will be the same, so use the 12192 // one we've just found instead. 
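            // (For instance, lvsl(base) and lvsl(base + 16) yield the same
            // permute control vector, since only the low-order address bits
            // matter; the QPX variants use a 32-byte granule instead.)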
12193 12194 return SDValue(*UI, 0); 12195 } 12196 } 12197 } 12198 12199 if (isa<ConstantSDNode>(Add->getOperand(1))) { 12200 SDNode *BasePtr = Add->getOperand(0).getNode(); 12201 for (SDNode::use_iterator UI = BasePtr->use_begin(), 12202 UE = BasePtr->use_end(); UI != UE; ++UI) { 12203 if (UI->getOpcode() == ISD::ADD && 12204 isa<ConstantSDNode>(UI->getOperand(1)) && 12205 (cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() - 12206 cast<ConstantSDNode>(UI->getOperand(1))->getZExtValue()) % 12207 (1ULL << Bits) == 0) { 12208 SDNode *OtherAdd = *UI; 12209 for (SDNode::use_iterator VI = OtherAdd->use_begin(), 12210 VE = OtherAdd->use_end(); VI != VE; ++VI) { 12211 if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && 12212 cast<ConstantSDNode>(VI->getOperand(0))->getZExtValue() == IID) { 12213 return SDValue(*VI, 0); 12214 } 12215 } 12216 } 12217 } 12218 } 12219 } 12220 } 12221 12222 break; 12223 case ISD::INTRINSIC_W_CHAIN: 12224 // For little endian, VSX loads require generating lxvd2x/xxswapd. 12225 // Not needed on ISA 3.0 based CPUs since we have a non-permuting load. 12226 if (Subtarget.needsSwapsForVSXMemOps()) { 12227 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 12228 default: 12229 break; 12230 case Intrinsic::ppc_vsx_lxvw4x: 12231 case Intrinsic::ppc_vsx_lxvd2x: 12232 return expandVSXLoadForLE(N, DCI); 12233 } 12234 } 12235 break; 12236 case ISD::INTRINSIC_VOID: 12237 // For little endian, VSX stores require generating xxswapd/stxvd2x. 12238 // Not needed on ISA 3.0 based CPUs since we have a non-permuting store. 12239 if (Subtarget.needsSwapsForVSXMemOps()) { 12240 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 12241 default: 12242 break; 12243 case Intrinsic::ppc_vsx_stxvw4x: 12244 case Intrinsic::ppc_vsx_stxvd2x: 12245 return expandVSXStoreForLE(N, DCI); 12246 } 12247 } 12248 break; 12249 case ISD::BSWAP: 12250 // Turn BSWAP (LOAD) -> lhbrx/lwbrx. 12251 if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) && 12252 N->getOperand(0).hasOneUse() && 12253 (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 || 12254 (Subtarget.hasLDBRX() && Subtarget.isPPC64() && 12255 N->getValueType(0) == MVT::i64))) { 12256 SDValue Load = N->getOperand(0); 12257 LoadSDNode *LD = cast<LoadSDNode>(Load); 12258 // Create the byte-swapping load. 12259 SDValue Ops[] = { 12260 LD->getChain(), // Chain 12261 LD->getBasePtr(), // Ptr 12262 DAG.getValueType(N->getValueType(0)) // VT 12263 }; 12264 SDValue BSLoad = 12265 DAG.getMemIntrinsicNode(PPCISD::LBRX, dl, 12266 DAG.getVTList(N->getValueType(0) == MVT::i64 ? 12267 MVT::i64 : MVT::i32, MVT::Other), 12268 Ops, LD->getMemoryVT(), LD->getMemOperand()); 12269 12270 // If this is an i16 load, insert the truncate. 12271 SDValue ResVal = BSLoad; 12272 if (N->getValueType(0) == MVT::i16) 12273 ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad); 12274 12275 // First, combine the bswap away. This makes the value produced by the 12276 // load dead. 12277 DCI.CombineTo(N, ResVal); 12278 12279 // Next, combine the load away, we give it a bogus result value but a real 12280 // chain result. The result value is dead because the bswap is dead. 12281 DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1)); 12282 12283 // Return N so it doesn't get rechecked! 
12284 return SDValue(N, 0); 12285 } 12286 break; 12287 case PPCISD::VCMP: 12288 // If a VCMPo node already exists with exactly the same operands as this 12289 // node, use its result instead of this node (VCMPo computes both a CR6 and 12290 // a normal output). 12291 // 12292 if (!N->getOperand(0).hasOneUse() && 12293 !N->getOperand(1).hasOneUse() && 12294 !N->getOperand(2).hasOneUse()) { 12295 12296 // Scan all of the users of the LHS, looking for VCMPo's that match. 12297 SDNode *VCMPoNode = nullptr; 12298 12299 SDNode *LHSN = N->getOperand(0).getNode(); 12300 for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end(); 12301 UI != E; ++UI) 12302 if (UI->getOpcode() == PPCISD::VCMPo && 12303 UI->getOperand(1) == N->getOperand(1) && 12304 UI->getOperand(2) == N->getOperand(2) && 12305 UI->getOperand(0) == N->getOperand(0)) { 12306 VCMPoNode = *UI; 12307 break; 12308 } 12309 12310 // If there is no VCMPo node, or if the flag value has a single use, don't 12311 // transform this. 12312 if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1)) 12313 break; 12314 12315 // Look at the (necessarily single) use of the flag value. If it has a 12316 // chain, this transformation is more complex. Note that multiple things 12317 // could use the value result, which we should ignore. 12318 SDNode *FlagUser = nullptr; 12319 for (SDNode::use_iterator UI = VCMPoNode->use_begin(); 12320 FlagUser == nullptr; ++UI) { 12321 assert(UI != VCMPoNode->use_end() && "Didn't find user!"); 12322 SDNode *User = *UI; 12323 for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) { 12324 if (User->getOperand(i) == SDValue(VCMPoNode, 1)) { 12325 FlagUser = User; 12326 break; 12327 } 12328 } 12329 } 12330 12331 // If the user is a MFOCRF instruction, we know this is safe. 12332 // Otherwise we give up for right now. 12333 if (FlagUser->getOpcode() == PPCISD::MFOCRF) 12334 return SDValue(VCMPoNode, 0); 12335 } 12336 break; 12337 case ISD::BRCOND: { 12338 SDValue Cond = N->getOperand(1); 12339 SDValue Target = N->getOperand(2); 12340 12341 if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN && 12342 cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() == 12343 Intrinsic::ppc_is_decremented_ctr_nonzero) { 12344 12345 // We now need to make the intrinsic dead (it cannot be instruction 12346 // selected). 12347 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0)); 12348 assert(Cond.getNode()->hasOneUse() && 12349 "Counter decrement has more than one use"); 12350 12351 return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other, 12352 N->getOperand(0), Target); 12353 } 12354 } 12355 break; 12356 case ISD::BR_CC: { 12357 // If this is a branch on an altivec predicate comparison, lower this so 12358 // that we don't have to do a MFOCRF: instead, branch directly on CR6. This 12359 // lowering is done pre-legalize, because the legalizer lowers the predicate 12360 // compare down to code that is difficult to reassemble. 12361 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); 12362 SDValue LHS = N->getOperand(2), RHS = N->getOperand(3); 12363 12364 // Sometimes the promoted value of the intrinsic is ANDed by some non-zero 12365 // value. If so, pass-through the AND to get to the intrinsic. 
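    // A sketch of the pattern this handles:
    //   t1: i32,ch = intrinsic_w_chain ..., ppc_is_decremented_ctr_nonzero
    //   t2: i32    = and t1, 1
    //   br_cc seteq/setne t2, 0, dest
    // where we want to look through t2 and treat it as t1 below.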
12366 if (LHS.getOpcode() == ISD::AND && 12367 LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN && 12368 cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() == 12369 Intrinsic::ppc_is_decremented_ctr_nonzero && 12370 isa<ConstantSDNode>(LHS.getOperand(1)) && 12371 !isNullConstant(LHS.getOperand(1))) 12372 LHS = LHS.getOperand(0); 12373 12374 if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && 12375 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() == 12376 Intrinsic::ppc_is_decremented_ctr_nonzero && 12377 isa<ConstantSDNode>(RHS)) { 12378 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && 12379 "Counter decrement comparison is not EQ or NE"); 12380 12381 unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue(); 12382 bool isBDNZ = (CC == ISD::SETEQ && Val) || 12383 (CC == ISD::SETNE && !Val); 12384 12385 // We now need to make the intrinsic dead (it cannot be instruction 12386 // selected). 12387 DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0)); 12388 assert(LHS.getNode()->hasOneUse() && 12389 "Counter decrement has more than one use"); 12390 12391 return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other, 12392 N->getOperand(0), N->getOperand(4)); 12393 } 12394 12395 int CompareOpc; 12396 bool isDot; 12397 12398 if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN && 12399 isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) && 12400 getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { 12401 assert(isDot && "Can't compare against a vector result!"); 12402 12403 // If this is a comparison against something other than 0/1, then we know 12404 // that the condition is never/always true. 12405 unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue(); 12406 if (Val != 0 && Val != 1) { 12407 if (CC == ISD::SETEQ) // Cond never true, remove branch. 12408 return N->getOperand(0); 12409 // Always !=, turn it into an unconditional branch. 12410 return DAG.getNode(ISD::BR, dl, MVT::Other, 12411 N->getOperand(0), N->getOperand(4)); 12412 } 12413 12414 bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0); 12415 12416 // Create the PPCISD altivec 'dot' comparison node. 12417 SDValue Ops[] = { 12418 LHS.getOperand(2), // LHS of compare 12419 LHS.getOperand(3), // RHS of compare 12420 DAG.getConstant(CompareOpc, dl, MVT::i32) 12421 }; 12422 EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue }; 12423 SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); 12424 12425 // Unpack the result based on how the target uses it. 12426 PPC::Predicate CompOpc; 12427 switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) { 12428 default: // Can't happen, don't crash on invalid number though. 12429 case 0: // Branch on the value of the EQ bit of CR6. 12430 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE; 12431 break; 12432 case 1: // Branch on the inverted value of the EQ bit of CR6. 12433 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ; 12434 break; 12435 case 2: // Branch on the value of the LT bit of CR6. 12436 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE; 12437 break; 12438 case 3: // Branch on the inverted value of the LT bit of CR6. 12439 CompOpc = BranchOnWhenPredTrue ? 
PPC::PRED_GE : PPC::PRED_LT; 12440 break; 12441 } 12442 12443 return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0), 12444 DAG.getConstant(CompOpc, dl, MVT::i32), 12445 DAG.getRegister(PPC::CR6, MVT::i32), 12446 N->getOperand(4), CompNode.getValue(1)); 12447 } 12448 break; 12449 } 12450 case ISD::BUILD_VECTOR: 12451 return DAGCombineBuildVector(N, DCI); 12452 } 12453 12454 return SDValue(); 12455 } 12456 12457 SDValue 12458 PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 12459 SelectionDAG &DAG, 12460 std::vector<SDNode *> *Created) const { 12461 // fold (sdiv X, pow2) 12462 EVT VT = N->getValueType(0); 12463 if (VT == MVT::i64 && !Subtarget.isPPC64()) 12464 return SDValue(); 12465 if ((VT != MVT::i32 && VT != MVT::i64) || 12466 !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) 12467 return SDValue(); 12468 12469 SDLoc DL(N); 12470 SDValue N0 = N->getOperand(0); 12471 12472 bool IsNegPow2 = (-Divisor).isPowerOf2(); 12473 unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros(); 12474 SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT); 12475 12476 SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt); 12477 if (Created) 12478 Created->push_back(Op.getNode()); 12479 12480 if (IsNegPow2) { 12481 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op); 12482 if (Created) 12483 Created->push_back(Op.getNode()); 12484 } 12485 12486 return Op; 12487 } 12488 12489 //===----------------------------------------------------------------------===// 12490 // Inline Assembly Support 12491 //===----------------------------------------------------------------------===// 12492 12493 void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 12494 KnownBits &Known, 12495 const APInt &DemandedElts, 12496 const SelectionDAG &DAG, 12497 unsigned Depth) const { 12498 Known.resetAll(); 12499 switch (Op.getOpcode()) { 12500 default: break; 12501 case PPCISD::LBRX: { 12502 // lhbrx is known to have the top bits cleared out. 12503 if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16) 12504 Known.Zero = 0xFFFF0000; 12505 break; 12506 } 12507 case ISD::INTRINSIC_WO_CHAIN: { 12508 switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { 12509 default: break; 12510 case Intrinsic::ppc_altivec_vcmpbfp_p: 12511 case Intrinsic::ppc_altivec_vcmpeqfp_p: 12512 case Intrinsic::ppc_altivec_vcmpequb_p: 12513 case Intrinsic::ppc_altivec_vcmpequh_p: 12514 case Intrinsic::ppc_altivec_vcmpequw_p: 12515 case Intrinsic::ppc_altivec_vcmpequd_p: 12516 case Intrinsic::ppc_altivec_vcmpgefp_p: 12517 case Intrinsic::ppc_altivec_vcmpgtfp_p: 12518 case Intrinsic::ppc_altivec_vcmpgtsb_p: 12519 case Intrinsic::ppc_altivec_vcmpgtsh_p: 12520 case Intrinsic::ppc_altivec_vcmpgtsw_p: 12521 case Intrinsic::ppc_altivec_vcmpgtsd_p: 12522 case Intrinsic::ppc_altivec_vcmpgtub_p: 12523 case Intrinsic::ppc_altivec_vcmpgtuh_p: 12524 case Intrinsic::ppc_altivec_vcmpgtuw_p: 12525 case Intrinsic::ppc_altivec_vcmpgtud_p: 12526 Known.Zero = ~1U; // All bits but the low one are known to be zero. 
12527 break; 12528 } 12529 } 12530 } 12531 } 12532 12533 unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { 12534 switch (Subtarget.getDarwinDirective()) { 12535 default: break; 12536 case PPC::DIR_970: 12537 case PPC::DIR_PWR4: 12538 case PPC::DIR_PWR5: 12539 case PPC::DIR_PWR5X: 12540 case PPC::DIR_PWR6: 12541 case PPC::DIR_PWR6X: 12542 case PPC::DIR_PWR7: 12543 case PPC::DIR_PWR8: 12544 case PPC::DIR_PWR9: { 12545 if (!ML) 12546 break; 12547 12548 const PPCInstrInfo *TII = Subtarget.getInstrInfo(); 12549 12550 // For small loops (between 5 and 8 instructions), align to a 32-byte 12551 // boundary so that the entire loop fits in one instruction-cache line. 12552 uint64_t LoopSize = 0; 12553 for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I) 12554 for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) { 12555 LoopSize += TII->getInstSizeInBytes(*J); 12556 if (LoopSize > 32) 12557 break; 12558 } 12559 12560 if (LoopSize > 16 && LoopSize <= 32) 12561 return 5; 12562 12563 break; 12564 } 12565 } 12566 12567 return TargetLowering::getPrefLoopAlignment(ML); 12568 } 12569 12570 /// getConstraintType - Given a constraint, return the type of 12571 /// constraint it is for this target. 12572 PPCTargetLowering::ConstraintType 12573 PPCTargetLowering::getConstraintType(StringRef Constraint) const { 12574 if (Constraint.size() == 1) { 12575 switch (Constraint[0]) { 12576 default: break; 12577 case 'b': 12578 case 'r': 12579 case 'f': 12580 case 'd': 12581 case 'v': 12582 case 'y': 12583 return C_RegisterClass; 12584 case 'Z': 12585 // FIXME: While Z does indicate a memory constraint, it specifically 12586 // indicates an r+r address (used in conjunction with the 'y' modifier 12587 // in the replacement string). Currently, we're forcing the base 12588 // register to be r0 in the asm printer (which is interpreted as zero) 12589 // and forming the complete address in the second register. This is 12590 // suboptimal. 12591 return C_Memory; 12592 } 12593 } else if (Constraint == "wc") { // individual CR bits. 12594 return C_RegisterClass; 12595 } else if (Constraint == "wa" || Constraint == "wd" || 12596 Constraint == "wf" || Constraint == "ws") { 12597 return C_RegisterClass; // VSX registers. 12598 } 12599 return TargetLowering::getConstraintType(Constraint); 12600 } 12601 12602 /// Examine constraint type and operand type and determine a weight value. 12603 /// This object must already have been set up with the operand type 12604 /// and the current alternative constraint selected. 12605 TargetLowering::ConstraintWeight 12606 PPCTargetLowering::getSingleConstraintMatchWeight( 12607 AsmOperandInfo &info, const char *constraint) const { 12608 ConstraintWeight weight = CW_Invalid; 12609 Value *CallOperandVal = info.CallOperandVal; 12610 // If we don't have a value, we can't do a match, 12611 // but allow it at the lowest weight. 12612 if (!CallOperandVal) 12613 return CW_Default; 12614 Type *type = CallOperandVal->getType(); 12615 12616 // Look at the constraint type. 12617 if (StringRef(constraint) == "wc" && type->isIntegerTy(1)) 12618 return CW_Register; // an individual CR bit. 
12619 else if ((StringRef(constraint) == "wa" || 12620 StringRef(constraint) == "wd" || 12621 StringRef(constraint) == "wf") && 12622 type->isVectorTy()) 12623 return CW_Register; 12624 else if (StringRef(constraint) == "ws" && type->isDoubleTy()) 12625 return CW_Register; 12626 12627 switch (*constraint) { 12628 default: 12629 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 12630 break; 12631 case 'b': 12632 if (type->isIntegerTy()) 12633 weight = CW_Register; 12634 break; 12635 case 'f': 12636 if (type->isFloatTy()) 12637 weight = CW_Register; 12638 break; 12639 case 'd': 12640 if (type->isDoubleTy()) 12641 weight = CW_Register; 12642 break; 12643 case 'v': 12644 if (type->isVectorTy()) 12645 weight = CW_Register; 12646 break; 12647 case 'y': 12648 weight = CW_Register; 12649 break; 12650 case 'Z': 12651 weight = CW_Memory; 12652 break; 12653 } 12654 return weight; 12655 } 12656 12657 std::pair<unsigned, const TargetRegisterClass *> 12658 PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 12659 StringRef Constraint, 12660 MVT VT) const { 12661 if (Constraint.size() == 1) { 12662 // GCC RS6000 Constraint Letters 12663 switch (Constraint[0]) { 12664 case 'b': // R1-R31 12665 if (VT == MVT::i64 && Subtarget.isPPC64()) 12666 return std::make_pair(0U, &PPC::G8RC_NOX0RegClass); 12667 return std::make_pair(0U, &PPC::GPRC_NOR0RegClass); 12668 case 'r': // R0-R31 12669 if (VT == MVT::i64 && Subtarget.isPPC64()) 12670 return std::make_pair(0U, &PPC::G8RCRegClass); 12671 return std::make_pair(0U, &PPC::GPRCRegClass); 12672 // 'd' and 'f' constraints are both defined to be "the floating point 12673 // registers", where one is for 32-bit and the other for 64-bit. We don't 12674 // really care overly much here so just give them all the same reg classes. 12675 case 'd': 12676 case 'f': 12677 if (VT == MVT::f32 || VT == MVT::i32) 12678 return std::make_pair(0U, &PPC::F4RCRegClass); 12679 if (VT == MVT::f64 || VT == MVT::i64) 12680 return std::make_pair(0U, &PPC::F8RCRegClass); 12681 if (VT == MVT::v4f64 && Subtarget.hasQPX()) 12682 return std::make_pair(0U, &PPC::QFRCRegClass); 12683 if (VT == MVT::v4f32 && Subtarget.hasQPX()) 12684 return std::make_pair(0U, &PPC::QSRCRegClass); 12685 break; 12686 case 'v': 12687 if (VT == MVT::v4f64 && Subtarget.hasQPX()) 12688 return std::make_pair(0U, &PPC::QFRCRegClass); 12689 if (VT == MVT::v4f32 && Subtarget.hasQPX()) 12690 return std::make_pair(0U, &PPC::QSRCRegClass); 12691 if (Subtarget.hasAltivec()) 12692 return std::make_pair(0U, &PPC::VRRCRegClass); 12693 case 'y': // crrc 12694 return std::make_pair(0U, &PPC::CRRCRegClass); 12695 } 12696 } else if (Constraint == "wc" && Subtarget.useCRBits()) { 12697 // An individual CR bit. 12698 return std::make_pair(0U, &PPC::CRBITRCRegClass); 12699 } else if ((Constraint == "wa" || Constraint == "wd" || 12700 Constraint == "wf") && Subtarget.hasVSX()) { 12701 return std::make_pair(0U, &PPC::VSRCRegClass); 12702 } else if (Constraint == "ws" && Subtarget.hasVSX()) { 12703 if (VT == MVT::f32 && Subtarget.hasP8Vector()) 12704 return std::make_pair(0U, &PPC::VSSRCRegClass); 12705 else 12706 return std::make_pair(0U, &PPC::VSFRCRegClass); 12707 } 12708 12709 std::pair<unsigned, const TargetRegisterClass *> R = 12710 TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 12711 12712 // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers 12713 // (which we call X[0-9]+). 
If a 64-bit value has been requested, and a 12714 // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent 12715 // register. 12716 // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use 12717 // the AsmName field from *RegisterInfo.td, then this would not be necessary. 12718 if (R.first && VT == MVT::i64 && Subtarget.isPPC64() && 12719 PPC::GPRCRegClass.contains(R.first)) 12720 return std::make_pair(TRI->getMatchingSuperReg(R.first, 12721 PPC::sub_32, &PPC::G8RCRegClass), 12722 &PPC::G8RCRegClass); 12723 12724 // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same. 12725 if (!R.second && StringRef("{cc}").equals_lower(Constraint)) { 12726 R.first = PPC::CR0; 12727 R.second = &PPC::CRRCRegClass; 12728 } 12729 12730 return R; 12731 } 12732 12733 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 12734 /// vector. If it is invalid, don't add anything to Ops. 12735 void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 12736 std::string &Constraint, 12737 std::vector<SDValue>&Ops, 12738 SelectionDAG &DAG) const { 12739 SDValue Result; 12740 12741 // Only support length 1 constraints. 12742 if (Constraint.length() > 1) return; 12743 12744 char Letter = Constraint[0]; 12745 switch (Letter) { 12746 default: break; 12747 case 'I': 12748 case 'J': 12749 case 'K': 12750 case 'L': 12751 case 'M': 12752 case 'N': 12753 case 'O': 12754 case 'P': { 12755 ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op); 12756 if (!CST) return; // Must be an immediate to match. 12757 SDLoc dl(Op); 12758 int64_t Value = CST->getSExtValue(); 12759 EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative 12760 // numbers are printed as such. 12761 switch (Letter) { 12762 default: llvm_unreachable("Unknown constraint letter!"); 12763 case 'I': // "I" is a signed 16-bit constant. 12764 if (isInt<16>(Value)) 12765 Result = DAG.getTargetConstant(Value, dl, TCVT); 12766 break; 12767 case 'J': // "J" is a constant with only the high-order 16 bits nonzero. 12768 if (isShiftedUInt<16, 16>(Value)) 12769 Result = DAG.getTargetConstant(Value, dl, TCVT); 12770 break; 12771 case 'L': // "L" is a signed 16-bit constant shifted left 16 bits. 12772 if (isShiftedInt<16, 16>(Value)) 12773 Result = DAG.getTargetConstant(Value, dl, TCVT); 12774 break; 12775 case 'K': // "K" is a constant with only the low-order 16 bits nonzero. 12776 if (isUInt<16>(Value)) 12777 Result = DAG.getTargetConstant(Value, dl, TCVT); 12778 break; 12779 case 'M': // "M" is a constant that is greater than 31. 12780 if (Value > 31) 12781 Result = DAG.getTargetConstant(Value, dl, TCVT); 12782 break; 12783 case 'N': // "N" is a positive constant that is an exact power of two. 12784 if (Value > 0 && isPowerOf2_64(Value)) 12785 Result = DAG.getTargetConstant(Value, dl, TCVT); 12786 break; 12787 case 'O': // "O" is the constant zero. 12788 if (Value == 0) 12789 Result = DAG.getTargetConstant(Value, dl, TCVT); 12790 break; 12791 case 'P': // "P" is a constant whose negation is a signed 16-bit constant. 12792 if (isInt<16>(-Value)) 12793 Result = DAG.getTargetConstant(Value, dl, TCVT); 12794 break; 12795 } 12796 break; 12797 } 12798 } 12799 12800 if (Result.getNode()) { 12801 Ops.push_back(Result); 12802 return; 12803 } 12804 12805 // Handle standard constraint letters. 
12806 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 12807 } 12808 12809 // isLegalAddressingMode - Return true if the addressing mode represented 12810 // by AM is legal for this target, for a load/store of the specified type. 12811 bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL, 12812 const AddrMode &AM, Type *Ty, 12813 unsigned AS, Instruction *I) const { 12814 // PPC does not allow r+i addressing modes for vectors! 12815 if (Ty->isVectorTy() && AM.BaseOffs != 0) 12816 return false; 12817 12818 // PPC allows a sign-extended 16-bit immediate field. 12819 if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1) 12820 return false; 12821 12822 // No global is ever allowed as a base. 12823 if (AM.BaseGV) 12824 return false; 12825 12826 // PPC only support r+r, 12827 switch (AM.Scale) { 12828 case 0: // "r+i" or just "i", depending on HasBaseReg. 12829 break; 12830 case 1: 12831 if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed. 12832 return false; 12833 // Otherwise we have r+r or r+i. 12834 break; 12835 case 2: 12836 if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed. 12837 return false; 12838 // Allow 2*r as r+r. 12839 break; 12840 default: 12841 // No other scales are supported. 12842 return false; 12843 } 12844 12845 return true; 12846 } 12847 12848 SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, 12849 SelectionDAG &DAG) const { 12850 MachineFunction &MF = DAG.getMachineFunction(); 12851 MachineFrameInfo &MFI = MF.getFrameInfo(); 12852 MFI.setReturnAddressIsTaken(true); 12853 12854 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 12855 return SDValue(); 12856 12857 SDLoc dl(Op); 12858 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 12859 12860 // Make sure the function does not optimize away the store of the RA to 12861 // the stack. 12862 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 12863 FuncInfo->setLRStoreRequired(); 12864 bool isPPC64 = Subtarget.isPPC64(); 12865 auto PtrVT = getPointerTy(MF.getDataLayout()); 12866 12867 if (Depth > 0) { 12868 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 12869 SDValue Offset = 12870 DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl, 12871 isPPC64 ? MVT::i64 : MVT::i32); 12872 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 12873 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), 12874 MachinePointerInfo()); 12875 } 12876 12877 // Just load the return address off the stack. 12878 SDValue RetAddrFI = getReturnAddrFrameIndex(DAG); 12879 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, 12880 MachinePointerInfo()); 12881 } 12882 12883 SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, 12884 SelectionDAG &DAG) const { 12885 SDLoc dl(Op); 12886 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 12887 12888 MachineFunction &MF = DAG.getMachineFunction(); 12889 MachineFrameInfo &MFI = MF.getFrameInfo(); 12890 MFI.setFrameAddressIsTaken(true); 12891 12892 EVT PtrVT = getPointerTy(MF.getDataLayout()); 12893 bool isPPC64 = PtrVT == MVT::i64; 12894 12895 // Naked functions never have a frame pointer, and so we use r1. For all 12896 // other functions, this decision must be delayed until during PEI. 12897 unsigned FrameReg; 12898 if (MF.getFunction()->hasFnAttribute(Attribute::Naked)) 12899 FrameReg = isPPC64 ? PPC::X1 : PPC::R1; 12900 else 12901 FrameReg = isPPC64 ? 
PPC::FP8 : PPC::FP; 12902 12903 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, 12904 PtrVT); 12905 while (Depth--) 12906 FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(), 12907 FrameAddr, MachinePointerInfo()); 12908 return FrameAddr; 12909 } 12910 12911 // FIXME? Maybe this could be a TableGen attribute on some registers and 12912 // this table could be generated automatically from RegInfo. 12913 unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, 12914 SelectionDAG &DAG) const { 12915 bool isPPC64 = Subtarget.isPPC64(); 12916 bool isDarwinABI = Subtarget.isDarwinABI(); 12917 12918 if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) || 12919 (!isPPC64 && VT != MVT::i32)) 12920 report_fatal_error("Invalid register global variable type"); 12921 12922 bool is64Bit = isPPC64 && VT == MVT::i64; 12923 unsigned Reg = StringSwitch<unsigned>(RegName) 12924 .Case("r1", is64Bit ? PPC::X1 : PPC::R1) 12925 .Case("r2", (isDarwinABI || isPPC64) ? 0 : PPC::R2) 12926 .Case("r13", (!isPPC64 && isDarwinABI) ? 0 : 12927 (is64Bit ? PPC::X13 : PPC::R13)) 12928 .Default(0); 12929 12930 if (Reg) 12931 return Reg; 12932 report_fatal_error("Invalid register name global variable"); 12933 } 12934 12935 bool 12936 PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 12937 // The PowerPC target isn't yet aware of offsets. 12938 return false; 12939 } 12940 12941 bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 12942 const CallInst &I, 12943 unsigned Intrinsic) const { 12944 switch (Intrinsic) { 12945 case Intrinsic::ppc_qpx_qvlfd: 12946 case Intrinsic::ppc_qpx_qvlfs: 12947 case Intrinsic::ppc_qpx_qvlfcd: 12948 case Intrinsic::ppc_qpx_qvlfcs: 12949 case Intrinsic::ppc_qpx_qvlfiwa: 12950 case Intrinsic::ppc_qpx_qvlfiwz: 12951 case Intrinsic::ppc_altivec_lvx: 12952 case Intrinsic::ppc_altivec_lvxl: 12953 case Intrinsic::ppc_altivec_lvebx: 12954 case Intrinsic::ppc_altivec_lvehx: 12955 case Intrinsic::ppc_altivec_lvewx: 12956 case Intrinsic::ppc_vsx_lxvd2x: 12957 case Intrinsic::ppc_vsx_lxvw4x: { 12958 EVT VT; 12959 switch (Intrinsic) { 12960 case Intrinsic::ppc_altivec_lvebx: 12961 VT = MVT::i8; 12962 break; 12963 case Intrinsic::ppc_altivec_lvehx: 12964 VT = MVT::i16; 12965 break; 12966 case Intrinsic::ppc_altivec_lvewx: 12967 VT = MVT::i32; 12968 break; 12969 case Intrinsic::ppc_vsx_lxvd2x: 12970 VT = MVT::v2f64; 12971 break; 12972 case Intrinsic::ppc_qpx_qvlfd: 12973 VT = MVT::v4f64; 12974 break; 12975 case Intrinsic::ppc_qpx_qvlfs: 12976 VT = MVT::v4f32; 12977 break; 12978 case Intrinsic::ppc_qpx_qvlfcd: 12979 VT = MVT::v2f64; 12980 break; 12981 case Intrinsic::ppc_qpx_qvlfcs: 12982 VT = MVT::v2f32; 12983 break; 12984 default: 12985 VT = MVT::v4i32; 12986 break; 12987 } 12988 12989 Info.opc = ISD::INTRINSIC_W_CHAIN; 12990 Info.memVT = VT; 12991 Info.ptrVal = I.getArgOperand(0); 12992 Info.offset = -VT.getStoreSize()+1; 12993 Info.size = 2*VT.getStoreSize()-1; 12994 Info.align = 1; 12995 Info.vol = false; 12996 Info.readMem = true; 12997 Info.writeMem = false; 12998 return true; 12999 } 13000 case Intrinsic::ppc_qpx_qvlfda: 13001 case Intrinsic::ppc_qpx_qvlfsa: 13002 case Intrinsic::ppc_qpx_qvlfcda: 13003 case Intrinsic::ppc_qpx_qvlfcsa: 13004 case Intrinsic::ppc_qpx_qvlfiwaa: 13005 case Intrinsic::ppc_qpx_qvlfiwza: { 13006 EVT VT; 13007 switch (Intrinsic) { 13008 case Intrinsic::ppc_qpx_qvlfda: 13009 VT = MVT::v4f64; 13010 break; 13011 case Intrinsic::ppc_qpx_qvlfsa: 13012 VT = MVT::v4f32; 13013 break; 13014 case 
Intrinsic::ppc_qpx_qvlfcda: 13015 VT = MVT::v2f64; 13016 break; 13017 case Intrinsic::ppc_qpx_qvlfcsa: 13018 VT = MVT::v2f32; 13019 break; 13020 default: 13021 VT = MVT::v4i32; 13022 break; 13023 } 13024 13025 Info.opc = ISD::INTRINSIC_W_CHAIN; 13026 Info.memVT = VT; 13027 Info.ptrVal = I.getArgOperand(0); 13028 Info.offset = 0; 13029 Info.size = VT.getStoreSize(); 13030 Info.align = 1; 13031 Info.vol = false; 13032 Info.readMem = true; 13033 Info.writeMem = false; 13034 return true; 13035 } 13036 case Intrinsic::ppc_qpx_qvstfd: 13037 case Intrinsic::ppc_qpx_qvstfs: 13038 case Intrinsic::ppc_qpx_qvstfcd: 13039 case Intrinsic::ppc_qpx_qvstfcs: 13040 case Intrinsic::ppc_qpx_qvstfiw: 13041 case Intrinsic::ppc_altivec_stvx: 13042 case Intrinsic::ppc_altivec_stvxl: 13043 case Intrinsic::ppc_altivec_stvebx: 13044 case Intrinsic::ppc_altivec_stvehx: 13045 case Intrinsic::ppc_altivec_stvewx: 13046 case Intrinsic::ppc_vsx_stxvd2x: 13047 case Intrinsic::ppc_vsx_stxvw4x: { 13048 EVT VT; 13049 switch (Intrinsic) { 13050 case Intrinsic::ppc_altivec_stvebx: 13051 VT = MVT::i8; 13052 break; 13053 case Intrinsic::ppc_altivec_stvehx: 13054 VT = MVT::i16; 13055 break; 13056 case Intrinsic::ppc_altivec_stvewx: 13057 VT = MVT::i32; 13058 break; 13059 case Intrinsic::ppc_vsx_stxvd2x: 13060 VT = MVT::v2f64; 13061 break; 13062 case Intrinsic::ppc_qpx_qvstfd: 13063 VT = MVT::v4f64; 13064 break; 13065 case Intrinsic::ppc_qpx_qvstfs: 13066 VT = MVT::v4f32; 13067 break; 13068 case Intrinsic::ppc_qpx_qvstfcd: 13069 VT = MVT::v2f64; 13070 break; 13071 case Intrinsic::ppc_qpx_qvstfcs: 13072 VT = MVT::v2f32; 13073 break; 13074 default: 13075 VT = MVT::v4i32; 13076 break; 13077 } 13078 13079 Info.opc = ISD::INTRINSIC_VOID; 13080 Info.memVT = VT; 13081 Info.ptrVal = I.getArgOperand(1); 13082 Info.offset = -VT.getStoreSize()+1; 13083 Info.size = 2*VT.getStoreSize()-1; 13084 Info.align = 1; 13085 Info.vol = false; 13086 Info.readMem = false; 13087 Info.writeMem = true; 13088 return true; 13089 } 13090 case Intrinsic::ppc_qpx_qvstfda: 13091 case Intrinsic::ppc_qpx_qvstfsa: 13092 case Intrinsic::ppc_qpx_qvstfcda: 13093 case Intrinsic::ppc_qpx_qvstfcsa: 13094 case Intrinsic::ppc_qpx_qvstfiwa: { 13095 EVT VT; 13096 switch (Intrinsic) { 13097 case Intrinsic::ppc_qpx_qvstfda: 13098 VT = MVT::v4f64; 13099 break; 13100 case Intrinsic::ppc_qpx_qvstfsa: 13101 VT = MVT::v4f32; 13102 break; 13103 case Intrinsic::ppc_qpx_qvstfcda: 13104 VT = MVT::v2f64; 13105 break; 13106 case Intrinsic::ppc_qpx_qvstfcsa: 13107 VT = MVT::v2f32; 13108 break; 13109 default: 13110 VT = MVT::v4i32; 13111 break; 13112 } 13113 13114 Info.opc = ISD::INTRINSIC_VOID; 13115 Info.memVT = VT; 13116 Info.ptrVal = I.getArgOperand(1); 13117 Info.offset = 0; 13118 Info.size = VT.getStoreSize(); 13119 Info.align = 1; 13120 Info.vol = false; 13121 Info.readMem = false; 13122 Info.writeMem = true; 13123 return true; 13124 } 13125 default: 13126 break; 13127 } 13128 13129 return false; 13130 } 13131 13132 /// getOptimalMemOpType - Returns the target specific optimal type for load 13133 /// and store operations as a result of memset, memcpy, and memmove 13134 /// lowering. If DstAlign is zero that means it's safe to destination 13135 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it 13136 /// means there isn't a need to check it against alignment requirement, 13137 /// probably because the source does not need to be loaded. If 'IsMemset' is 13138 /// true, that means it's expanding a memset. 
If 'ZeroMemset' is true, that 13139 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy 13140 /// source is constant so it does not need to be loaded. 13141 /// It returns EVT::Other if the type should be determined using generic 13142 /// target-independent logic. 13143 EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, 13144 unsigned DstAlign, unsigned SrcAlign, 13145 bool IsMemset, bool ZeroMemset, 13146 bool MemcpyStrSrc, 13147 MachineFunction &MF) const { 13148 if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { 13149 const Function *F = MF.getFunction(); 13150 // When expanding a memset, require at least two QPX instructions to cover 13151 // the cost of loading the value to be stored from the constant pool. 13152 if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) && 13153 (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) && 13154 !F->hasFnAttribute(Attribute::NoImplicitFloat)) { 13155 return MVT::v4f64; 13156 } 13157 13158 // We should use Altivec/VSX loads and stores when available. For unaligned 13159 // addresses, unaligned VSX loads are only fast starting with the P8. 13160 if (Subtarget.hasAltivec() && Size >= 16 && 13161 (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) || 13162 ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector()))) 13163 return MVT::v4i32; 13164 } 13165 13166 if (Subtarget.isPPC64()) { 13167 return MVT::i64; 13168 } 13169 13170 return MVT::i32; 13171 } 13172 13173 /// \brief Returns true if it is beneficial to convert a load of a constant 13174 /// to just the constant itself. 13175 bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 13176 Type *Ty) const { 13177 assert(Ty->isIntegerTy()); 13178 13179 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 13180 return !(BitSize == 0 || BitSize > 64); 13181 } 13182 13183 bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 13184 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 13185 return false; 13186 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 13187 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 13188 return NumBits1 == 64 && NumBits2 == 32; 13189 } 13190 13191 bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 13192 if (!VT1.isInteger() || !VT2.isInteger()) 13193 return false; 13194 unsigned NumBits1 = VT1.getSizeInBits(); 13195 unsigned NumBits2 = VT2.getSizeInBits(); 13196 return NumBits1 == 64 && NumBits2 == 32; 13197 } 13198 13199 bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 13200 // Generally speaking, zexts are not free, but they are free when they can be 13201 // folded with other operations. 13202 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) { 13203 EVT MemVT = LD->getMemoryVT(); 13204 if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 || 13205 (Subtarget.isPPC64() && MemVT == MVT::i32)) && 13206 (LD->getExtensionType() == ISD::NON_EXTLOAD || 13207 LD->getExtensionType() == ISD::ZEXTLOAD)) 13208 return true; 13209 } 13210 13211 // FIXME: Add other cases... 13212 // - 32-bit shifts with a zext to i64 13213 // - zext after ctlz, bswap, etc. 
13214 // - zext after and by a constant mask 13215 13216 return TargetLowering::isZExtFree(Val, VT2); 13217 } 13218 13219 bool PPCTargetLowering::isFPExtFree(EVT VT) const { 13220 assert(VT.isFloatingPoint()); 13221 return true; 13222 } 13223 13224 bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const { 13225 return isInt<16>(Imm) || isUInt<16>(Imm); 13226 } 13227 13228 bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const { 13229 return isInt<16>(Imm) || isUInt<16>(Imm); 13230 } 13231 13232 bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 13233 unsigned, 13234 unsigned, 13235 bool *Fast) const { 13236 if (DisablePPCUnaligned) 13237 return false; 13238 13239 // PowerPC supports unaligned memory access for simple non-vector types. 13240 // Although accessing unaligned addresses is not as efficient as accessing 13241 // aligned addresses, it is generally more efficient than manual expansion, 13242 // and generally only traps for software emulation when crossing page 13243 // boundaries. 13244 13245 if (!VT.isSimple()) 13246 return false; 13247 13248 if (VT.getSimpleVT().isVector()) { 13249 if (Subtarget.hasVSX()) { 13250 if (VT != MVT::v2f64 && VT != MVT::v2i64 && 13251 VT != MVT::v4f32 && VT != MVT::v4i32) 13252 return false; 13253 } else { 13254 return false; 13255 } 13256 } 13257 13258 if (VT == MVT::ppcf128) 13259 return false; 13260 13261 if (Fast) 13262 *Fast = true; 13263 13264 return true; 13265 } 13266 13267 bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 13268 VT = VT.getScalarType(); 13269 13270 if (!VT.isSimple()) 13271 return false; 13272 13273 switch (VT.getSimpleVT().SimpleTy) { 13274 case MVT::f32: 13275 case MVT::f64: 13276 return true; 13277 default: 13278 break; 13279 } 13280 13281 return false; 13282 } 13283 13284 const MCPhysReg * 13285 PPCTargetLowering::getScratchRegisters(CallingConv::ID) const { 13286 // LR is a callee-save register, but we must treat it as clobbered by any call 13287 // site. Hence we include LR in the scratch registers, which are in turn added 13288 // as implicit-defs for stackmaps and patchpoints. The same reasoning applies 13289 // to CTR, which is used by any indirect call. 13290 static const MCPhysReg ScratchRegs[] = { 13291 PPC::X12, PPC::LR8, PPC::CTR8, 0 13292 }; 13293 13294 return ScratchRegs; 13295 } 13296 13297 unsigned PPCTargetLowering::getExceptionPointerRegister( 13298 const Constant *PersonalityFn) const { 13299 return Subtarget.isPPC64() ? PPC::X3 : PPC::R3; 13300 } 13301 13302 unsigned PPCTargetLowering::getExceptionSelectorRegister( 13303 const Constant *PersonalityFn) const { 13304 return Subtarget.isPPC64() ? PPC::X4 : PPC::R4; 13305 } 13306 13307 bool 13308 PPCTargetLowering::shouldExpandBuildVectorWithShuffles( 13309 EVT VT , unsigned DefinedValues) const { 13310 if (VT == MVT::v2i64) 13311 return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves 13312 13313 if (Subtarget.hasVSX() || Subtarget.hasQPX()) 13314 return true; 13315 13316 return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); 13317 } 13318 13319 Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const { 13320 if (DisableILPPref || Subtarget.enableMachineScheduler()) 13321 return TargetLowering::getSchedulingPreference(N); 13322 13323 return Sched::ILP; 13324 } 13325 13326 // Create a fast isel object. 
13327 FastISel * 13328 PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo, 13329 const TargetLibraryInfo *LibInfo) const { 13330 return PPC::createFastISel(FuncInfo, LibInfo); 13331 } 13332 13333 void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { 13334 if (Subtarget.isDarwinABI()) return; 13335 if (!Subtarget.isPPC64()) return; 13336 13337 // Update IsSplitCSR in PPCFunctionInfo 13338 PPCFunctionInfo *PFI = Entry->getParent()->getInfo<PPCFunctionInfo>(); 13339 PFI->setIsSplitCSR(true); 13340 } 13341 13342 void PPCTargetLowering::insertCopiesSplitCSR( 13343 MachineBasicBlock *Entry, 13344 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 13345 const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); 13346 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 13347 if (!IStart) 13348 return; 13349 13350 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 13351 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 13352 MachineBasicBlock::iterator MBBI = Entry->begin(); 13353 for (const MCPhysReg *I = IStart; *I; ++I) { 13354 const TargetRegisterClass *RC = nullptr; 13355 if (PPC::G8RCRegClass.contains(*I)) 13356 RC = &PPC::G8RCRegClass; 13357 else if (PPC::F8RCRegClass.contains(*I)) 13358 RC = &PPC::F8RCRegClass; 13359 else if (PPC::CRRCRegClass.contains(*I)) 13360 RC = &PPC::CRRCRegClass; 13361 else if (PPC::VRRCRegClass.contains(*I)) 13362 RC = &PPC::VRRCRegClass; 13363 else 13364 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 13365 13366 unsigned NewVR = MRI->createVirtualRegister(RC); 13367 // Create copy from CSR to a virtual register. 13368 // FIXME: this currently does not emit CFI pseudo-instructions, it works 13369 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be 13370 // nounwind. If we want to generalize this later, we may need to emit 13371 // CFI pseudo-instructions. 13372 assert(Entry->getParent()->getFunction()->hasFnAttribute( 13373 Attribute::NoUnwind) && 13374 "Function should be nounwind in insertCopiesSplitCSR!"); 13375 Entry->addLiveIn(*I); 13376 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 13377 .addReg(*I); 13378 13379 // Insert the copy-back instructions right before the terminator 13380 for (auto *Exit : Exits) 13381 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 13382 TII->get(TargetOpcode::COPY), *I) 13383 .addReg(NewVR); 13384 } 13385 } 13386 13387 // Override to enable LOAD_STACK_GUARD lowering on Linux. 13388 bool PPCTargetLowering::useLoadStackGuardNode() const { 13389 if (!Subtarget.isTargetLinux()) 13390 return TargetLowering::useLoadStackGuardNode(); 13391 return true; 13392 } 13393 13394 // Override to disable global variable loading on Linux. 13395 void PPCTargetLowering::insertSSPDeclarations(Module &M) const { 13396 if (!Subtarget.isTargetLinux()) 13397 return TargetLowering::insertSSPDeclarations(M); 13398 } 13399 13400 bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 13401 if (!VT.isSimple() || !Subtarget.hasVSX()) 13402 return false; 13403 13404 switch(VT.getSimpleVT().SimpleTy) { 13405 default: 13406 // For FP types that are currently not supported by PPC backend, return 13407 // false. Examples: f16, f80. 
13408 return false; 13409 case MVT::f32: 13410 case MVT::f64: 13411 case MVT::ppcf128: 13412 return Imm.isPosZero(); 13413 } 13414 } 13415 13416 // For vector shift operation op, fold 13417 // (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y) 13418 static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N, 13419 SelectionDAG &DAG) { 13420 SDValue N0 = N->getOperand(0); 13421 SDValue N1 = N->getOperand(1); 13422 EVT VT = N0.getValueType(); 13423 unsigned OpSizeInBits = VT.getScalarSizeInBits(); 13424 unsigned Opcode = N->getOpcode(); 13425 unsigned TargetOpcode; 13426 13427 switch (Opcode) { 13428 default: 13429 llvm_unreachable("Unexpected shift operation"); 13430 case ISD::SHL: 13431 TargetOpcode = PPCISD::SHL; 13432 break; 13433 case ISD::SRL: 13434 TargetOpcode = PPCISD::SRL; 13435 break; 13436 case ISD::SRA: 13437 TargetOpcode = PPCISD::SRA; 13438 break; 13439 } 13440 13441 if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) && 13442 N1->getOpcode() == ISD::AND) 13443 if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) 13444 if (Mask->getZExtValue() == OpSizeInBits - 1) 13445 return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0)); 13446 13447 return SDValue(); 13448 } 13449 13450 SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const { 13451 if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) 13452 return Value; 13453 13454 return SDValue(); 13455 } 13456 13457 SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const { 13458 if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) 13459 return Value; 13460 13461 return SDValue(); 13462 } 13463 13464 SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const { 13465 if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) 13466 return Value; 13467 13468 return SDValue(); 13469 } 13470
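// For example, the combines above (via stripModuloOnShift) let a legal
// vector shift such as
//   (shl v4i32:x, (and v4i32:y, <31,31,31,31>))
// become (PPCISD::SHL x, y): the hardware already interprets each shift
// amount modulo the element width, so the masking AND is redundant.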