1 //===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements the PPCISelLowering class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "PPCISelLowering.h" 15 #include "MCTargetDesc/PPCPredicates.h" 16 #include "PPC.h" 17 #include "PPCCCState.h" 18 #include "PPCCallingConv.h" 19 #include "PPCFrameLowering.h" 20 #include "PPCInstrInfo.h" 21 #include "PPCMachineFunctionInfo.h" 22 #include "PPCPerfectShuffle.h" 23 #include "PPCRegisterInfo.h" 24 #include "PPCSubtarget.h" 25 #include "PPCTargetMachine.h" 26 #include "llvm/ADT/APFloat.h" 27 #include "llvm/ADT/APInt.h" 28 #include "llvm/ADT/ArrayRef.h" 29 #include "llvm/ADT/DenseMap.h" 30 #include "llvm/ADT/None.h" 31 #include "llvm/ADT/STLExtras.h" 32 #include "llvm/ADT/SmallPtrSet.h" 33 #include "llvm/ADT/SmallSet.h" 34 #include "llvm/ADT/SmallVector.h" 35 #include "llvm/ADT/Statistic.h" 36 #include "llvm/ADT/StringRef.h" 37 #include "llvm/ADT/StringSwitch.h" 38 #include "llvm/CodeGen/CallingConvLower.h" 39 #include "llvm/CodeGen/ISDOpcodes.h" 40 #include "llvm/CodeGen/MachineBasicBlock.h" 41 #include "llvm/CodeGen/MachineFrameInfo.h" 42 #include "llvm/CodeGen/MachineFunction.h" 43 #include "llvm/CodeGen/MachineInstr.h" 44 #include "llvm/CodeGen/MachineInstrBuilder.h" 45 #include "llvm/CodeGen/MachineJumpTableInfo.h" 46 #include "llvm/CodeGen/MachineLoopInfo.h" 47 #include "llvm/CodeGen/MachineMemOperand.h" 48 #include "llvm/CodeGen/MachineOperand.h" 49 #include "llvm/CodeGen/MachineRegisterInfo.h" 50 #include "llvm/CodeGen/RuntimeLibcalls.h" 51 #include "llvm/CodeGen/SelectionDAG.h" 52 #include "llvm/CodeGen/SelectionDAGNodes.h" 53 #include 
"llvm/CodeGen/TargetInstrInfo.h" 54 #include "llvm/CodeGen/TargetLowering.h" 55 #include "llvm/CodeGen/TargetRegisterInfo.h" 56 #include "llvm/CodeGen/ValueTypes.h" 57 #include "llvm/IR/CallSite.h" 58 #include "llvm/IR/CallingConv.h" 59 #include "llvm/IR/Constant.h" 60 #include "llvm/IR/Constants.h" 61 #include "llvm/IR/DataLayout.h" 62 #include "llvm/IR/DebugLoc.h" 63 #include "llvm/IR/DerivedTypes.h" 64 #include "llvm/IR/Function.h" 65 #include "llvm/IR/GlobalValue.h" 66 #include "llvm/IR/IRBuilder.h" 67 #include "llvm/IR/Instructions.h" 68 #include "llvm/IR/Intrinsics.h" 69 #include "llvm/IR/Module.h" 70 #include "llvm/IR/Type.h" 71 #include "llvm/IR/Use.h" 72 #include "llvm/IR/Value.h" 73 #include "llvm/MC/MCExpr.h" 74 #include "llvm/MC/MCRegisterInfo.h" 75 #include "llvm/Support/AtomicOrdering.h" 76 #include "llvm/Support/BranchProbability.h" 77 #include "llvm/Support/Casting.h" 78 #include "llvm/Support/CodeGen.h" 79 #include "llvm/Support/CommandLine.h" 80 #include "llvm/Support/Compiler.h" 81 #include "llvm/Support/Debug.h" 82 #include "llvm/Support/ErrorHandling.h" 83 #include "llvm/Support/Format.h" 84 #include "llvm/Support/KnownBits.h" 85 #include "llvm/Support/MachineValueType.h" 86 #include "llvm/Support/MathExtras.h" 87 #include "llvm/Support/raw_ostream.h" 88 #include "llvm/Target/TargetMachine.h" 89 #include "llvm/Target/TargetOptions.h" 90 #include <algorithm> 91 #include <cassert> 92 #include <cstdint> 93 #include <iterator> 94 #include <list> 95 #include <utility> 96 #include <vector> 97 98 using namespace llvm; 99 100 #define DEBUG_TYPE "ppc-lowering" 101 102 static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc", 103 cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden); 104 105 static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref", 106 cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden); 107 108 static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned", 109 
cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden); 110 111 static cl::opt<bool> DisableSCO("disable-ppc-sco", 112 cl::desc("disable sibling call optimization on ppc"), cl::Hidden); 113 114 static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision", 115 cl::desc("enable quad precision float support on ppc"), cl::Hidden); 116 117 STATISTIC(NumTailCalls, "Number of tail calls"); 118 STATISTIC(NumSiblingCalls, "Number of sibling calls"); 119 120 static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int); 121 122 // FIXME: Remove this once the bug has been fixed! 123 extern cl::opt<bool> ANDIGlueBug; 124 125 PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, 126 const PPCSubtarget &STI) 127 : TargetLowering(TM), Subtarget(STI) { 128 // Use _setjmp/_longjmp instead of setjmp/longjmp. 129 setUseUnderscoreSetJmp(true); 130 setUseUnderscoreLongJmp(true); 131 132 // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all 133 // arguments are at least 4/8 bytes aligned. 134 bool isPPC64 = Subtarget.isPPC64(); 135 setMinStackArgumentAlignment(isPPC64 ? 8:4); 136 137 // Set up the register classes. 138 addRegisterClass(MVT::i32, &PPC::GPRCRegClass); 139 if (!useSoftFloat()) { 140 if (hasSPE()) { 141 addRegisterClass(MVT::f32, &PPC::SPE4RCRegClass); 142 addRegisterClass(MVT::f64, &PPC::SPERCRegClass); 143 } else { 144 addRegisterClass(MVT::f32, &PPC::F4RCRegClass); 145 addRegisterClass(MVT::f64, &PPC::F8RCRegClass); 146 } 147 } 148 149 // Match BITREVERSE to customized fast code sequence in the td file. 150 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 151 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); 152 153 // Sub-word ATOMIC_CMP_SWAP need to ensure that the input is zero-extended. 154 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); 155 156 // PowerPC has an i16 but no i8 (or i1) SEXTLOAD. 
157 for (MVT VT : MVT::integer_valuetypes()) { 158 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 159 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand); 160 } 161 162 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 163 164 // PowerPC has pre-inc load and store's. 165 setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal); 166 setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal); 167 setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal); 168 setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal); 169 setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal); 170 setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal); 171 setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal); 172 setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal); 173 setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal); 174 setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal); 175 if (!Subtarget.hasSPE()) { 176 setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal); 177 setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal); 178 setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal); 179 setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal); 180 } 181 182 // PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry. 183 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; 184 for (MVT VT : ScalarIntVTs) { 185 setOperationAction(ISD::ADDC, VT, Legal); 186 setOperationAction(ISD::ADDE, VT, Legal); 187 setOperationAction(ISD::SUBC, VT, Legal); 188 setOperationAction(ISD::SUBE, VT, Legal); 189 } 190 191 if (Subtarget.useCRBits()) { 192 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 193 194 if (isPPC64 || Subtarget.hasFPCVT()) { 195 setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); 196 AddPromotedToType (ISD::SINT_TO_FP, MVT::i1, 197 isPPC64 ? MVT::i64 : MVT::i32); 198 setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); 199 AddPromotedToType(ISD::UINT_TO_FP, MVT::i1, 200 isPPC64 ? 
MVT::i64 : MVT::i32); 201 } else { 202 setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom); 203 setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom); 204 } 205 206 // PowerPC does not support direct load/store of condition registers. 207 setOperationAction(ISD::LOAD, MVT::i1, Custom); 208 setOperationAction(ISD::STORE, MVT::i1, Custom); 209 210 // FIXME: Remove this once the ANDI glue bug is fixed: 211 if (ANDIGlueBug) 212 setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); 213 214 for (MVT VT : MVT::integer_valuetypes()) { 215 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 216 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 217 setTruncStoreAction(VT, MVT::i1, Expand); 218 } 219 220 addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass); 221 } 222 223 // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on 224 // PPC (the libcall is not available). 225 setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom); 226 setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom); 227 228 // We do not currently implement these libm ops for PowerPC. 229 setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand); 230 setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand); 231 setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand); 232 setOperationAction(ISD::FRINT, MVT::ppcf128, Expand); 233 setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand); 234 setOperationAction(ISD::FREM, MVT::ppcf128, Expand); 235 236 // PowerPC has no SREM/UREM instructions unless we are on P9 237 // On P9 we may use a hardware instruction to compute the remainder. 238 // The instructions are not legalized directly because in the cases where the 239 // result of both the remainder and the division is required it is more 240 // efficient to compute the remainder from the result of the division rather 241 // than use the remainder instruction. 
242 if (Subtarget.isISA3_0()) { 243 setOperationAction(ISD::SREM, MVT::i32, Custom); 244 setOperationAction(ISD::UREM, MVT::i32, Custom); 245 setOperationAction(ISD::SREM, MVT::i64, Custom); 246 setOperationAction(ISD::UREM, MVT::i64, Custom); 247 } else { 248 setOperationAction(ISD::SREM, MVT::i32, Expand); 249 setOperationAction(ISD::UREM, MVT::i32, Expand); 250 setOperationAction(ISD::SREM, MVT::i64, Expand); 251 setOperationAction(ISD::UREM, MVT::i64, Expand); 252 } 253 254 if (Subtarget.hasP9Vector()) { 255 setOperationAction(ISD::ABS, MVT::v4i32, Legal); 256 setOperationAction(ISD::ABS, MVT::v8i16, Legal); 257 setOperationAction(ISD::ABS, MVT::v16i8, Legal); 258 } 259 260 // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM. 261 setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); 262 setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); 263 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); 264 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); 265 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 266 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 267 setOperationAction(ISD::UDIVREM, MVT::i64, Expand); 268 setOperationAction(ISD::SDIVREM, MVT::i64, Expand); 269 270 // We don't support sin/cos/sqrt/fmod/pow 271 setOperationAction(ISD::FSIN , MVT::f64, Expand); 272 setOperationAction(ISD::FCOS , MVT::f64, Expand); 273 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 274 setOperationAction(ISD::FREM , MVT::f64, Expand); 275 setOperationAction(ISD::FPOW , MVT::f64, Expand); 276 setOperationAction(ISD::FSIN , MVT::f32, Expand); 277 setOperationAction(ISD::FCOS , MVT::f32, Expand); 278 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 279 setOperationAction(ISD::FREM , MVT::f32, Expand); 280 setOperationAction(ISD::FPOW , MVT::f32, Expand); 281 if (Subtarget.hasSPE()) { 282 setOperationAction(ISD::FMA , MVT::f64, Expand); 283 setOperationAction(ISD::FMA , MVT::f32, Expand); 284 } else { 285 setOperationAction(ISD::FMA , 
MVT::f64, Legal); 286 setOperationAction(ISD::FMA , MVT::f32, Legal); 287 } 288 289 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); 290 291 // If we're enabling GP optimizations, use hardware square root 292 if (!Subtarget.hasFSQRT() && 293 !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() && 294 Subtarget.hasFRE())) 295 setOperationAction(ISD::FSQRT, MVT::f64, Expand); 296 297 if (!Subtarget.hasFSQRT() && 298 !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() && 299 Subtarget.hasFRES())) 300 setOperationAction(ISD::FSQRT, MVT::f32, Expand); 301 302 if (Subtarget.hasFCPSGN()) { 303 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal); 304 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal); 305 } else { 306 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 307 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 308 } 309 310 if (Subtarget.hasFPRND()) { 311 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 312 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 313 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 314 setOperationAction(ISD::FROUND, MVT::f64, Legal); 315 316 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 317 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 318 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 319 setOperationAction(ISD::FROUND, MVT::f32, Legal); 320 } 321 322 // PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd 323 // to speed up scalar BSWAP64. 
324 // CTPOP or CTTZ were introduced in P8/P9 respectively 325 setOperationAction(ISD::BSWAP, MVT::i32 , Expand); 326 if (Subtarget.hasP9Vector()) 327 setOperationAction(ISD::BSWAP, MVT::i64 , Custom); 328 else 329 setOperationAction(ISD::BSWAP, MVT::i64 , Expand); 330 if (Subtarget.isISA3_0()) { 331 setOperationAction(ISD::CTTZ , MVT::i32 , Legal); 332 setOperationAction(ISD::CTTZ , MVT::i64 , Legal); 333 } else { 334 setOperationAction(ISD::CTTZ , MVT::i32 , Expand); 335 setOperationAction(ISD::CTTZ , MVT::i64 , Expand); 336 } 337 338 if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) { 339 setOperationAction(ISD::CTPOP, MVT::i32 , Legal); 340 setOperationAction(ISD::CTPOP, MVT::i64 , Legal); 341 } else { 342 setOperationAction(ISD::CTPOP, MVT::i32 , Expand); 343 setOperationAction(ISD::CTPOP, MVT::i64 , Expand); 344 } 345 346 // PowerPC does not have ROTR 347 setOperationAction(ISD::ROTR, MVT::i32 , Expand); 348 setOperationAction(ISD::ROTR, MVT::i64 , Expand); 349 350 if (!Subtarget.useCRBits()) { 351 // PowerPC does not have Select 352 setOperationAction(ISD::SELECT, MVT::i32, Expand); 353 setOperationAction(ISD::SELECT, MVT::i64, Expand); 354 setOperationAction(ISD::SELECT, MVT::f32, Expand); 355 setOperationAction(ISD::SELECT, MVT::f64, Expand); 356 } 357 358 // PowerPC wants to turn select_cc of FP into fsel when possible. 
359 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 360 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 361 362 // PowerPC wants to optimize integer setcc a bit 363 if (!Subtarget.useCRBits()) 364 setOperationAction(ISD::SETCC, MVT::i32, Custom); 365 366 // PowerPC does not have BRCOND which requires SetCC 367 if (!Subtarget.useCRBits()) 368 setOperationAction(ISD::BRCOND, MVT::Other, Expand); 369 370 setOperationAction(ISD::BR_JT, MVT::Other, Expand); 371 372 if (Subtarget.hasSPE()) { 373 // SPE has built-in conversions 374 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); 375 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); 376 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); 377 } else { 378 // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores. 379 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 380 381 // PowerPC does not have [U|S]INT_TO_FP 382 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); 383 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); 384 } 385 386 if (Subtarget.hasDirectMove() && isPPC64) { 387 setOperationAction(ISD::BITCAST, MVT::f32, Legal); 388 setOperationAction(ISD::BITCAST, MVT::i32, Legal); 389 setOperationAction(ISD::BITCAST, MVT::i64, Legal); 390 setOperationAction(ISD::BITCAST, MVT::f64, Legal); 391 } else { 392 setOperationAction(ISD::BITCAST, MVT::f32, Expand); 393 setOperationAction(ISD::BITCAST, MVT::i32, Expand); 394 setOperationAction(ISD::BITCAST, MVT::i64, Expand); 395 setOperationAction(ISD::BITCAST, MVT::f64, Expand); 396 } 397 398 // We cannot sextinreg(i1). Expand to shifts. 399 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 400 401 // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support 402 // SjLj exception handling but a light-weight setjmp/longjmp replacement to 403 // support continuation, user-level threading, and etc.. 
As a result, no 404 // other SjLj exception interfaces are implemented and please don't build 405 // your own exception handling based on them. 406 // LLVM/Clang supports zero-cost DWARF exception handling. 407 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); 408 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); 409 410 // We want to legalize GlobalAddress and ConstantPool nodes into the 411 // appropriate instructions to materialize the address. 412 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 413 setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); 414 setOperationAction(ISD::BlockAddress, MVT::i32, Custom); 415 setOperationAction(ISD::ConstantPool, MVT::i32, Custom); 416 setOperationAction(ISD::JumpTable, MVT::i32, Custom); 417 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 418 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 419 setOperationAction(ISD::BlockAddress, MVT::i64, Custom); 420 setOperationAction(ISD::ConstantPool, MVT::i64, Custom); 421 setOperationAction(ISD::JumpTable, MVT::i64, Custom); 422 423 // TRAP is legal. 424 setOperationAction(ISD::TRAP, MVT::Other, Legal); 425 426 // TRAMPOLINE is custom lowered. 427 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); 428 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); 429 430 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 431 setOperationAction(ISD::VASTART , MVT::Other, Custom); 432 433 if (Subtarget.isSVR4ABI()) { 434 if (isPPC64) { 435 // VAARG always uses double-word chunks, so promote anything smaller. 
436 setOperationAction(ISD::VAARG, MVT::i1, Promote); 437 AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64); 438 setOperationAction(ISD::VAARG, MVT::i8, Promote); 439 AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64); 440 setOperationAction(ISD::VAARG, MVT::i16, Promote); 441 AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64); 442 setOperationAction(ISD::VAARG, MVT::i32, Promote); 443 AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64); 444 setOperationAction(ISD::VAARG, MVT::Other, Expand); 445 } else { 446 // VAARG is custom lowered with the 32-bit SVR4 ABI. 447 setOperationAction(ISD::VAARG, MVT::Other, Custom); 448 setOperationAction(ISD::VAARG, MVT::i64, Custom); 449 } 450 } else 451 setOperationAction(ISD::VAARG, MVT::Other, Expand); 452 453 if (Subtarget.isSVR4ABI() && !isPPC64) 454 // VACOPY is custom lowered with the 32-bit SVR4 ABI. 455 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 456 else 457 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 458 459 // Use the default implementation. 460 setOperationAction(ISD::VAEND , MVT::Other, Expand); 461 setOperationAction(ISD::STACKSAVE , MVT::Other, Expand); 462 setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom); 463 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); 464 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom); 465 setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom); 466 setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom); 467 setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom); 468 setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom); 469 470 // We want to custom lower some of our intrinsics. 471 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 472 473 // To handle counter-based loop conditions. 
474 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom); 475 476 setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); 477 setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); 478 setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom); 479 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 480 481 // Comparisons that require checking two conditions. 482 if (Subtarget.hasSPE()) { 483 setCondCodeAction(ISD::SETO, MVT::f32, Expand); 484 setCondCodeAction(ISD::SETO, MVT::f64, Expand); 485 setCondCodeAction(ISD::SETUO, MVT::f32, Expand); 486 setCondCodeAction(ISD::SETUO, MVT::f64, Expand); 487 } 488 setCondCodeAction(ISD::SETULT, MVT::f32, Expand); 489 setCondCodeAction(ISD::SETULT, MVT::f64, Expand); 490 setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); 491 setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); 492 setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); 493 setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand); 494 setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); 495 setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); 496 setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); 497 setCondCodeAction(ISD::SETOLE, MVT::f64, Expand); 498 setCondCodeAction(ISD::SETONE, MVT::f32, Expand); 499 setCondCodeAction(ISD::SETONE, MVT::f64, Expand); 500 501 if (Subtarget.has64BitSupport()) { 502 // They also have instructions for converting between i64 and fp. 503 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); 504 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); 505 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); 506 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); 507 // This is just the low 32 bits of a (signed) fp->i64 conversion. 508 // We cannot do this with Promote because i64 is not a legal type. 
509 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 510 511 if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) 512 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 513 } else { 514 // PowerPC does not have FP_TO_UINT on 32-bit implementations. 515 if (Subtarget.hasSPE()) 516 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); 517 else 518 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); 519 } 520 521 // With the instructions enabled under FPCVT, we can do everything. 522 if (Subtarget.hasFPCVT()) { 523 if (Subtarget.has64BitSupport()) { 524 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); 525 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 526 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); 527 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); 528 } 529 530 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 531 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 532 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 533 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 534 } 535 536 if (Subtarget.use64BitRegs()) { 537 // 64-bit PowerPC implementations can support i64 types directly 538 addRegisterClass(MVT::i64, &PPC::G8RCRegClass); 539 // BUILD_PAIR can't be handled natively, and should be expanded to shl/or 540 setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); 541 // 64-bit PowerPC wants to expand i128 shifts itself. 542 setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); 543 setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); 544 setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); 545 } else { 546 // 32-bit PowerPC wants to expand i64 shifts itself. 547 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); 548 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); 549 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); 550 } 551 552 if (Subtarget.hasAltivec()) { 553 // First set operation action for all vector types to expand. 
Then we 554 // will selectively turn on ones that can be effectively codegen'd. 555 for (MVT VT : MVT::vector_valuetypes()) { 556 // add/sub are legal for all supported vector VT's. 557 setOperationAction(ISD::ADD, VT, Legal); 558 setOperationAction(ISD::SUB, VT, Legal); 559 560 // Vector instructions introduced in P8 561 if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) { 562 setOperationAction(ISD::CTPOP, VT, Legal); 563 setOperationAction(ISD::CTLZ, VT, Legal); 564 } 565 else { 566 setOperationAction(ISD::CTPOP, VT, Expand); 567 setOperationAction(ISD::CTLZ, VT, Expand); 568 } 569 570 // Vector instructions introduced in P9 571 if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128)) 572 setOperationAction(ISD::CTTZ, VT, Legal); 573 else 574 setOperationAction(ISD::CTTZ, VT, Expand); 575 576 // We promote all shuffles to v16i8. 577 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote); 578 AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8); 579 580 // We promote all non-typed operations to v4i32. 581 setOperationAction(ISD::AND , VT, Promote); 582 AddPromotedToType (ISD::AND , VT, MVT::v4i32); 583 setOperationAction(ISD::OR , VT, Promote); 584 AddPromotedToType (ISD::OR , VT, MVT::v4i32); 585 setOperationAction(ISD::XOR , VT, Promote); 586 AddPromotedToType (ISD::XOR , VT, MVT::v4i32); 587 setOperationAction(ISD::LOAD , VT, Promote); 588 AddPromotedToType (ISD::LOAD , VT, MVT::v4i32); 589 setOperationAction(ISD::SELECT, VT, Promote); 590 AddPromotedToType (ISD::SELECT, VT, MVT::v4i32); 591 setOperationAction(ISD::VSELECT, VT, Legal); 592 setOperationAction(ISD::SELECT_CC, VT, Promote); 593 AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32); 594 setOperationAction(ISD::STORE, VT, Promote); 595 AddPromotedToType (ISD::STORE, VT, MVT::v4i32); 596 597 // No other operations are legal. 
598 setOperationAction(ISD::MUL , VT, Expand); 599 setOperationAction(ISD::SDIV, VT, Expand); 600 setOperationAction(ISD::SREM, VT, Expand); 601 setOperationAction(ISD::UDIV, VT, Expand); 602 setOperationAction(ISD::UREM, VT, Expand); 603 setOperationAction(ISD::FDIV, VT, Expand); 604 setOperationAction(ISD::FREM, VT, Expand); 605 setOperationAction(ISD::FNEG, VT, Expand); 606 setOperationAction(ISD::FSQRT, VT, Expand); 607 setOperationAction(ISD::FLOG, VT, Expand); 608 setOperationAction(ISD::FLOG10, VT, Expand); 609 setOperationAction(ISD::FLOG2, VT, Expand); 610 setOperationAction(ISD::FEXP, VT, Expand); 611 setOperationAction(ISD::FEXP2, VT, Expand); 612 setOperationAction(ISD::FSIN, VT, Expand); 613 setOperationAction(ISD::FCOS, VT, Expand); 614 setOperationAction(ISD::FABS, VT, Expand); 615 setOperationAction(ISD::FFLOOR, VT, Expand); 616 setOperationAction(ISD::FCEIL, VT, Expand); 617 setOperationAction(ISD::FTRUNC, VT, Expand); 618 setOperationAction(ISD::FRINT, VT, Expand); 619 setOperationAction(ISD::FNEARBYINT, VT, Expand); 620 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand); 621 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); 622 setOperationAction(ISD::BUILD_VECTOR, VT, Expand); 623 setOperationAction(ISD::MULHU, VT, Expand); 624 setOperationAction(ISD::MULHS, VT, Expand); 625 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 626 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 627 setOperationAction(ISD::UDIVREM, VT, Expand); 628 setOperationAction(ISD::SDIVREM, VT, Expand); 629 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); 630 setOperationAction(ISD::FPOW, VT, Expand); 631 setOperationAction(ISD::BSWAP, VT, Expand); 632 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); 633 setOperationAction(ISD::ROTL, VT, Expand); 634 setOperationAction(ISD::ROTR, VT, Expand); 635 636 for (MVT InnerVT : MVT::vector_valuetypes()) { 637 setTruncStoreAction(VT, InnerVT, Expand); 638 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); 639 
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); 640 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); 641 } 642 } 643 644 // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle 645 // with merges, splats, etc. 646 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom); 647 648 setOperationAction(ISD::AND , MVT::v4i32, Legal); 649 setOperationAction(ISD::OR , MVT::v4i32, Legal); 650 setOperationAction(ISD::XOR , MVT::v4i32, Legal); 651 setOperationAction(ISD::LOAD , MVT::v4i32, Legal); 652 setOperationAction(ISD::SELECT, MVT::v4i32, 653 Subtarget.useCRBits() ? Legal : Expand); 654 setOperationAction(ISD::STORE , MVT::v4i32, Legal); 655 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); 656 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); 657 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 658 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); 659 setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); 660 setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); 661 setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); 662 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); 663 664 addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass); 665 addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass); 666 addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass); 667 addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass); 668 669 setOperationAction(ISD::MUL, MVT::v4f32, Legal); 670 setOperationAction(ISD::FMA, MVT::v4f32, Legal); 671 672 if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) { 673 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 674 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 675 } 676 677 if (Subtarget.hasP8Altivec()) 678 setOperationAction(ISD::MUL, MVT::v4i32, Legal); 679 else 680 setOperationAction(ISD::MUL, MVT::v4i32, Custom); 681 682 setOperationAction(ISD::MUL, MVT::v8i16, Custom); 683 setOperationAction(ISD::MUL, MVT::v16i8, Custom); 684 685 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom); 686 
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom); 687 688 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom); 689 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom); 690 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom); 691 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 692 693 // Altivec does not contain unordered floating-point compare instructions 694 setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand); 695 setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand); 696 setCondCodeAction(ISD::SETO, MVT::v4f32, Expand); 697 setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand); 698 699 if (Subtarget.hasVSX()) { 700 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); 701 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); 702 if (Subtarget.hasP8Vector()) { 703 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); 704 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal); 705 } 706 if (Subtarget.hasDirectMove() && isPPC64) { 707 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal); 708 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal); 709 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal); 710 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal); 711 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal); 712 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal); 713 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal); 714 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); 715 } 716 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); 717 718 setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); 719 setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); 720 setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); 721 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); 722 setOperationAction(ISD::FROUND, MVT::v2f64, Legal); 723 724 setOperationAction(ISD::FROUND, MVT::v4f32, Legal); 725 726 
setOperationAction(ISD::MUL, MVT::v2f64, Legal); 727 setOperationAction(ISD::FMA, MVT::v2f64, Legal); 728 729 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 730 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 731 732 // Share the Altivec comparison restrictions. 733 setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand); 734 setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand); 735 setCondCodeAction(ISD::SETO, MVT::v2f64, Expand); 736 setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand); 737 738 setOperationAction(ISD::LOAD, MVT::v2f64, Legal); 739 setOperationAction(ISD::STORE, MVT::v2f64, Legal); 740 741 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal); 742 743 if (Subtarget.hasP8Vector()) 744 addRegisterClass(MVT::f32, &PPC::VSSRCRegClass); 745 746 addRegisterClass(MVT::f64, &PPC::VSFRCRegClass); 747 748 addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass); 749 addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass); 750 addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass); 751 752 if (Subtarget.hasP8Altivec()) { 753 setOperationAction(ISD::SHL, MVT::v2i64, Legal); 754 setOperationAction(ISD::SRA, MVT::v2i64, Legal); 755 setOperationAction(ISD::SRL, MVT::v2i64, Legal); 756 757 // 128 bit shifts can be accomplished via 3 instructions for SHL and 758 // SRL, but not for SRA because of the instructions available: 759 // VS{RL} and VS{RL}O. However due to direct move costs, it's not worth 760 // doing 761 setOperationAction(ISD::SHL, MVT::v1i128, Expand); 762 setOperationAction(ISD::SRL, MVT::v1i128, Expand); 763 setOperationAction(ISD::SRA, MVT::v1i128, Expand); 764 765 setOperationAction(ISD::SETCC, MVT::v2i64, Legal); 766 } 767 else { 768 setOperationAction(ISD::SHL, MVT::v2i64, Expand); 769 setOperationAction(ISD::SRA, MVT::v2i64, Expand); 770 setOperationAction(ISD::SRL, MVT::v2i64, Expand); 771 772 setOperationAction(ISD::SETCC, MVT::v2i64, Custom); 773 774 // VSX v2i64 only supports non-arithmetic operations. 
775 setOperationAction(ISD::ADD, MVT::v2i64, Expand); 776 setOperationAction(ISD::SUB, MVT::v2i64, Expand); 777 } 778 779 setOperationAction(ISD::LOAD, MVT::v2i64, Promote); 780 AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64); 781 setOperationAction(ISD::STORE, MVT::v2i64, Promote); 782 AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64); 783 784 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal); 785 786 setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); 787 setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); 788 setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); 789 setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); 790 791 setOperationAction(ISD::UINT_TO_FP, MVT::v2i16, Custom); 792 setOperationAction(ISD::SINT_TO_FP, MVT::v2i16, Custom); 793 794 setOperationAction(ISD::FNEG, MVT::v4f32, Legal); 795 setOperationAction(ISD::FNEG, MVT::v2f64, Legal); 796 setOperationAction(ISD::FABS, MVT::v4f32, Legal); 797 setOperationAction(ISD::FABS, MVT::v2f64, Legal); 798 799 if (Subtarget.hasDirectMove()) 800 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 801 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 802 803 addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass); 804 } 805 806 if (Subtarget.hasP8Altivec()) { 807 addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass); 808 addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass); 809 } 810 811 if (Subtarget.hasP9Vector()) { 812 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 813 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 814 815 // 128 bit shifts can be accomplished via 3 instructions for SHL and 816 // SRL, but not for SRA because of the instructions available: 817 // VS{RL} and VS{RL}O. 
818 setOperationAction(ISD::SHL, MVT::v1i128, Legal); 819 setOperationAction(ISD::SRL, MVT::v1i128, Legal); 820 setOperationAction(ISD::SRA, MVT::v1i128, Expand); 821 822 if (EnableQuadPrecision) { 823 addRegisterClass(MVT::f128, &PPC::VRRCRegClass); 824 setOperationAction(ISD::FADD, MVT::f128, Legal); 825 setOperationAction(ISD::FSUB, MVT::f128, Legal); 826 setOperationAction(ISD::FDIV, MVT::f128, Legal); 827 setOperationAction(ISD::FMUL, MVT::f128, Legal); 828 setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal); 829 // No extending loads to f128 on PPC. 830 for (MVT FPT : MVT::fp_valuetypes()) 831 setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand); 832 setOperationAction(ISD::FMA, MVT::f128, Legal); 833 setCondCodeAction(ISD::SETULT, MVT::f128, Expand); 834 setCondCodeAction(ISD::SETUGT, MVT::f128, Expand); 835 setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand); 836 setCondCodeAction(ISD::SETOGE, MVT::f128, Expand); 837 setCondCodeAction(ISD::SETOLE, MVT::f128, Expand); 838 setCondCodeAction(ISD::SETONE, MVT::f128, Expand); 839 840 setOperationAction(ISD::FTRUNC, MVT::f128, Legal); 841 setOperationAction(ISD::FRINT, MVT::f128, Legal); 842 setOperationAction(ISD::FFLOOR, MVT::f128, Legal); 843 setOperationAction(ISD::FCEIL, MVT::f128, Legal); 844 setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal); 845 setOperationAction(ISD::FROUND, MVT::f128, Legal); 846 847 setOperationAction(ISD::SELECT, MVT::f128, Expand); 848 setOperationAction(ISD::FP_ROUND, MVT::f64, Legal); 849 setOperationAction(ISD::FP_ROUND, MVT::f32, Legal); 850 setTruncStoreAction(MVT::f128, MVT::f64, Expand); 851 setTruncStoreAction(MVT::f128, MVT::f32, Expand); 852 setOperationAction(ISD::BITCAST, MVT::i128, Custom); 853 // No implementation for these ops for PowerPC. 
854 setOperationAction(ISD::FSIN , MVT::f128, Expand); 855 setOperationAction(ISD::FCOS , MVT::f128, Expand); 856 setOperationAction(ISD::FPOW, MVT::f128, Expand); 857 setOperationAction(ISD::FPOWI, MVT::f128, Expand); 858 setOperationAction(ISD::FREM, MVT::f128, Expand); 859 } 860 861 } 862 863 if (Subtarget.hasP9Altivec()) { 864 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 865 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 866 } 867 } 868 869 if (Subtarget.hasQPX()) { 870 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 871 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 872 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 873 setOperationAction(ISD::FREM, MVT::v4f64, Expand); 874 875 setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal); 876 setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand); 877 878 setOperationAction(ISD::LOAD , MVT::v4f64, Custom); 879 setOperationAction(ISD::STORE , MVT::v4f64, Custom); 880 881 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom); 882 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom); 883 884 if (!Subtarget.useCRBits()) 885 setOperationAction(ISD::SELECT, MVT::v4f64, Expand); 886 setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); 887 888 setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal); 889 setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand); 890 setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand); 891 setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand); 892 setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom); 893 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal); 894 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); 895 896 setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal); 897 setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand); 898 899 setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal); 900 setOperationAction(ISD::FP_ROUND_INREG , MVT::v4f32, Expand); 901 
setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); 902 903 setOperationAction(ISD::FNEG , MVT::v4f64, Legal); 904 setOperationAction(ISD::FABS , MVT::v4f64, Legal); 905 setOperationAction(ISD::FSIN , MVT::v4f64, Expand); 906 setOperationAction(ISD::FCOS , MVT::v4f64, Expand); 907 setOperationAction(ISD::FPOW , MVT::v4f64, Expand); 908 setOperationAction(ISD::FLOG , MVT::v4f64, Expand); 909 setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand); 910 setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand); 911 setOperationAction(ISD::FEXP , MVT::v4f64, Expand); 912 setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand); 913 914 setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal); 915 setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal); 916 917 setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal); 918 setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal); 919 920 addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass); 921 922 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 923 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 924 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 925 setOperationAction(ISD::FREM, MVT::v4f32, Expand); 926 927 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal); 928 setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand); 929 930 setOperationAction(ISD::LOAD , MVT::v4f32, Custom); 931 setOperationAction(ISD::STORE , MVT::v4f32, Custom); 932 933 if (!Subtarget.useCRBits()) 934 setOperationAction(ISD::SELECT, MVT::v4f32, Expand); 935 setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); 936 937 setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal); 938 setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand); 939 setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand); 940 setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand); 941 setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom); 942 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); 943 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, 
Custom); 944 945 setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal); 946 setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand); 947 948 setOperationAction(ISD::FNEG , MVT::v4f32, Legal); 949 setOperationAction(ISD::FABS , MVT::v4f32, Legal); 950 setOperationAction(ISD::FSIN , MVT::v4f32, Expand); 951 setOperationAction(ISD::FCOS , MVT::v4f32, Expand); 952 setOperationAction(ISD::FPOW , MVT::v4f32, Expand); 953 setOperationAction(ISD::FLOG , MVT::v4f32, Expand); 954 setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand); 955 setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand); 956 setOperationAction(ISD::FEXP , MVT::v4f32, Expand); 957 setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand); 958 959 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); 960 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); 961 962 setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal); 963 setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal); 964 965 addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass); 966 967 setOperationAction(ISD::AND , MVT::v4i1, Legal); 968 setOperationAction(ISD::OR , MVT::v4i1, Legal); 969 setOperationAction(ISD::XOR , MVT::v4i1, Legal); 970 971 if (!Subtarget.useCRBits()) 972 setOperationAction(ISD::SELECT, MVT::v4i1, Expand); 973 setOperationAction(ISD::VSELECT, MVT::v4i1, Legal); 974 975 setOperationAction(ISD::LOAD , MVT::v4i1, Custom); 976 setOperationAction(ISD::STORE , MVT::v4i1, Custom); 977 978 setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom); 979 setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand); 980 setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand); 981 setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand); 982 setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom); 983 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand); 984 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); 985 986 setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); 987 setOperationAction(ISD::UINT_TO_FP, 
MVT::v4i1, Custom); 988 989 addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass); 990 991 setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); 992 setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); 993 setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); 994 setOperationAction(ISD::FROUND, MVT::v4f64, Legal); 995 996 setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); 997 setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); 998 setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); 999 setOperationAction(ISD::FROUND, MVT::v4f32, Legal); 1000 1001 setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand); 1002 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); 1003 1004 // These need to set FE_INEXACT, and so cannot be vectorized here. 1005 setOperationAction(ISD::FRINT, MVT::v4f64, Expand); 1006 setOperationAction(ISD::FRINT, MVT::v4f32, Expand); 1007 1008 if (TM.Options.UnsafeFPMath) { 1009 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 1010 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 1011 1012 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 1013 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 1014 } else { 1015 setOperationAction(ISD::FDIV, MVT::v4f64, Expand); 1016 setOperationAction(ISD::FSQRT, MVT::v4f64, Expand); 1017 1018 setOperationAction(ISD::FDIV, MVT::v4f32, Expand); 1019 setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); 1020 } 1021 } 1022 1023 if (Subtarget.has64BitSupport()) 1024 setOperationAction(ISD::PREFETCH, MVT::Other, Legal); 1025 1026 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom); 1027 1028 if (!isPPC64) { 1029 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); 1030 setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); 1031 } 1032 1033 setBooleanContents(ZeroOrOneBooleanContent); 1034 1035 if (Subtarget.hasAltivec()) { 1036 // Altivec instructions set fields to all zeros or all ones. 
1037 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 1038 } 1039 1040 if (!isPPC64) { 1041 // These libcalls are not available in 32-bit. 1042 setLibcallName(RTLIB::SHL_I128, nullptr); 1043 setLibcallName(RTLIB::SRL_I128, nullptr); 1044 setLibcallName(RTLIB::SRA_I128, nullptr); 1045 } 1046 1047 setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1); 1048 1049 // We have target-specific dag combine patterns for the following nodes: 1050 setTargetDAGCombine(ISD::ADD); 1051 setTargetDAGCombine(ISD::SHL); 1052 setTargetDAGCombine(ISD::SRA); 1053 setTargetDAGCombine(ISD::SRL); 1054 setTargetDAGCombine(ISD::SINT_TO_FP); 1055 setTargetDAGCombine(ISD::BUILD_VECTOR); 1056 if (Subtarget.hasFPCVT()) 1057 setTargetDAGCombine(ISD::UINT_TO_FP); 1058 setTargetDAGCombine(ISD::LOAD); 1059 setTargetDAGCombine(ISD::STORE); 1060 setTargetDAGCombine(ISD::BR_CC); 1061 if (Subtarget.useCRBits()) 1062 setTargetDAGCombine(ISD::BRCOND); 1063 setTargetDAGCombine(ISD::BSWAP); 1064 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 1065 setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); 1066 setTargetDAGCombine(ISD::INTRINSIC_VOID); 1067 1068 setTargetDAGCombine(ISD::SIGN_EXTEND); 1069 setTargetDAGCombine(ISD::ZERO_EXTEND); 1070 setTargetDAGCombine(ISD::ANY_EXTEND); 1071 1072 setTargetDAGCombine(ISD::TRUNCATE); 1073 1074 if (Subtarget.useCRBits()) { 1075 setTargetDAGCombine(ISD::TRUNCATE); 1076 setTargetDAGCombine(ISD::SETCC); 1077 setTargetDAGCombine(ISD::SELECT_CC); 1078 } 1079 1080 // Use reciprocal estimates. 1081 if (TM.Options.UnsafeFPMath) { 1082 setTargetDAGCombine(ISD::FDIV); 1083 setTargetDAGCombine(ISD::FSQRT); 1084 } 1085 1086 // Darwin long double math library functions have $LDBL128 appended. 
1087 if (Subtarget.isDarwin()) { 1088 setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128"); 1089 setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128"); 1090 setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128"); 1091 setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128"); 1092 setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128"); 1093 setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128"); 1094 setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128"); 1095 setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128"); 1096 setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128"); 1097 setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128"); 1098 } 1099 1100 if (EnableQuadPrecision) { 1101 setLibcallName(RTLIB::LOG_F128, "logf128"); 1102 setLibcallName(RTLIB::LOG2_F128, "log2f128"); 1103 setLibcallName(RTLIB::LOG10_F128, "log10f128"); 1104 setLibcallName(RTLIB::EXP_F128, "expf128"); 1105 setLibcallName(RTLIB::EXP2_F128, "exp2f128"); 1106 setLibcallName(RTLIB::SIN_F128, "sinf128"); 1107 setLibcallName(RTLIB::COS_F128, "cosf128"); 1108 setLibcallName(RTLIB::POW_F128, "powf128"); 1109 setLibcallName(RTLIB::FMIN_F128, "fminf128"); 1110 setLibcallName(RTLIB::FMAX_F128, "fmaxf128"); 1111 setLibcallName(RTLIB::POWI_F128, "__powikf2"); 1112 setLibcallName(RTLIB::REM_F128, "fmodf128"); 1113 } 1114 1115 // With 32 condition bits, we don't need to sink (and duplicate) compares 1116 // aggressively in CodeGenPrep. 
1117 if (Subtarget.useCRBits()) { 1118 setHasMultipleConditionRegisters(); 1119 setJumpIsExpensive(); 1120 } 1121 1122 setMinFunctionAlignment(2); 1123 if (Subtarget.isDarwin()) 1124 setPrefFunctionAlignment(4); 1125 1126 switch (Subtarget.getDarwinDirective()) { 1127 default: break; 1128 case PPC::DIR_970: 1129 case PPC::DIR_A2: 1130 case PPC::DIR_E500: 1131 case PPC::DIR_E500mc: 1132 case PPC::DIR_E5500: 1133 case PPC::DIR_PWR4: 1134 case PPC::DIR_PWR5: 1135 case PPC::DIR_PWR5X: 1136 case PPC::DIR_PWR6: 1137 case PPC::DIR_PWR6X: 1138 case PPC::DIR_PWR7: 1139 case PPC::DIR_PWR8: 1140 case PPC::DIR_PWR9: 1141 setPrefFunctionAlignment(4); 1142 setPrefLoopAlignment(4); 1143 break; 1144 } 1145 1146 if (Subtarget.enableMachineScheduler()) 1147 setSchedulingPreference(Sched::Source); 1148 else 1149 setSchedulingPreference(Sched::Hybrid); 1150 1151 computeRegisterProperties(STI.getRegisterInfo()); 1152 1153 // The Freescale cores do better with aggressive inlining of memcpy and 1154 // friends. GCC uses same threshold of 128 bytes (= 32 word stores). 1155 if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc || 1156 Subtarget.getDarwinDirective() == PPC::DIR_E5500) { 1157 MaxStoresPerMemset = 32; 1158 MaxStoresPerMemsetOptSize = 16; 1159 MaxStoresPerMemcpy = 32; 1160 MaxStoresPerMemcpyOptSize = 8; 1161 MaxStoresPerMemmove = 32; 1162 MaxStoresPerMemmoveOptSize = 8; 1163 } else if (Subtarget.getDarwinDirective() == PPC::DIR_A2) { 1164 // The A2 also benefits from (very) aggressive inlining of memcpy and 1165 // friends. The overhead of a the function call, even when warm, can be 1166 // over one hundred cycles. 1167 MaxStoresPerMemset = 128; 1168 MaxStoresPerMemcpy = 128; 1169 MaxStoresPerMemmove = 128; 1170 MaxLoadsPerMemcmp = 128; 1171 } else { 1172 MaxLoadsPerMemcmp = 8; 1173 MaxLoadsPerMemcmpOptSize = 4; 1174 } 1175 } 1176 1177 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine 1178 /// the desired ByVal argument alignment. 
1179 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign, 1180 unsigned MaxMaxAlign) { 1181 if (MaxAlign == MaxMaxAlign) 1182 return; 1183 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { 1184 if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256) 1185 MaxAlign = 32; 1186 else if (VTy->getBitWidth() >= 128 && MaxAlign < 16) 1187 MaxAlign = 16; 1188 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 1189 unsigned EltAlign = 0; 1190 getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign); 1191 if (EltAlign > MaxAlign) 1192 MaxAlign = EltAlign; 1193 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 1194 for (auto *EltTy : STy->elements()) { 1195 unsigned EltAlign = 0; 1196 getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign); 1197 if (EltAlign > MaxAlign) 1198 MaxAlign = EltAlign; 1199 if (MaxAlign == MaxMaxAlign) 1200 break; 1201 } 1202 } 1203 } 1204 1205 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate 1206 /// function arguments in the caller parameter area. 1207 unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, 1208 const DataLayout &DL) const { 1209 // Darwin passes everything on 4 byte boundary. 1210 if (Subtarget.isDarwin()) 1211 return 4; 1212 1213 // 16byte and wider vectors are passed on 16byte boundary. 1214 // The rest is 8 on PPC64 and 4 on PPC32 boundary. 1215 unsigned Align = Subtarget.isPPC64() ? 8 : 4; 1216 if (Subtarget.hasAltivec() || Subtarget.hasQPX()) 1217 getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 
32 : 16); 1218 return Align; 1219 } 1220 1221 unsigned PPCTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, 1222 CallingConv:: ID CC, 1223 EVT VT) const { 1224 if (Subtarget.hasSPE() && VT == MVT::f64) 1225 return 2; 1226 return PPCTargetLowering::getNumRegisters(Context, VT); 1227 } 1228 1229 MVT PPCTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, 1230 CallingConv:: ID CC, 1231 EVT VT) const { 1232 if (Subtarget.hasSPE() && VT == MVT::f64) 1233 return MVT::i32; 1234 return PPCTargetLowering::getRegisterType(Context, VT); 1235 } 1236 1237 bool PPCTargetLowering::useSoftFloat() const { 1238 return Subtarget.useSoftFloat(); 1239 } 1240 1241 bool PPCTargetLowering::hasSPE() const { 1242 return Subtarget.hasSPE(); 1243 } 1244 1245 const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { 1246 switch ((PPCISD::NodeType)Opcode) { 1247 case PPCISD::FIRST_NUMBER: break; 1248 case PPCISD::FSEL: return "PPCISD::FSEL"; 1249 case PPCISD::FCFID: return "PPCISD::FCFID"; 1250 case PPCISD::FCFIDU: return "PPCISD::FCFIDU"; 1251 case PPCISD::FCFIDS: return "PPCISD::FCFIDS"; 1252 case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS"; 1253 case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ"; 1254 case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ"; 1255 case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ"; 1256 case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ"; 1257 case PPCISD::FP_TO_UINT_IN_VSR: 1258 return "PPCISD::FP_TO_UINT_IN_VSR,"; 1259 case PPCISD::FP_TO_SINT_IN_VSR: 1260 return "PPCISD::FP_TO_SINT_IN_VSR"; 1261 case PPCISD::FRE: return "PPCISD::FRE"; 1262 case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE"; 1263 case PPCISD::STFIWX: return "PPCISD::STFIWX"; 1264 case PPCISD::VMADDFP: return "PPCISD::VMADDFP"; 1265 case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP"; 1266 case PPCISD::VPERM: return "PPCISD::VPERM"; 1267 case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; 1268 case PPCISD::VECINSERT: return "PPCISD::VECINSERT"; 1269 case PPCISD::XXREVERSE: return 
"PPCISD::XXREVERSE"; 1270 case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI"; 1271 case PPCISD::VECSHL: return "PPCISD::VECSHL"; 1272 case PPCISD::CMPB: return "PPCISD::CMPB"; 1273 case PPCISD::Hi: return "PPCISD::Hi"; 1274 case PPCISD::Lo: return "PPCISD::Lo"; 1275 case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; 1276 case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8"; 1277 case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16"; 1278 case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; 1279 case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; 1280 case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; 1281 case PPCISD::SRL: return "PPCISD::SRL"; 1282 case PPCISD::SRA: return "PPCISD::SRA"; 1283 case PPCISD::SHL: return "PPCISD::SHL"; 1284 case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE"; 1285 case PPCISD::CALL: return "PPCISD::CALL"; 1286 case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP"; 1287 case PPCISD::MTCTR: return "PPCISD::MTCTR"; 1288 case PPCISD::BCTRL: return "PPCISD::BCTRL"; 1289 case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC"; 1290 case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG"; 1291 case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE"; 1292 case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP"; 1293 case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP"; 1294 case PPCISD::MFOCRF: return "PPCISD::MFOCRF"; 1295 case PPCISD::MFVSR: return "PPCISD::MFVSR"; 1296 case PPCISD::MTVSRA: return "PPCISD::MTVSRA"; 1297 case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ"; 1298 case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP"; 1299 case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP"; 1300 case PPCISD::ANDIo_1_EQ_BIT: return "PPCISD::ANDIo_1_EQ_BIT"; 1301 case PPCISD::ANDIo_1_GT_BIT: return "PPCISD::ANDIo_1_GT_BIT"; 1302 case PPCISD::VCMP: return "PPCISD::VCMP"; 1303 case PPCISD::VCMPo: return "PPCISD::VCMPo"; 1304 case PPCISD::LBRX: return "PPCISD::LBRX"; 1305 case 
PPCISD::STBRX: return "PPCISD::STBRX"; 1306 case PPCISD::LFIWAX: return "PPCISD::LFIWAX"; 1307 case PPCISD::LFIWZX: return "PPCISD::LFIWZX"; 1308 case PPCISD::LXSIZX: return "PPCISD::LXSIZX"; 1309 case PPCISD::STXSIX: return "PPCISD::STXSIX"; 1310 case PPCISD::VEXTS: return "PPCISD::VEXTS"; 1311 case PPCISD::SExtVElems: return "PPCISD::SExtVElems"; 1312 case PPCISD::LXVD2X: return "PPCISD::LXVD2X"; 1313 case PPCISD::STXVD2X: return "PPCISD::STXVD2X"; 1314 case PPCISD::ST_VSR_SCAL_INT: 1315 return "PPCISD::ST_VSR_SCAL_INT"; 1316 case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; 1317 case PPCISD::BDNZ: return "PPCISD::BDNZ"; 1318 case PPCISD::BDZ: return "PPCISD::BDZ"; 1319 case PPCISD::MFFS: return "PPCISD::MFFS"; 1320 case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ"; 1321 case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN"; 1322 case PPCISD::CR6SET: return "PPCISD::CR6SET"; 1323 case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET"; 1324 case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT"; 1325 case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT"; 1326 case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA"; 1327 case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L"; 1328 case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS"; 1329 case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA"; 1330 case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L"; 1331 case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR"; 1332 case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR"; 1333 case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA"; 1334 case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L"; 1335 case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR"; 1336 case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR"; 1337 case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA"; 1338 case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; 1339 case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT"; 1340 case 
PPCISD::SC: return "PPCISD::SC"; 1341 case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB"; 1342 case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE"; 1343 case PPCISD::RFEBB: return "PPCISD::RFEBB"; 1344 case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; 1345 case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; 1346 case PPCISD::QVFPERM: return "PPCISD::QVFPERM"; 1347 case PPCISD::QVGPCI: return "PPCISD::QVGPCI"; 1348 case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI"; 1349 case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI"; 1350 case PPCISD::QBFLT: return "PPCISD::QBFLT"; 1351 case PPCISD::QVLFSb: return "PPCISD::QVLFSb"; 1352 case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; 1353 case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI"; 1354 } 1355 return nullptr; 1356 } 1357 1358 EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, 1359 EVT VT) const { 1360 if (!VT.isVector()) 1361 return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; 1362 1363 if (Subtarget.hasQPX()) 1364 return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements()); 1365 1366 return VT.changeVectorElementTypeToInteger(); 1367 } 1368 1369 bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const { 1370 assert(VT.isFloatingPoint() && "Non-floating-point FMA?"); 1371 return true; 1372 } 1373 1374 //===----------------------------------------------------------------------===// 1375 // Node matching predicates, for use by the tblgen matching code. 1376 //===----------------------------------------------------------------------===// 1377 1378 /// isFloatingPointZero - Return true if this is 0.0 or -0.0. 1379 static bool isFloatingPointZero(SDValue Op) { 1380 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) 1381 return CFP->getValueAPF().isZero(); 1382 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { 1383 // Maybe this has already been legalized into the constant pool? 
1384 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1))) 1385 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) 1386 return CFP->getValueAPF().isZero(); 1387 } 1388 return false; 1389 } 1390 1391 /// isConstantOrUndef - Op is either an undef node or a ConstantSDNode. Return 1392 /// true if Op is undef or if it matches the specified value. 1393 static bool isConstantOrUndef(int Op, int Val) { 1394 return Op < 0 || Op == Val; 1395 } 1396 1397 /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a 1398 /// VPKUHUM instruction. 1399 /// The ShuffleKind distinguishes between big-endian operations with 1400 /// two different inputs (0), either-endian operations with two identical 1401 /// inputs (1), and little-endian operations with two different inputs (2). 1402 /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). 1403 bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, 1404 SelectionDAG &DAG) { 1405 bool IsLE = DAG.getDataLayout().isLittleEndian(); 1406 if (ShuffleKind == 0) { 1407 if (IsLE) 1408 return false; 1409 for (unsigned i = 0; i != 16; ++i) 1410 if (!isConstantOrUndef(N->getMaskElt(i), i*2+1)) 1411 return false; 1412 } else if (ShuffleKind == 2) { 1413 if (!IsLE) 1414 return false; 1415 for (unsigned i = 0; i != 16; ++i) 1416 if (!isConstantOrUndef(N->getMaskElt(i), i*2)) 1417 return false; 1418 } else if (ShuffleKind == 1) { 1419 unsigned j = IsLE ? 0 : 1; 1420 for (unsigned i = 0; i != 8; ++i) 1421 if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) || 1422 !isConstantOrUndef(N->getMaskElt(i+8), i*2+j)) 1423 return false; 1424 } 1425 return true; 1426 } 1427 1428 /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a 1429 /// VPKUWUM instruction. 
1430 /// The ShuffleKind distinguishes between big-endian operations with 1431 /// two different inputs (0), either-endian operations with two identical 1432 /// inputs (1), and little-endian operations with two different inputs (2). 1433 /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). 1434 bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, 1435 SelectionDAG &DAG) { 1436 bool IsLE = DAG.getDataLayout().isLittleEndian(); 1437 if (ShuffleKind == 0) { 1438 if (IsLE) 1439 return false; 1440 for (unsigned i = 0; i != 16; i += 2) 1441 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) || 1442 !isConstantOrUndef(N->getMaskElt(i+1), i*2+3)) 1443 return false; 1444 } else if (ShuffleKind == 2) { 1445 if (!IsLE) 1446 return false; 1447 for (unsigned i = 0; i != 16; i += 2) 1448 if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || 1449 !isConstantOrUndef(N->getMaskElt(i+1), i*2+1)) 1450 return false; 1451 } else if (ShuffleKind == 1) { 1452 unsigned j = IsLE ? 0 : 2; 1453 for (unsigned i = 0; i != 8; i += 2) 1454 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || 1455 !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || 1456 !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || 1457 !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1)) 1458 return false; 1459 } 1460 return true; 1461 } 1462 1463 /// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a 1464 /// VPKUDUM instruction, AND the VPKUDUM instruction exists for the 1465 /// current subtarget. 1466 /// 1467 /// The ShuffleKind distinguishes between big-endian operations with 1468 /// two different inputs (0), either-endian operations with two identical 1469 /// inputs (1), and little-endian operations with two different inputs (2). 1470 /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). 
1471 bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, 1472 SelectionDAG &DAG) { 1473 const PPCSubtarget& Subtarget = 1474 static_cast<const PPCSubtarget&>(DAG.getSubtarget()); 1475 if (!Subtarget.hasP8Vector()) 1476 return false; 1477 1478 bool IsLE = DAG.getDataLayout().isLittleEndian(); 1479 if (ShuffleKind == 0) { 1480 if (IsLE) 1481 return false; 1482 for (unsigned i = 0; i != 16; i += 4) 1483 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+4) || 1484 !isConstantOrUndef(N->getMaskElt(i+1), i*2+5) || 1485 !isConstantOrUndef(N->getMaskElt(i+2), i*2+6) || 1486 !isConstantOrUndef(N->getMaskElt(i+3), i*2+7)) 1487 return false; 1488 } else if (ShuffleKind == 2) { 1489 if (!IsLE) 1490 return false; 1491 for (unsigned i = 0; i != 16; i += 4) 1492 if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || 1493 !isConstantOrUndef(N->getMaskElt(i+1), i*2+1) || 1494 !isConstantOrUndef(N->getMaskElt(i+2), i*2+2) || 1495 !isConstantOrUndef(N->getMaskElt(i+3), i*2+3)) 1496 return false; 1497 } else if (ShuffleKind == 1) { 1498 unsigned j = IsLE ? 0 : 4; 1499 for (unsigned i = 0; i != 8; i += 4) 1500 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || 1501 !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || 1502 !isConstantOrUndef(N->getMaskElt(i+2), i*2+j+2) || 1503 !isConstantOrUndef(N->getMaskElt(i+3), i*2+j+3) || 1504 !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || 1505 !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1) || 1506 !isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) || 1507 !isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3)) 1508 return false; 1509 } 1510 return true; 1511 } 1512 1513 /// isVMerge - Common function, used to match vmrg* shuffles. 
1514 /// 1515 static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, 1516 unsigned LHSStart, unsigned RHSStart) { 1517 if (N->getValueType(0) != MVT::v16i8) 1518 return false; 1519 assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) && 1520 "Unsupported merge size!"); 1521 1522 for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units 1523 for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit 1524 if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j), 1525 LHSStart+j+i*UnitSize) || 1526 !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j), 1527 RHSStart+j+i*UnitSize)) 1528 return false; 1529 } 1530 return true; 1531 } 1532 1533 /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for 1534 /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes). 1535 /// The ShuffleKind distinguishes between big-endian merges with two 1536 /// different inputs (0), either-endian merges with two identical inputs (1), 1537 /// and little-endian merges with two different inputs (2). For the latter, 1538 /// the input operands are swapped (see PPCInstrAltivec.td). 1539 bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, 1540 unsigned ShuffleKind, SelectionDAG &DAG) { 1541 if (DAG.getDataLayout().isLittleEndian()) { 1542 if (ShuffleKind == 1) // unary 1543 return isVMerge(N, UnitSize, 0, 0); 1544 else if (ShuffleKind == 2) // swapped 1545 return isVMerge(N, UnitSize, 0, 16); 1546 else 1547 return false; 1548 } else { 1549 if (ShuffleKind == 1) // unary 1550 return isVMerge(N, UnitSize, 8, 8); 1551 else if (ShuffleKind == 0) // normal 1552 return isVMerge(N, UnitSize, 8, 24); 1553 else 1554 return false; 1555 } 1556 } 1557 1558 /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for 1559 /// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes). 
/// The ShuffleKind distinguishes between big-endian merges with two
/// different inputs (0), either-endian merges with two identical inputs (1),
/// and little-endian merges with two different inputs (2). For the latter,
/// the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                             unsigned ShuffleKind, SelectionDAG &DAG) {
  // Note the offsets are mirrored relative to isVMRGLShuffleMask: on LE the
  // "high" halves sit at the start offsets a BE VMRGL would use, and vice
  // versa, because LE element numbering is reversed.
  if (DAG.getDataLayout().isLittleEndian()) {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 8, 8);
    else if (ShuffleKind == 2) // swapped
      return isVMerge(N, UnitSize, 8, 24);
    else
      return false;
  } else {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 0, 0);
    else if (ShuffleKind == 0) // normal
      return isVMerge(N, UnitSize, 0, 16);
    else
      return false;
  }
}

/**
 * Common function used to match vmrgew and vmrgow shuffles
 *
 * The indexOffset determines whether to look for even or odd words in
 * the shuffle mask. This is based on the endianness of the target
 * machine.
 *   - Little Endian:
 *     - Use offset of 0 to check for odd elements
 *     - Use offset of 4 to check for even elements
 *   - Big Endian:
 *     - Use offset of 0 to check for even elements
 *     - Use offset of 4 to check for odd elements
 * A detailed description of the vector element ordering for little endian and
 * big endian can be found at
 * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html
 * Targeting your applications - what little endian and big endian IBM XL C/C++
 * compiler differences mean to you
 *
 * The mask to the shuffle vector instruction specifies the indices of the
 * elements from the two input vectors to place in the result. The elements
 * are numbered in array-access order, starting with the first vector.
 * These vectors are always of type v16i8, thus each vector will contain
 * 16 elements of size 8 bits. More info on the shuffle vector can be found in
 * the
 * http://llvm.org/docs/LangRef.html#shufflevector-instruction
 * Language Reference.
 *
 * The RHSStartValue indicates whether the same input vectors are used (unary)
 * or two different input vectors are used, based on the following:
 *   - If the instruction uses the same vector for both inputs, the range of
 *     the indices will be 0 to 15. In this case, the RHSStart value passed
 *     should be 0.
 *   - If the instruction has two different vectors then the range of the
 *     indices will be 0 to 31. In this case, the RHSStart value passed should
 *     be 16 (indices 0-15 specify elements in the first vector while indices
 *     16 to 31 specify elements in the second vector).
 *
 * \param[in] N The shuffle vector SD Node to analyze
 * \param[in] IndexOffset Specifies whether to look for even or odd elements
 * \param[in] RHSStartValue Specifies the starting index for the righthand
 *            input vector to the shuffle_vector instruction
 * \return true iff this shuffle vector represents an even or odd word merge
 */
static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
                     unsigned RHSStartValue) {
  if (N->getValueType(0) != MVT::v16i8)
    return false;

  // Check the word at IndexOffset in each half of each input: result bytes
  // i*4+j come from the LHS/RHS word (i selects the input via RHSStartValue)
  // and result bytes i*4+j+8 come from the same input's second word.
  for (unsigned i = 0; i < 2; ++i)
    for (unsigned j = 0; j < 4; ++j)
      if (!isConstantOrUndef(N->getMaskElt(i*4+j),
                             i*RHSStartValue+j+IndexOffset) ||
          !isConstantOrUndef(N->getMaskElt(i*4+j+8),
                             i*RHSStartValue+j+IndexOffset+8))
        return false;
  return true;
}

/**
 * Determine if the specified shuffle mask is suitable for the vmrgew or
 * vmrgow instructions.
1643 * 1644 * \param[in] N The shuffle vector SD Node to analyze 1645 * \param[in] CheckEven Check for an even merge (true) or an odd merge (false) 1646 * \param[in] ShuffleKind Identify the type of merge: 1647 * - 0 = big-endian merge with two different inputs; 1648 * - 1 = either-endian merge with two identical inputs; 1649 * - 2 = little-endian merge with two different inputs (inputs are swapped for 1650 * little-endian merges). 1651 * \param[in] DAG The current SelectionDAG 1652 * \return true iff this shuffle mask 1653 */ 1654 bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, 1655 unsigned ShuffleKind, SelectionDAG &DAG) { 1656 if (DAG.getDataLayout().isLittleEndian()) { 1657 unsigned indexOffset = CheckEven ? 4 : 0; 1658 if (ShuffleKind == 1) // Unary 1659 return isVMerge(N, indexOffset, 0); 1660 else if (ShuffleKind == 2) // swapped 1661 return isVMerge(N, indexOffset, 16); 1662 else 1663 return false; 1664 } 1665 else { 1666 unsigned indexOffset = CheckEven ? 0 : 4; 1667 if (ShuffleKind == 1) // Unary 1668 return isVMerge(N, indexOffset, 0); 1669 else if (ShuffleKind == 0) // Normal 1670 return isVMerge(N, indexOffset, 16); 1671 else 1672 return false; 1673 } 1674 return false; 1675 } 1676 1677 /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift 1678 /// amount, otherwise return -1. 1679 /// The ShuffleKind distinguishes between big-endian operations with two 1680 /// different inputs (0), either-endian operations with two identical inputs 1681 /// (1), and little-endian operations with two different inputs (2). For the 1682 /// latter, the input operands are swapped (see PPCInstrAltivec.td). 1683 int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, 1684 SelectionDAG &DAG) { 1685 if (N->getValueType(0) != MVT::v16i8) 1686 return -1; 1687 1688 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 1689 1690 // Find the first non-undef value in the shuffle mask. 
1691 unsigned i; 1692 for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i) 1693 /*search*/; 1694 1695 if (i == 16) return -1; // all undef. 1696 1697 // Otherwise, check to see if the rest of the elements are consecutively 1698 // numbered from this value. 1699 unsigned ShiftAmt = SVOp->getMaskElt(i); 1700 if (ShiftAmt < i) return -1; 1701 1702 ShiftAmt -= i; 1703 bool isLE = DAG.getDataLayout().isLittleEndian(); 1704 1705 if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) { 1706 // Check the rest of the elements to see if they are consecutive. 1707 for (++i; i != 16; ++i) 1708 if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) 1709 return -1; 1710 } else if (ShuffleKind == 1) { 1711 // Check the rest of the elements to see if they are consecutive. 1712 for (++i; i != 16; ++i) 1713 if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) 1714 return -1; 1715 } else 1716 return -1; 1717 1718 if (isLE) 1719 ShiftAmt = 16 - ShiftAmt; 1720 1721 return ShiftAmt; 1722 } 1723 1724 /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand 1725 /// specifies a splat of a single element that is suitable for input to 1726 /// VSPLTB/VSPLTH/VSPLTW. 1727 bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { 1728 assert(N->getValueType(0) == MVT::v16i8 && 1729 (EltSize == 1 || EltSize == 2 || EltSize == 4)); 1730 1731 // The consecutive indices need to specify an element, not part of two 1732 // different elements. So abandon ship early if this isn't the case. 1733 if (N->getMaskElt(0) % EltSize != 0) 1734 return false; 1735 1736 // This is a splat operation if each element of the permute is the same, and 1737 // if the value doesn't reference the second vector. 1738 unsigned ElementBase = N->getMaskElt(0); 1739 1740 // FIXME: Handle UNDEF elements too! 
1741 if (ElementBase >= 16) 1742 return false; 1743 1744 // Check that the indices are consecutive, in the case of a multi-byte element 1745 // splatted with a v16i8 mask. 1746 for (unsigned i = 1; i != EltSize; ++i) 1747 if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase)) 1748 return false; 1749 1750 for (unsigned i = EltSize, e = 16; i != e; i += EltSize) { 1751 if (N->getMaskElt(i) < 0) continue; 1752 for (unsigned j = 0; j != EltSize; ++j) 1753 if (N->getMaskElt(i+j) != N->getMaskElt(j)) 1754 return false; 1755 } 1756 return true; 1757 } 1758 1759 /// Check that the mask is shuffling N byte elements. Within each N byte 1760 /// element of the mask, the indices could be either in increasing or 1761 /// decreasing order as long as they are consecutive. 1762 /// \param[in] N the shuffle vector SD Node to analyze 1763 /// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/ 1764 /// Word/DoubleWord/QuadWord). 1765 /// \param[in] StepLen the delta indices number among the N byte element, if 1766 /// the mask is in increasing/decreasing order then it is 1/-1. 1767 /// \return true iff the mask is shuffling N byte elements. 
1768 static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width, 1769 int StepLen) { 1770 assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) && 1771 "Unexpected element width."); 1772 assert((StepLen == 1 || StepLen == -1) && "Unexpected element width."); 1773 1774 unsigned NumOfElem = 16 / Width; 1775 unsigned MaskVal[16]; // Width is never greater than 16 1776 for (unsigned i = 0; i < NumOfElem; ++i) { 1777 MaskVal[0] = N->getMaskElt(i * Width); 1778 if ((StepLen == 1) && (MaskVal[0] % Width)) { 1779 return false; 1780 } else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) { 1781 return false; 1782 } 1783 1784 for (unsigned int j = 1; j < Width; ++j) { 1785 MaskVal[j] = N->getMaskElt(i * Width + j); 1786 if (MaskVal[j] != MaskVal[j-1] + StepLen) { 1787 return false; 1788 } 1789 } 1790 } 1791 1792 return true; 1793 } 1794 1795 bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, 1796 unsigned &InsertAtByte, bool &Swap, bool IsLE) { 1797 if (!isNByteElemShuffleMask(N, 4, 1)) 1798 return false; 1799 1800 // Now we look at mask elements 0,4,8,12 1801 unsigned M0 = N->getMaskElt(0) / 4; 1802 unsigned M1 = N->getMaskElt(4) / 4; 1803 unsigned M2 = N->getMaskElt(8) / 4; 1804 unsigned M3 = N->getMaskElt(12) / 4; 1805 unsigned LittleEndianShifts[] = { 2, 1, 0, 3 }; 1806 unsigned BigEndianShifts[] = { 3, 0, 1, 2 }; 1807 1808 // Below, let H and L be arbitrary elements of the shuffle mask 1809 // where H is in the range [4,7] and L is in the range [0,3]. 1810 // H, 1, 2, 3 or L, 5, 6, 7 1811 if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) || 1812 (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) { 1813 ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3]; 1814 InsertAtByte = IsLE ? 12 : 0; 1815 Swap = M0 < 4; 1816 return true; 1817 } 1818 // 0, H, 2, 3 or 4, L, 6, 7 1819 if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) || 1820 (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) { 1821 ShiftElts = IsLE ? 
LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3]; 1822 InsertAtByte = IsLE ? 8 : 4; 1823 Swap = M1 < 4; 1824 return true; 1825 } 1826 // 0, 1, H, 3 or 4, 5, L, 7 1827 if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) || 1828 (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) { 1829 ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3]; 1830 InsertAtByte = IsLE ? 4 : 8; 1831 Swap = M2 < 4; 1832 return true; 1833 } 1834 // 0, 1, 2, H or 4, 5, 6, L 1835 if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) || 1836 (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) { 1837 ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3]; 1838 InsertAtByte = IsLE ? 0 : 12; 1839 Swap = M3 < 4; 1840 return true; 1841 } 1842 1843 // If both vector operands for the shuffle are the same vector, the mask will 1844 // contain only elements from the first one and the second one will be undef. 1845 if (N->getOperand(1).isUndef()) { 1846 ShiftElts = 0; 1847 Swap = true; 1848 unsigned XXINSERTWSrcElem = IsLE ? 2 : 1; 1849 if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) { 1850 InsertAtByte = IsLE ? 12 : 0; 1851 return true; 1852 } 1853 if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) { 1854 InsertAtByte = IsLE ? 8 : 4; 1855 return true; 1856 } 1857 if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) { 1858 InsertAtByte = IsLE ? 4 : 8; 1859 return true; 1860 } 1861 if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) { 1862 InsertAtByte = IsLE ? 0 : 12; 1863 return true; 1864 } 1865 } 1866 1867 return false; 1868 } 1869 1870 bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, 1871 bool &Swap, bool IsLE) { 1872 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); 1873 // Ensure each byte index of the word is consecutive. 1874 if (!isNByteElemShuffleMask(N, 4, 1)) 1875 return false; 1876 1877 // Now we look at mask elements 0,4,8,12, which are the beginning of words. 
1878 unsigned M0 = N->getMaskElt(0) / 4; 1879 unsigned M1 = N->getMaskElt(4) / 4; 1880 unsigned M2 = N->getMaskElt(8) / 4; 1881 unsigned M3 = N->getMaskElt(12) / 4; 1882 1883 // If both vector operands for the shuffle are the same vector, the mask will 1884 // contain only elements from the first one and the second one will be undef. 1885 if (N->getOperand(1).isUndef()) { 1886 assert(M0 < 4 && "Indexing into an undef vector?"); 1887 if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4) 1888 return false; 1889 1890 ShiftElts = IsLE ? (4 - M0) % 4 : M0; 1891 Swap = false; 1892 return true; 1893 } 1894 1895 // Ensure each word index of the ShuffleVector Mask is consecutive. 1896 if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8) 1897 return false; 1898 1899 if (IsLE) { 1900 if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) { 1901 // Input vectors don't need to be swapped if the leading element 1902 // of the result is one of the 3 left elements of the second vector 1903 // (or if there is no shift to be done at all). 1904 Swap = false; 1905 ShiftElts = (8 - M0) % 8; 1906 } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) { 1907 // Input vectors need to be swapped if the leading element 1908 // of the result is one of the 3 left elements of the first vector 1909 // (or if we're shifting by 4 - thereby simply swapping the vectors). 1910 Swap = true; 1911 ShiftElts = (4 - M0) % 4; 1912 } 1913 1914 return true; 1915 } else { // BE 1916 if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) { 1917 // Input vectors don't need to be swapped if the leading element 1918 // of the result is one of the 4 elements of the first vector. 1919 Swap = false; 1920 ShiftElts = M0; 1921 } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) { 1922 // Input vectors need to be swapped if the leading element 1923 // of the result is one of the 4 elements of the right vector. 
1924 Swap = true; 1925 ShiftElts = M0 - 4; 1926 } 1927 1928 return true; 1929 } 1930 } 1931 1932 bool static isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) { 1933 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); 1934 1935 if (!isNByteElemShuffleMask(N, Width, -1)) 1936 return false; 1937 1938 for (int i = 0; i < 16; i += Width) 1939 if (N->getMaskElt(i) != i + Width - 1) 1940 return false; 1941 1942 return true; 1943 } 1944 1945 bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) { 1946 return isXXBRShuffleMaskHelper(N, 2); 1947 } 1948 1949 bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) { 1950 return isXXBRShuffleMaskHelper(N, 4); 1951 } 1952 1953 bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) { 1954 return isXXBRShuffleMaskHelper(N, 8); 1955 } 1956 1957 bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) { 1958 return isXXBRShuffleMaskHelper(N, 16); 1959 } 1960 1961 /// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap 1962 /// if the inputs to the instruction should be swapped and set \p DM to the 1963 /// value for the immediate. 1964 /// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI 1965 /// AND element 0 of the result comes from the first input (LE) or second input 1966 /// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered. 1967 /// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle 1968 /// mask. 1969 bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM, 1970 bool &Swap, bool IsLE) { 1971 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); 1972 1973 // Ensure each byte index of the double word is consecutive. 
1974 if (!isNByteElemShuffleMask(N, 8, 1)) 1975 return false; 1976 1977 unsigned M0 = N->getMaskElt(0) / 8; 1978 unsigned M1 = N->getMaskElt(8) / 8; 1979 assert(((M0 | M1) < 4) && "A mask element out of bounds?"); 1980 1981 // If both vector operands for the shuffle are the same vector, the mask will 1982 // contain only elements from the first one and the second one will be undef. 1983 if (N->getOperand(1).isUndef()) { 1984 if ((M0 | M1) < 2) { 1985 DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1); 1986 Swap = false; 1987 return true; 1988 } else 1989 return false; 1990 } 1991 1992 if (IsLE) { 1993 if (M0 > 1 && M1 < 2) { 1994 Swap = false; 1995 } else if (M0 < 2 && M1 > 1) { 1996 M0 = (M0 + 2) % 4; 1997 M1 = (M1 + 2) % 4; 1998 Swap = true; 1999 } else 2000 return false; 2001 2002 // Note: if control flow comes here that means Swap is already set above 2003 DM = (((~M1) & 1) << 1) + ((~M0) & 1); 2004 return true; 2005 } else { // BE 2006 if (M0 < 2 && M1 > 1) { 2007 Swap = false; 2008 } else if (M0 > 1 && M1 < 2) { 2009 M0 = (M0 + 2) % 4; 2010 M1 = (M1 + 2) % 4; 2011 Swap = true; 2012 } else 2013 return false; 2014 2015 // Note: if control flow comes here that means Swap is already set above 2016 DM = (M0 << 1) + (M1 & 1); 2017 return true; 2018 } 2019 } 2020 2021 2022 /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the 2023 /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. 
unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
                                SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  assert(isSplatShuffleMask(SVOp, EltSize));
  // On LE targets the element numbering is reversed, so the splat index
  // must be mirrored within the vector.
  if (DAG.getDataLayout().isLittleEndian())
    return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
  else
    return SVOp->getMaskElt(0) / EltSize;
}

/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
/// by using a vspltis[bhw] instruction of the specified element size, return
/// the constant being splatted.  The ByteSize field indicates the number of
/// bytes of each element [124] -> [bhw].
SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
  SDValue OpVal(nullptr, 0);

  // If ByteSize of the splat is bigger than the element size of the
  // build_vector, then we have a case where we are checking for a splat where
  // multiple elements of the buildvector are folded together into a single
  // logical element of the splat (e.g. "vspltish 1" to splat {0,1}*8).
  unsigned EltSize = 16/N->getNumOperands();
  if (EltSize < ByteSize) {
    unsigned Multiple = ByteSize/EltSize;   // Number of BV entries per spltval.
    SDValue UniquedVals[4];
    assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");

    // See if all of the elements in the buildvector agree across.
    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
      if (N->getOperand(i).isUndef()) continue;
      // If the element isn't a constant, bail fully out.
      if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();

      if (!UniquedVals[i&(Multiple-1)].getNode())
        UniquedVals[i&(Multiple-1)] = N->getOperand(i);
      else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
        return SDValue();  // no match.
    }

    // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
    // either constant or undef values that are identical for each chunk. See
    // if these chunks can form into a larger vspltis*.

    // Check to see if all of the leading entries are either 0 or -1. If
    // neither, then this won't fit into the immediate field.
    bool LeadingZero = true;
    bool LeadingOnes = true;
    for (unsigned i = 0; i != Multiple-1; ++i) {
      if (!UniquedVals[i].getNode()) continue;  // Must have been undefs.

      LeadingZero &= isNullConstant(UniquedVals[i]);
      LeadingOnes &= isAllOnesConstant(UniquedVals[i]);
    }
    // Finally, check the least significant entry.
    if (LeadingZero) {
      if (!UniquedVals[Multiple-1].getNode())
        return DAG.getTargetConstant(0, SDLoc(N), MVT::i32);  // 0,0,0,undef
      int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
      if (Val < 16)                                   // 0,0,0,4 -> vspltisw(4)
        return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
    }
    if (LeadingOnes) {
      if (!UniquedVals[Multiple-1].getNode())
        return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef
      int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
      if (Val >= -16)                            // -1,-1,-1,-2 -> vspltisw(-2)
        return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
    }

    return SDValue();
  }

  // Check to see if this buildvec has a single non-undef value in its
  // elements.
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    if (N->getOperand(i).isUndef()) continue;
    if (!OpVal.getNode())
      OpVal = N->getOperand(i);
    else if (OpVal != N->getOperand(i))
      return SDValue();
  }

  if (!OpVal.getNode()) return SDValue();  // All UNDEF: use implicit def.

  unsigned ValSizeInBytes = EltSize;
  uint64_t Value = 0;
  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
    Value = CN->getZExtValue();
  } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
    assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
    Value = FloatToBits(CN->getValueAPF().convertToFloat());
  }

  // If the splat value is larger than the element value, then we can never do
  // this splat.  The only case that we could fit the replicated bits into our
  // immediate field for would be zero, and we prefer to use vxor for it.
  if (ValSizeInBytes < ByteSize) return SDValue();

  // If the element value is larger than the splat value, check if it consists
  // of a repeated bit pattern of size ByteSize.
  if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8))
    return SDValue();

  // Properly sign extend the value.
  int MaskVal = SignExtend32(Value, ByteSize * 8);

  // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
  if (MaskVal == 0) return SDValue();

  // Finally, if this value fits in a 5 bit sext field, return it
  if (SignExtend32<5>(MaskVal) == MaskVal)
    return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32);
  return SDValue();
}

/// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the
/// shift amount, otherwise return -1.
int PPC::isQVALIGNIShuffleMask(SDNode *N) {
  // QPX vectors have four elements.
  EVT VT = N->getValueType(0);
  if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1)
    return -1;

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);

  // Find the first non-undef value in the shuffle mask.
  unsigned i;
  for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i)
    /*search*/;

  if (i == 4) return -1;  // all undef.

  // Otherwise, check to see if the rest of the elements are consecutively
  // numbered from this value.
  unsigned ShiftAmt = SVOp->getMaskElt(i);
  if (ShiftAmt < i) return -1;
  ShiftAmt -= i;

  // Check the rest of the elements to see if they are consecutive.
  for (++i; i != 4; ++i)
    if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
      return -1;

  return ShiftAmt;
}

//===----------------------------------------------------------------------===//
//  Addressing Mode Selection
//===----------------------------------------------------------------------===//

/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
/// or 64-bit immediate, and if the value can be accurately represented as a
/// sign extension from a 16-bit value.  If so, this returns true and the
/// immediate.
bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
  if (!isa<ConstantSDNode>(N))
    return false;

  // Truncate to 16 bits, then verify that sign-extending the truncated
  // value reproduces the original constant.
  Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue();
  if (N->getValueType(0) == MVT::i32)
    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
  else
    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
}

/// Convenience overload of the above taking an SDValue.
bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
  return isIntS16Immediate(Op.getNode(), Imm);
}

/// SelectAddressRegReg - Given the specified address, check to see if it
/// can be represented as an indexed [r+r] operation.  Returns false if it
/// can be more efficiently represented with [r+imm].
bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
                                            SDValue &Index,
                                            SelectionDAG &DAG) const {
  int16_t imm = 0;
  if (N.getOpcode() == ISD::ADD) {
    // An add of a 16-bit signed immediate (or of a Lo part of an address)
    // is better handled by the [r+imm] form.
    if (isIntS16Immediate(N.getOperand(1), imm))
      return false;    // r+i
    if (N.getOperand(1).getOpcode() == PPCISD::Lo)
      return false;    // r+i

    Base = N.getOperand(0);
    Index = N.getOperand(1);
    return true;
  } else if (N.getOpcode() == ISD::OR) {
    if (isIntS16Immediate(N.getOperand(1), imm))
      return false;    // r+i can fold it if we can.

    // If this is an or of disjoint bitfields, we can codegen this as an add
    // (for better address arithmetic) if the LHS and RHS of the OR are
    // provably disjoint.
    KnownBits LHSKnown, RHSKnown;
    DAG.computeKnownBits(N.getOperand(0), LHSKnown);

    if (LHSKnown.Zero.getBoolValue()) {
      DAG.computeKnownBits(N.getOperand(1), RHSKnown);
      // If all of the bits are known zero on the LHS or RHS, the add won't
      // carry.
      if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) {
        Base = N.getOperand(0);
        Index = N.getOperand(1);
        return true;
      }
    }
  }

  return false;
}

// If we happen to be doing an i64 load or store into a stack slot that has
// less than a 4-byte alignment, then the frame-index elimination may need to
// use an indexed load or store instruction (because the offset may not be a
// multiple of 4). The extra register needed to hold the offset comes from the
// register scavenger, and it is possible that the scavenger will need to use
// an emergency spill slot. As a result, we need to make sure that a spill slot
// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
// stack slot.
static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
  // FIXME: This does not handle the LWA case.
  if (VT != MVT::i64)
    return;

  // NOTE: We'll exclude negative FIs here, which come from argument
  // lowering, because there are no known test cases triggering this problem
  // using packed structures (or similar). We can remove this exclusion if
  // we find such a test case. The reason why this is so test-case driven is
  // because this entire 'fixup' is only to prevent crashes (from the
  // register scavenger) on not-really-valid inputs. For example, if we have:
  //   %a = alloca i1
  //   %b = bitcast i1* %a to i64*
  //   store i64* a, i64 b
  // then the store should really be marked as 'align 1', but is not. If it
  // were marked as 'align 1' then the indexed form would have been
  // instruction-selected initially, and the problem this 'fixup' is preventing
  // won't happen regardless.
  if (FrameIdx < 0)
    return;

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  unsigned Align = MFI.getObjectAlignment(FrameIdx);
  if (Align >= 4)
    return;

  // Record that this function may need an emergency spill slot.
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setHasNonRISpills();
}

/// Returns true if the address N can be represented by a base register plus
/// a signed 16-bit displacement [r+imm], and if it is not better
/// represented as reg+reg.  If \p Alignment is non-zero, only accept
/// displacements that are multiples of that value.
bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
                                            SDValue &Base,
                                            SelectionDAG &DAG,
                                            unsigned Alignment) const {
  // FIXME dl should come from parent load or store, not from address
  SDLoc dl(N);
  // If this can be more profitably realized as r+r, fail.
  if (SelectAddressRegReg(N, Disp, Base, DAG))
    return false;

  if (N.getOpcode() == ISD::ADD) {
    int16_t imm = 0;
    if (isIntS16Immediate(N.getOperand(1), imm) &&
        (!Alignment || (imm % Alignment) == 0)) {
      Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
        fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
      } else {
        Base = N.getOperand(0);
      }
      return true; // [r+i]
    } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
      // Match LOAD (ADD (X, Lo(G))).
      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
             && "Cannot handle constant offsets yet!");
      Disp = N.getOperand(1).getOperand(0);  // The global address.
      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
             Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
             Disp.getOpcode() == ISD::TargetConstantPool ||
             Disp.getOpcode() == ISD::TargetJumpTable);
      Base = N.getOperand(0);
      return true;  // [&g+r]
    }
  } else if (N.getOpcode() == ISD::OR) {
    int16_t imm = 0;
    if (isIntS16Immediate(N.getOperand(1), imm) &&
        (!Alignment || (imm % Alignment) == 0)) {
      // If this is an or of disjoint bitfields, we can codegen this as an add
      // (for better address arithmetic) if the LHS and RHS of the OR are
      // provably disjoint.
      KnownBits LHSKnown;
      DAG.computeKnownBits(N.getOperand(0), LHSKnown);

      if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
        // If all of the bits are known zero on the LHS or RHS, the add won't
        // carry.
        if (FrameIndexSDNode *FI =
              dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
          Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
          fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
        } else {
          Base = N.getOperand(0);
        }
        Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
        return true;
      }
    }
  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
    // Loading from a constant address.

    // If this address fits entirely in a 16-bit sext immediate field, codegen
    // this as "d, 0"
    int16_t Imm;
    if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) {
      Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
      Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                             CN->getValueType(0));
      return true;
    }

    // Handle 32-bit sext immediates with LIS + addr mode.
    if ((CN->getValueType(0) == MVT::i32 ||
         (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
        (!Alignment || (CN->getZExtValue() % Alignment) == 0)) {
      int Addr = (int)CN->getZExtValue();

      // Otherwise, break this down into an LIS + disp.
      Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32);

      // The high halfword, adjusted for the sign of the low halfword so the
      // two parts add back up to Addr.
      Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl,
                                   MVT::i32);
      unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
      Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
      return true;
    }
  }

  // Fall back to treating the whole node as the base with a zero
  // displacement.
  Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout()));
  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
    Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
    fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
  } else
    Base = N;
  return true;      // [r+0]
}

/// SelectAddressRegRegOnly - Given the specified address, force it to be
/// represented as an indexed [r+r] operation.
bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
                                                SDValue &Index,
                                                SelectionDAG &DAG) const {
  // Check to see if we can easily represent this as an [r+r] address.  This
  // will fail if it thinks that the address is more profitably represented as
  // reg+imm, e.g. where imm = 0.
  if (SelectAddressRegReg(N, Base, Index, DAG))
    return true;

  // If the address is the result of an add, we will utilize the fact that the
  // address calculation includes an implicit add.  However, we can reduce
  // register pressure if we do not materialize a constant just for use as the
  // index register.  We only get rid of the add if it is not an add of a
  // value and a 16-bit signed constant and both have a single use.
  int16_t imm = 0;
  if (N.getOpcode() == ISD::ADD &&
      (!isIntS16Immediate(N.getOperand(1), imm) ||
       !N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) {
    Base = N.getOperand(0);
    Index = N.getOperand(1);
    return true;
  }

  // Otherwise, do it the hard way, using R0 as the base register.
  Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                         N.getValueType());
  Index = N;
  return true;
}

/// Returns true if we should use a direct load into vector instruction
/// (such as lxsd or lfd), instead of a load into gpr + direct move sequence.
static bool usePartialVectorLoads(SDNode *N) {
  if (!N->hasOneUse())
    return false;

  // If there are any other uses other than scalar to vector, then we should
  // keep it as a scalar load -> direct move pattern to prevent multiple
  // loads.  Currently, only check for i64 since we have lxsd/lfd to do this
  // efficiently, but no update equivalent.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    EVT MemVT = LD->getMemoryVT();
    if (MemVT.isSimple() && MemVT.getSimpleVT().SimpleTy == MVT::i64) {
      SDNode *User = *(LD->use_begin());
      if (User->getOpcode() == ISD::SCALAR_TO_VECTOR)
        return true;
    }
  }

  return false;
}

/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                  SDValue &Offset,
                                                  ISD::MemIndexedMode &AM,
                                                  SelectionDAG &DAG) const {
  if (DisablePPCPreinc) return false;

  // Extract the pointer, memory VT and alignment from either a load or a
  // store; anything else cannot be pre-incremented.
  bool isLoad = true;
  SDValue Ptr;
  EVT VT;
  unsigned Alignment;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    Ptr = LD->getBasePtr();
    VT = LD->getMemoryVT();
    Alignment = LD->getAlignment();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    Ptr = ST->getBasePtr();
    VT = ST->getMemoryVT();
    Alignment = ST->getAlignment();
    isLoad = false;
  } else
    return false;

  // Do not generate pre-inc forms for specific loads that feed scalar_to_vector
  // instructions because we can fold these into a more efficient instruction
  // instead, (such as LXSD).
  if (isLoad && usePartialVectorLoads(N)) {
    return false;
  }

  // PowerPC doesn't have preinc load/store instructions for vectors (except
  // for QPX, which does have preinc r+r forms).
  if (VT.isVector()) {
    if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) {
      return false;
    } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) {
      AM = ISD::PRE_INC;
      return true;
    }
  }

  // Prefer the r+r (indexed) pre-inc form when the address splits that way.
  if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
    // Common code will reject creating a pre-inc form if the base pointer
    // is a frame index, or if N is a store and the base pointer is either
    // the same as or a predecessor of the value being stored. Check for
    // those situations here, and try with swapped Base/Offset instead.
    bool Swap = false;

    if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base))
      Swap = true;
    else if (!isLoad) {
      SDValue Val = cast<StoreSDNode>(N)->getValue();
      if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode()))
        Swap = true;
    }

    if (Swap)
      std::swap(Base, Offset);

    AM = ISD::PRE_INC;
    return true;
  }

  // Otherwise try the r+imm pre-inc form.
  // LDU/STU can only handle immediates that are a multiple of 4.
  if (VT != MVT::i64) {
    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0))
      return false;
  } else {
    // LDU/STU need an address with at least 4-byte alignment.
    if (Alignment < 4)
      return false;

    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4))
      return false;
  }

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    // PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of
    // sext i32 to i64 when addr mode is r+i.
    if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
        LD->getExtensionType() == ISD::SEXTLOAD &&
        isa<ConstantSDNode>(Offset))
      return false;
  }

  AM = ISD::PRE_INC;
  return true;
}

//===----------------------------------------------------------------------===//
//  LowerOperation implementation
//===----------------------------------------------------------------------===//

/// Compute the target MO flags (HiOpFlags/LoOpFlags) used to reference a
/// label or global: always HA/LO, plus the PIC flag when position-independent
/// and the non-lazy-ptr (and hidden-visibility) flags when the global needs a
/// lazy resolver stub.
static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
                               unsigned &HiOpFlags, unsigned &LoOpFlags,
                               const GlobalValue *GV = nullptr) {
  HiOpFlags = PPCII::MO_HA;
  LoOpFlags = PPCII::MO_LO;

  // Don't use the pic base if not in PIC relocation model.
  if (IsPIC) {
    HiOpFlags |= PPCII::MO_PIC_FLAG;
    LoOpFlags |= PPCII::MO_PIC_FLAG;
  }

  // If this is a reference to a global value that requires a non-lazy-ptr, make
  // sure that instruction lowering adds it.
  if (GV && Subtarget.hasLazyResolverStub(GV)) {
    HiOpFlags |= PPCII::MO_NLP_FLAG;
    LoOpFlags |= PPCII::MO_NLP_FLAG;

    if (GV->hasHiddenVisibility()) {
      HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
      LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
    }
  }
}

/// Combine a Hi/Lo pair of target symbol nodes into a full address:
/// (add (PPCISD::Hi hi, 0), (PPCISD::Lo lo, 0)), adding the PIC base
/// register to the Hi half when PIC.
static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
                             SelectionDAG &DAG) {
  SDLoc DL(HiPart);
  EVT PtrVT = HiPart.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, PtrVT);

  SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);
  SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);

  // With PIC, the first instruction is actually "GR+hi(&G)".
  if (isPIC)
    Hi = DAG.getNode(ISD::ADD, DL, PtrVT,
                     DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi);

  // Generate non-pic code that has direct accesses to the constant pool.
  // The address of the global is just (hi(&g)+lo(&g)).
  return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
}

/// Record on the function's PPCFunctionInfo that the TOC base pointer is
/// live in this function.
static void setUsesTOCBasePtr(MachineFunction &MF) {
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setUsesTOCBasePtr();
}

static void setUsesTOCBasePtr(SelectionDAG &DAG) {
  setUsesTOCBasePtr(DAG.getMachineFunction());
}

/// Build a PPCISD::TOC_ENTRY load of the TOC slot for GA. The base is X2
/// when 64-bit, otherwise the 32-bit global base register node.
static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit,
                           SDValue GA) {
  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
  SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) :
                DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);

  SDValue Ops[] = { GA, Reg };
  return DAG.getMemIntrinsicNode(
      PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0,
      MachineMemOperand::MOLoad);
}

/// Lower an ISD::ConstantPool node: through the TOC on 64-bit SVR4 (or the
/// GOT when 32-bit PIC SVR4), otherwise as a Hi/Lo pair of target
/// constant-pool references.
SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  const Constant *C = CP->getConstVal();

  // 64-bit SVR4 ABI code is always position-independent.
  // The actual address of the GlobalValue is stored in the TOC.
  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
    setUsesTOCBasePtr(DAG);
    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
    return getTOCEntry(DAG, SDLoc(CP), true, GA);
  }

  unsigned MOHiFlag, MOLoFlag;
  bool IsPIC = isPositionIndependent();
  getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);

  if (IsPIC && Subtarget.isSVR4ABI()) {
    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
                                           PPCII::MO_PIC_FLAG);
    return getTOCEntry(DAG, SDLoc(CP), false, GA);
  }

  SDValue CPIHi =
    DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag);
  SDValue CPILo =
    DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag);
  return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG);
}

// For 64-bit PowerPC, prefer the more compact relative encodings.
// This trades 32 bits per jump table entry for one or two instructions
// on the jump site.
unsigned PPCTargetLowering::getJumpTableEncoding() const {
  // Relative jump tables use 32-bit label differences instead of full
  // pointers; see the comment above about the size trade-off.
  if (isJumpTableRelative())
    return MachineJumpTableInfo::EK_LabelDifference32;

  return TargetLowering::getJumpTableEncoding();
}

bool PPCTargetLowering::isJumpTableRelative() const {
  // Always use relative jump tables on 64-bit PPC.
  if (Subtarget.isPPC64())
    return true;
  return TargetLowering::isJumpTableRelative();
}

SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.isPPC64())
    return TargetLowering::getPICJumpTableRelocBase(Table, DAG);

  switch (getTargetMachine().getCodeModel()) {
  case CodeModel::Small:
  case CodeModel::Medium:
    return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
  default:
    // Large code models rebase jump table entries off the global base reg.
    return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(),
                       getPointerTy(DAG.getDataLayout()));
  }
}

const MCExpr *
PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
                                                unsigned JTI,
                                                MCContext &Ctx) const {
  if (!Subtarget.isPPC64())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  switch (getTargetMachine().getCodeModel()) {
  case CodeModel::Small:
  case CodeModel::Medium:
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
  default:
    // Mirrors getPICJumpTableRelocBase: entries are relative to the PIC base.
    return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
  }
}

/// Lower an ISD::JumpTable node: TOC entry on 64-bit SVR4, GOT entry for
/// 32-bit PIC SVR4, otherwise a Hi/Lo address pair.
SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // 64-bit SVR4 ABI code is always position-independent.
  // The actual address of the GlobalValue is stored in the TOC.
  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
    setUsesTOCBasePtr(DAG);
    SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
    return getTOCEntry(DAG, SDLoc(JT), true, GA);
  }

  unsigned MOHiFlag, MOLoFlag;
  bool IsPIC = isPositionIndependent();
  getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);

  if (IsPIC && Subtarget.isSVR4ABI()) {
    SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
                                        PPCII::MO_PIC_FLAG);
    return getTOCEntry(DAG, SDLoc(GA), false, GA);
  }

  SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
  SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag);
  return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG);
}

/// Lower an ISD::BlockAddress node: TOC/GOT entry on SVR4 when 64-bit or
/// position-independent, otherwise a Hi/Lo address pair.
SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
  const BlockAddress *BA = BASDN->getBlockAddress();

  // 64-bit SVR4 ABI code is always position-independent.
  // The actual BlockAddress is stored in the TOC.
  if (Subtarget.isSVR4ABI() &&
      (Subtarget.isPPC64() || isPositionIndependent())) {
    if (Subtarget.isPPC64())
      setUsesTOCBasePtr(DAG);
    SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
    return getTOCEntry(DAG, SDLoc(BASDN), Subtarget.isPPC64(), GA);
  }

  unsigned MOHiFlag, MOLoFlag;
  bool IsPIC = isPositionIndependent();
  getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
  SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
  SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
  return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG);
}

/// Lower a TLS global address for each of the four TLS models
/// (LocalExec, InitialExec, GeneralDynamic, LocalDynamic).
SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
  // FIXME: TLS addresses currently use medium model code sequences,
  // which is the most useful form.  Eventually support for small and
  // large models could be added if users need it, at the cost of
  // additional complexity.
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  SDLoc dl(GA);
  const GlobalValue *GV = GA->getGlobal();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  bool is64bit = Subtarget.isPPC64();
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  PICLevel::Level picLevel = M->getPICLevel();

  TLSModel::Model Model = getTargetMachine().getTLSModel(GV);

  if (Model == TLSModel::LocalExec) {
    // addis/addi off the thread pointer (X13 on 64-bit, R2 on 32-bit).
    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
                                               PPCII::MO_TPREL_HA);
    SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
                                               PPCII::MO_TPREL_LO);
    SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64)
                             : DAG.getRegister(PPC::R2, MVT::i32);

    SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
    return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
  }

  if (Model == TLSModel::InitialExec) {
    // Load the thread-pointer offset from the GOT, then add the thread ptr.
    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
    SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
                                                PPCII::MO_TLS);
    SDValue GOTPtr;
    if (is64bit) {
      setUsesTOCBasePtr(DAG);
      SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
      GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
                           PtrVT, GOTReg, TGA);
    } else
      GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
    SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
                                   PtrVT, TGA, GOTPtr);
    return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
  }

  if (Model == TLSModel::GeneralDynamic) {
    // __tls_get_addr-style sequence via ADDI_TLSGD_L_ADDR.
    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
    SDValue GOTPtr;
    if (is64bit) {
      setUsesTOCBasePtr(DAG);
      SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
      GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
                           GOTReg, TGA);
    } else {
      if (picLevel == PICLevel::SmallPIC)
        GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
      else
        GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
    }
    return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT,
                       GOTPtr, TGA, TGA);
  }

  if (Model == TLSModel::LocalDynamic) {
    // Like GeneralDynamic for the module base, then add the DTPREL offset.
    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
    SDValue GOTPtr;
    if (is64bit) {
      setUsesTOCBasePtr(DAG);
      SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
      GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
                           GOTReg, TGA);
    } else {
      if (picLevel == PICLevel::SmallPIC)
        GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
      else
        GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
    }
    SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl,
                                  PtrVT, GOTPtr, TGA, TGA);
    SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl,
                                      PtrVT, TLSAddr, TGA);
    return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
  }

  llvm_unreachable("Unknown TLS model!");
}

/// Lower an ISD::GlobalAddress node: TOC entry on 64-bit SVR4, GOT entry
/// for 32-bit PIC SVR4, otherwise Hi/Lo plus an extra load when the global
/// is accessed through a non-lazy pointer.
SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
  SDLoc DL(GSDN);
  const GlobalValue *GV = GSDN->getGlobal();

  // 64-bit SVR4 ABI code is always position-independent.
  // The actual address of the GlobalValue is stored in the TOC.
  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
    setUsesTOCBasePtr(DAG);
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
    return getTOCEntry(DAG, DL, true, GA);
  }

  unsigned MOHiFlag, MOLoFlag;
  bool IsPIC = isPositionIndependent();
  getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV);

  if (IsPIC && Subtarget.isSVR4ABI()) {
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
                                            GSDN->getOffset(),
                                            PPCII::MO_PIC_FLAG);
    return getTOCEntry(DAG, DL, false, GA);
  }

  SDValue GAHi =
    DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag);
  SDValue GALo =
    DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);

  SDValue Ptr = LowerLabelRef(GAHi, GALo, IsPIC, DAG);

  // If the global reference is actually to a non-lazy-pointer, we have to do an
  // extra load to get the address of the global.
  if (MOHiFlag & PPCII::MO_NLP_FLAG)
    Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
  return Ptr;
}

/// Custom-lower ISD::SETCC: handle v2i64 specially (VSX has no v2i64
/// compares), and rewrite integer seteq/setne as a compare-to-zero of an
/// xor so later combines can exploit it.
SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  SDLoc dl(Op);

  if (Op.getValueType() == MVT::v2i64) {
    // When the operands themselves are v2i64 values, we need to do something
    // special because VSX has no underlying comparison operations for these.
    if (Op.getOperand(0).getValueType() == MVT::v2i64) {
      // Equality can be handled by casting to the legal type for Altivec
      // comparisons, everything else needs to be expanded.
      if (CC == ISD::SETEQ || CC == ISD::SETNE) {
        return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                 DAG.getSetCC(dl, MVT::v4i32,
                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)),
                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)),
                   CC));
      }

      return SDValue();
    }

    // We handle most of these in the usual way.
    return Op;
  }

  // If we're comparing for equality to zero, expose the fact that this is
  // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
  // fold the new nodes.
  if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
    return V;

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
    // Leave comparisons against 0 and -1 alone for now, since they're usually
    // optimized.  FIXME: revisit this when we can custom lower all setcc
    // optimizations.
    if (C->isAllOnesValue() || C->isNullValue())
      return SDValue();
  }

  // If we have an integer seteq/setne, turn it into a compare against zero
  // by xor'ing the rhs with the lhs, which is faster than setting a
  // condition register, reading it back out, and masking the correct bit.  The
  // normal approach here uses sub to do this instead of xor.  Using xor exposes
  // the result to other bit-twiddling opportunities.
  EVT LHSVT = Op.getOperand(0).getValueType();
  if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    EVT VT = Op.getValueType();
    SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0),
                                Op.getOperand(1));
    return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC);
  }
  return SDValue();
}

/// Custom-lower ISD::VAARG for the 32-bit SVR4 va_list layout: pick the
/// value either from the register save area (indexed by the gpr/fpr byte
/// counters) or from the overflow area, updating the counters and pointers.
SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  EVT VT = Node->getValueType(0);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue InChain = Node->getOperand(0);
  SDValue VAListPtr = Node->getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
  SDLoc dl(Node);

  assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");

  // gpr_index
  SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
                                    VAListPtr, MachinePointerInfo(SV), MVT::i8);
  InChain = GprIndex.getValue(1);

  if (VT == MVT::i64) {
    // Check if GprIndex is even
    SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex,
                                 DAG.getConstant(1, dl, MVT::i32));
    SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd,
                                DAG.getConstant(0, dl, MVT::i32), ISD::SETNE);
    SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex,
                                          DAG.getConstant(1, dl, MVT::i32));
    // Align GprIndex to be even if it isn't
    GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne,
                           GprIndex);
  }

  // fpr index is 1 byte after gpr
  SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
                               DAG.getConstant(1, dl, MVT::i32));

  // fpr
  SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
                                    FprPtr, MachinePointerInfo(SV), MVT::i8);
  InChain = FprIndex.getValue(1);

  SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
                                       DAG.getConstant(8, dl, MVT::i32));

  SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
                                        DAG.getConstant(4, dl, MVT::i32));

  // areas
  SDValue OverflowArea =
      DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo());
  InChain = OverflowArea.getValue(1);

  SDValue RegSaveArea =
      DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo());
  InChain = RegSaveArea.getValue(1);

  // select overflow_area if index > 8
  // (CC is true while the relevant index is still < 8, i.e. the value was
  // passed in a register.)
  SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex,
                            DAG.getConstant(8, dl, MVT::i32), ISD::SETLT);

  // adjustment constant gpr_index * 4/8
  SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32,
                                    VT.isInteger() ? GprIndex : FprIndex,
                                    DAG.getConstant(VT.isInteger() ? 4 : 8, dl,
                                                    MVT::i32));

  // OurReg = RegSaveArea + RegConstant
  SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea,
                               RegConstant);

  // Floating types are 32 bytes into RegSaveArea
  if (VT.isFloatingPoint())
    OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg,
                         DAG.getConstant(32, dl, MVT::i32));

  // increase {f,g}pr_index by 1 (or 2 if VT is i64)
  SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32,
                                   VT.isInteger() ? GprIndex : FprIndex,
                                   DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl,
                                                   MVT::i32));

  InChain = DAG.getTruncStore(InChain, dl, IndexPlus1,
                              VT.isInteger() ? VAListPtr : FprPtr,
                              MachinePointerInfo(SV), MVT::i8);

  // determine if we should load from reg_save_area or overflow_area
  SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea);

  // increase overflow_area by 4/8 if gpr/fpr > 8
  SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea,
                                          DAG.getConstant(VT.isInteger() ? 4 : 8,
                                                          dl, MVT::i32));

  OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
                             OverflowAreaPlusN);

  InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr,
                              MachinePointerInfo(), MVT::i32);

  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo());
}

/// Custom-lower ISD::VACOPY on 32-bit SVR4 by copying the whole 12-byte
/// va_list struct with a memcpy.
SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
  assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");

  // We have to copy the entire va_list struct:
  // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte
  return DAG.getMemcpy(Op.getOperand(0), Op,
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(12, SDLoc(Op), MVT::i32), 8, false, true,
                       false, MachinePointerInfo(), MachinePointerInfo());
}

/// No adjustment needed for PPC trampolines; just forward the chain.
SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  return Op.getOperand(0);
}

/// Lower ISD::INIT_TRAMPOLINE as a runtime call to __trampoline_setup.
SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl(Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  bool isPPC64 = (PtrVT == MVT::i64);
  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Ty = IntPtrTy;
  Entry.Node = Trmp; Args.push_back(Entry);

  // TrampSize == (isPPC64 ? 48 : 40);
  Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl,
                               isPPC64 ? MVT::i64 : MVT::i32);
  Args.push_back(Entry);

  Entry.Node = FPtr; Args.push_back(Entry);
  Entry.Node = Nest; Args.push_back(Entry);

  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
      DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.second;
}

/// Lower ISD::VASTART: a single frame-index store on Darwin/PPC64, or a
/// full initialization of the 32-bit SVR4 va_list struct described below.
SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  SDLoc dl(Op);

  if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
                        MachinePointerInfo(SV));
  }

  // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
  // We suppose the given va_list is already allocated.
  //
  // typedef struct {
  //  char gpr;     /* index into the array of 8 GPRs
  //                 * stored in the register save area
  //                 * gpr=0 corresponds to r3,
  //                 * gpr=1 to r4, etc.
  //                 */
  //  char fpr;     /* index into the array of 8 FPRs
  //                 * stored in the register save area
  //                 * fpr=0 corresponds to f1,
  //                 * fpr=1 to f2, etc.
  //                 */
  //  char *overflow_arg_area;
  //                /* location on stack that holds
  //                 * the next overflow argument
  //                 */
  //  char *reg_save_area;
  //                /* where r3:r10 and f1:f8 (if saved)
  //                 * are stored
  //                 */
  // } va_list[1];

  SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32);
  SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32);
  SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(),
                                            PtrVT);
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
                                 PtrVT);

  uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
  SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT);

  uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
  SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT);

  uint64_t FPROffset = 1;
  SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT);

  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

  // Store first byte : number of int regs
  SDValue firstStore =
      DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1),
                        MachinePointerInfo(SV), MVT::i8);
  uint64_t nextOffset = FPROffset;
  SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),
                                ConstFPROffset);

  // Store second byte : number of float regs
  SDValue secondStore =
      DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,
                        MachinePointerInfo(SV, nextOffset), MVT::i8);
  nextOffset += StackOffset;
  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);

  // Store second word : arguments given on stack
  SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,
                                    MachinePointerInfo(SV, nextOffset));
  nextOffset += FrameOffset;
  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);

  // Store third word : arguments given in registers
  return DAG.getStore(thirdStore, dl, FR, nextPtr,
                      MachinePointerInfo(SV, nextOffset));
}

#include "PPCGenCallingConv.inc"

// Function whose sole purpose is to kill compiler warnings
// stemming from unused functions included from PPCGenCallingConv.inc.
CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
  return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
}

// Custom CC hook that accepts the argument without allocating anything.
bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                                      CCValAssign::LocInfo &LocInfo,
                                      ISD::ArgFlagsTy &ArgFlags,
                                      CCState &State) {
  return true;
}

// Custom CC hook: burn one GPR if needed so the next allocation starts at an
// even register (required for arguments passed in aligned register pairs).
bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
                                             MVT &LocVT,
                                             CCValAssign::LocInfo &LocInfo,
                                             ISD::ArgFlagsTy &ArgFlags,
                                             CCState &State) {
  static const MCPhysReg ArgRegs[] = {
    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
  };
  const unsigned NumArgRegs = array_lengthof(ArgRegs);

  unsigned RegNum = State.getFirstUnallocated(ArgRegs);

  // Skip one register if the first unallocated register has an even register
  // number and there are still argument registers available which have not been
  // allocated yet. RegNum is actually an index into ArgRegs, which means we
  // need to skip a register if RegNum is odd.
  if (RegNum != NumArgRegs && RegNum % 2 == 1) {
    State.AllocateReg(ArgRegs[RegNum]);
  }

  // Always return false here, as this function only makes sure that the first
  // unallocated register has an odd register number and does not actually
  // allocate a register for the current argument.
  return false;
}

// Custom CC hook: if fewer than 4 GPRs remain for a (soft-float) long double,
// burn the remainder so the whole value goes on the stack.
bool
llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
                                                  MVT &LocVT,
                                                  CCValAssign::LocInfo &LocInfo,
                                                  ISD::ArgFlagsTy &ArgFlags,
                                                  CCState &State) {
  static const MCPhysReg ArgRegs[] = {
    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
  };
  const unsigned NumArgRegs = array_lengthof(ArgRegs);

  unsigned RegNum = State.getFirstUnallocated(ArgRegs);
  int RegsLeft = NumArgRegs - RegNum;

  // Skip if there is not enough registers left for long double type (4 gpr regs
  // in soft float mode) and put long double argument on the stack.
  if (RegNum != NumArgRegs && RegsLeft < 4) {
    for (int i = 0; i < RegsLeft; i++) {
      State.AllocateReg(ArgRegs[RegNum + i]);
    }
  }

  return false;
}

// Custom CC hook: burn F8 when it is the only FPR left, so both f64 halves
// of a split ppc_fp128 go the same way (registers or stack).
bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
                                               MVT &LocVT,
                                               CCValAssign::LocInfo &LocInfo,
                                               ISD::ArgFlagsTy &ArgFlags,
                                               CCState &State) {
  static const MCPhysReg ArgRegs[] = {
    PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
    PPC::F8
  };

  const unsigned NumArgRegs = array_lengthof(ArgRegs);

  unsigned RegNum = State.getFirstUnallocated(ArgRegs);

  // If there is only one Floating-point register left we need to put both f64
  // values of a split ppc_fp128 value on the stack.
  if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) {
    State.AllocateReg(ArgRegs[RegNum]);
  }

  // Always return false here, as this function only makes sure that the two f64
  // values a ppc_fp128 value is split into are both passed in registers or both
  // passed on the stack and does not actually allocate a register for the
  // current argument.
  return false;
}

/// FPR - The set of FP registers that should be allocated for arguments,
/// on Darwin.
static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5,
                                PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10,
                                PPC::F11, PPC::F12, PPC::F13};

/// QFPR - The set of QPX registers that should be allocated for arguments.
static const MCPhysReg QFPR[] = {
    PPC::QF1, PPC::QF2, PPC::QF3,  PPC::QF4,  PPC::QF5,  PPC::QF6, PPC::QF7,
    PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13};

/// CalculateStackSlotSize - Calculates the size reserved for this argument on
/// the stack.
static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
                                       unsigned PtrByteSize) {
  unsigned ArgSize = ArgVT.getStoreSize();
  // ByVal aggregates occupy their declared byval size, not the pointer size.
  if (Flags.isByVal())
    ArgSize = Flags.getByValSize();

  // Round up to multiples of the pointer size, except for array members,
  // which are always packed.
  if (!Flags.isInConsecutiveRegs())
    ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;

  return ArgSize;
}

/// CalculateStackSlotAlignment - Calculates the alignment of this argument
/// on the stack.
static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
                                            ISD::ArgFlagsTy Flags,
                                            unsigned PtrByteSize) {
  // Default: pointer-size alignment.
  unsigned Align = PtrByteSize;

  // Altivec parameters are padded to a 16 byte boundary.
  if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
      ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
      ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
      ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
    Align = 16;
  // QPX vector types stored in double-precision are padded to a 32 byte
  // boundary.
  else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1)
    Align = 32;

  // ByVal parameters are aligned as requested.
  if (Flags.isByVal()) {
    unsigned BVAlign = Flags.getByValAlign();
    if (BVAlign > PtrByteSize) {
      // A byval alignment above the pointer size must itself be a multiple
      // of the pointer size for the padding math elsewhere to work.
      if (BVAlign % PtrByteSize != 0)
        llvm_unreachable(
            "ByVal alignment is not a multiple of the pointer size");

      Align = BVAlign;
    }
  }

  // Array members are always packed to their original alignment.
  if (Flags.isInConsecutiveRegs()) {
    // If the array member was split into multiple registers, the first
    // needs to be aligned to the size of the full type.  (Except for
    // ppcf128, which is only aligned as its f64 components.)
    if (Flags.isSplit() && OrigVT != MVT::ppcf128)
      Align = OrigVT.getStoreSize();
    else
      Align = ArgVT.getStoreSize();
  }

  return Align;
}

/// CalculateStackSlotUsed - Return whether this argument will use its
/// stack slot (instead of being passed in registers).  ArgOffset,
/// AvailableFPRs, and AvailableVRs must hold the current argument
/// position, and will be updated to account for this argument.
static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
                                   ISD::ArgFlagsTy Flags,
                                   unsigned PtrByteSize,
                                   unsigned LinkageSize,
                                   unsigned ParamAreaSize,
                                   unsigned &ArgOffset,
                                   unsigned &AvailableFPRs,
                                   unsigned &AvailableVRs, bool HasQPX) {
  bool UseMemory = false;

  // Respect alignment of argument on the stack.
  unsigned Align =
    CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
  ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
  // If there's no space left in the argument save area, we must
  // use memory (this check also catches zero-sized arguments).
  if (ArgOffset >= LinkageSize + ParamAreaSize)
    UseMemory = true;

  // Allocate argument on the stack.
  ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
  if (Flags.isInConsecutiveRegsLast())
    ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
  // If we overran the argument save area, we must use memory
  // (this check catches arguments passed partially in memory)
  if (ArgOffset > LinkageSize + ParamAreaSize)
    UseMemory = true;

  // However, if the argument is actually passed in an FPR or a VR,
  // we don't use memory after all.
  if (!Flags.isByVal()) {
    if (ArgVT == MVT::f32 || ArgVT == MVT::f64 ||
        // QPX registers overlap with the scalar FP registers.
        (HasQPX && (ArgVT == MVT::v4f32 ||
                    ArgVT == MVT::v4f64 ||
                    ArgVT == MVT::v4i1)))
      if (AvailableFPRs > 0) {
        --AvailableFPRs;
        return false;
      }
    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
        ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
      if (AvailableVRs > 0) {
        --AvailableVRs;
        return false;
      }
  }

  return UseMemory;
}

/// EnsureStackAlignment - Round stack frame size up from NumBytes to
/// ensure minimum alignment required for target.
// Assumes TargetAlign is a power of two (required for the mask trick below).
static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
                                     unsigned NumBytes) {
  unsigned TargetAlign = Lowering->getStackAlignment();
  unsigned AlignMask = TargetAlign - 1;
  // Round NumBytes up to the next multiple of TargetAlign.
  NumBytes = (NumBytes + AlignMask) & ~AlignMask;
  return NumBytes;
}

/// Dispatch formal-argument lowering to the ABI-specific implementation:
/// 64-bit SVR4 (ELFv1/ELFv2), 32-bit SVR4, or Darwin.
SDValue PPCTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  if (Subtarget.isSVR4ABI()) {
    if (Subtarget.isPPC64())
      return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins,
                                         dl, DAG, InVals);
    else
      return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins,
                                         dl, DAG, InVals);
  } else {
    return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins,
                                       dl, DAG, InVals);
  }
}

SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {

  // 32-bit SVR4 ABI Stack Frame Layout:
  //              +-----------------------------------+
  //        +-->  |            Back chain             |
  //        |     +-----------------------------------+
  //        |     | Floating-point register save area |
  //        |     +-----------------------------------+
  //        |     |    General register save area     |
  //        |     +-----------------------------------+
  //        |     |          CR save word             |
  //        |     +-----------------------------------+
  //        |     |         VRSAVE save word          |
  //        |     +-----------------------------------+
  //        |     |         Alignment padding         |
  //        |     +-----------------------------------+
  //        |     |     Vector register save area     |
  //        |     +-----------------------------------+
  //        |     |       Local variable space        |
  //        |     +-----------------------------------+
  //        |     |        Parameter list area        |
  //        |     +-----------------------------------+
  //        |     |           LR save word            |
  //        |     +-----------------------------------+
  // SP-->  +---  |            Back chain             |
  //              +-----------------------------------+
  //
  // Specifications:
  //   System V Application Binary Interface PowerPC Processor Supplement
  //   AltiVec Technology Programming Interface Manual

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  // Potential tail calls could cause overwriting of argument stack slots.
  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                       (CallConv == CallingConv::Fast));
  unsigned PtrByteSize = 4;

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                    *DAG.getContext());

  // Reserve space for the linkage area on the stack.
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  CCInfo.AllocateStack(LinkageSize, PtrByteSize);
  if (useSoftFloat() || hasSPE())
    CCInfo.PreAnalyzeFormalArguments(Ins);

  CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
  CCInfo.clearWasPPCF128();

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];

    // Arguments stored in registers.
    if (VA.isRegLoc()) {
      const TargetRegisterClass *RC;
      EVT ValVT = VA.getValVT();

      // Pick the register class that matches the value type, taking the
      // available subtarget features (P8Vector, SPE, VSX, QPX) into account.
      switch (ValVT.getSimpleVT().SimpleTy) {
        default:
          llvm_unreachable("ValVT not supported by formal arguments Lowering");
        case MVT::i1:
        case MVT::i32:
          RC = &PPC::GPRCRegClass;
          break;
        case MVT::f32:
          if (Subtarget.hasP8Vector())
            RC = &PPC::VSSRCRegClass;
          else if (Subtarget.hasSPE())
            RC = &PPC::SPE4RCRegClass;
          else
            RC = &PPC::F4RCRegClass;
          break;
        case MVT::f64:
          if (Subtarget.hasVSX())
            RC = &PPC::VSFRCRegClass;
          else if (Subtarget.hasSPE())
            RC = &PPC::SPERCRegClass;
          else
            RC = &PPC::F8RCRegClass;
          break;
        case MVT::v16i8:
        case MVT::v8i16:
        case MVT::v4i32:
          RC = &PPC::VRRCRegClass;
          break;
        case MVT::v4f32:
          RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass;
          break;
        case MVT::v2f64:
        case MVT::v2i64:
          RC = &PPC::VRRCRegClass;
          break;
        case MVT::v4f64:
          RC = &PPC::QFRCRegClass;
          break;
        case MVT::v4i1:
          RC = &PPC::QBRCRegClass;
          break;
      }

      // Transform the arguments stored in physical registers into virtual
      // ones.  An i1 is copied out of the register as i32 and truncated back
      // to i1 below.
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
                                            ValVT == MVT::i1 ? MVT::i32 : ValVT);

      if (ValVT == MVT::i1)
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);

      InVals.push_back(ArgValue);
    } else {
      // Argument stored in memory.
      assert(VA.isMemLoc());

      // Get the extended size of the argument type in stack
      unsigned ArgSize = VA.getLocVT().getStoreSize();
      // Get the actual size of the argument type
      unsigned ObjSize = VA.getValVT().getStoreSize();
      unsigned ArgOffset = VA.getLocMemOffset();
      // Stack objects in PPC32 are right justified.
      ArgOffset += ArgSize - ObjSize;
      int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, isImmutable);

      // Create load nodes to retrieve arguments from the stack.
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      InVals.push_back(
          DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
    }
  }

  // Assign locations to all of the incoming aggregate by value arguments.
  // Aggregates passed by value are stored in the local variable space of the
  // caller's stack frame, right above the parameter list area.
  SmallVector<CCValAssign, 16> ByValArgLocs;
  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                      ByValArgLocs, *DAG.getContext());

  // Reserve stack space for the allocations in CCInfo.
  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);

  CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);

  // Area that is at least reserved in the caller of this function.
  unsigned MinReservedArea = CCByValInfo.getNextStackOffset();
  MinReservedArea = std::max(MinReservedArea, LinkageSize);

  // Set the size that is at least reserved in caller of this function.  Tail
  // call optimized function's reserved stack space needs to be aligned so that
  // taking the difference between two stack areas will result in an aligned
  // stack.
  MinReservedArea =
      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
  FuncInfo->setMinReservedArea(MinReservedArea);

  SmallVector<SDValue, 8> MemOps;

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    static const MCPhysReg GPArgRegs[] = {
      PPC::R3, PPC::R4, PPC::R5, PPC::R6,
      PPC::R7, PPC::R8, PPC::R9, PPC::R10,
    };
    const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);

    static const MCPhysReg FPArgRegs[] = {
      PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
      PPC::F8
    };
    unsigned NumFPArgRegs = array_lengthof(FPArgRegs);

    // No FP argument registers to save when floating point is done in
    // software or in SPE GPR pairs.
    if (useSoftFloat() || hasSPE())
      NumFPArgRegs = 0;

    FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));
    FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs));

    // Make room for NumGPArgRegs and NumFPArgRegs.
    int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
                NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;

    FuncInfo->setVarArgsStackOffset(
        MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
                              CCInfo.getNextStackOffset(), true));

    FuncInfo->setVarArgsFrameIndex(MFI.CreateStackObject(Depth, 8, false));
    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

    // The fixed integer arguments of a variadic function are stored to the
    // VarArgsFrameIndex on the stack so that they may be loaded by
    // dereferencing the result of va_next.
    for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {
      // Get an existing live-in vreg, or add a new one.
      unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
      if (!VReg)
        VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
      MemOps.push_back(Store);
      // Increment the address by four for the next argument to store
      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }

    // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
    // is set.
    // The double arguments are stored to the VarArgsFrameIndex
    // on the stack.
    for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
      // Get an existing live-in vreg, or add a new one.
      unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
      if (!VReg)
        VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
      MemOps.push_back(Store);
      // Increment the address by eight for the next argument to store
      SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl,
                                       PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }
  }

  // Chain all the vararg-register spill stores together with the incoming
  // chain so later loads observe them.
  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

  return Chain;
}

// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags,
                                             EVT ObjectVT, SelectionDAG &DAG,
                                             SDValue ArgVal,
                                             const SDLoc &dl) const {
  // Record the sign/zero-extension guarantee made by the caller before
  // truncating back down to the declared type.
  if (Flags.isSExt())
    ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
                         DAG.getValueType(ObjectVT));
  else if (Flags.isZExt())
    ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
                         DAG.getValueType(ObjectVT));

  return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);
}

SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  // TODO: add description of PPC stack frame format, or at least some docs.
  //
  bool isELFv2ABI = Subtarget.isELFv2ABI();
  bool isLittleEndian = Subtarget.isLittleEndian();
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  assert(!(CallConv == CallingConv::Fast && isVarArg) &&
         "fastcc not supported on varargs functions");

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  // Potential tail calls could cause overwriting of argument stack slots.
  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                       (CallConv == CallingConv::Fast));
  unsigned PtrByteSize = 8;
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();

  static const MCPhysReg GPR[] = {
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };

  const unsigned Num_GPR_Regs = array_lengthof(GPR);
  const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
  const unsigned Num_VR_Regs = array_lengthof(VR);
  const unsigned Num_QFPR_Regs = Num_FPR_Regs;

  // Do a first pass over the arguments to determine whether the ABI
  // guarantees that our caller has allocated the parameter save area
  // on its stack frame.  In the ELFv1 ABI, this is always the case;
  // in the ELFv2 ABI, it is true if this is a vararg function or if
  // any parameter is located in a stack slot.

  bool HasParameterArea = !isELFv2ABI || isVarArg;
  unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
  unsigned NumBytes = LinkageSize;
  unsigned AvailableFPRs = Num_FPR_Regs;
  unsigned AvailableVRs = Num_VR_Regs;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    // The 'nest' parameter travels in R11 and never consumes a stack slot.
    if (Ins[i].Flags.isNest())
      continue;

    if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
                               PtrByteSize, LinkageSize, ParamAreaSize,
                               NumBytes, AvailableFPRs, AvailableVRs,
                               Subtarget.hasQPX()))
      HasParameterArea = true;
  }

  // Add DAG nodes to load the arguments or copy them out of registers.  On
  // entry to a function on PPC, the arguments start after the linkage area,
  // although the first ones are often in registers.

  unsigned ArgOffset = LinkageSize;
  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
  // QPX arguments draw from the same allocation order as scalar FPRs, so the
  // two indices are deliberately aliased.
  unsigned &QFPR_idx = FPR_idx;
  SmallVector<SDValue, 8> MemOps;
  Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
  unsigned CurArgIdx = 0;
  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
    SDValue ArgVal;
    bool needsLoad = false;
    EVT ObjectVT = Ins[ArgNo].VT;
    EVT OrigVT = Ins[ArgNo].ArgVT;
    unsigned ObjSize = ObjectVT.getStoreSize();
    unsigned ArgSize = ObjSize;
    ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
    if (Ins[ArgNo].isOrigArg()) {
      std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
      CurArgIdx = Ins[ArgNo].getOrigArgIndex();
    }
    // We re-align the argument offset for each argument, except when using the
    // fast calling convention, when we need to make sure we do that only when
    // we'll actually use a stack slot.
    unsigned CurArgOffset, Align;
    auto ComputeArgOffset = [&]() {
      /* Respect alignment of argument on the stack.  */
      Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
      ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
      CurArgOffset = ArgOffset;
    };

    if (CallConv != CallingConv::Fast) {
      ComputeArgOffset();

      /* Compute GPR index associated with argument offset.  */
      GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
      GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
    }

    // FIXME the codegen can be much improved in some cases.
    // We do not have to keep everything in memory.
    if (Flags.isByVal()) {
      assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");

      if (CallConv == CallingConv::Fast)
        ComputeArgOffset();

      // ObjSize is the true size, ArgSize rounded up to multiple of registers.
      ObjSize = Flags.getByValSize();
      ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
      // Empty aggregate parameters do not take up registers.  Examples:
      //   struct { } a;
      //   union  { } b;
      //   int c[0];
      // etc.  However, we have to provide a place-holder in InVals, so
      // pretend we have an 8-byte item at the current address for that
      // purpose.
      if (!ObjSize) {
        int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
        InVals.push_back(FIN);
        continue;
      }

      // Create a stack object covering all stack doublewords occupied
      // by the argument.  If the argument is (fully or partially) on
      // the stack, or if the argument is fully in registers but the
      // caller has allocated the parameter save anyway, we can refer
      // directly to the caller's stack frame.  Otherwise, create a
      // local copy in our own frame.
      int FI;
      if (HasParameterArea ||
          ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
        FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true);
      else
        FI = MFI.CreateStackObject(ArgSize, Align, false);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);

      // Handle aggregates smaller than 8 bytes.
      if (ObjSize < PtrByteSize) {
        // The value of the object is its address, which differs from the
        // address of the enclosing doubleword on big-endian systems.
        SDValue Arg = FIN;
        if (!isLittleEndian) {
          SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT);
          Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff);
        }
        InVals.push_back(Arg);

        if (GPR_idx != Num_GPR_Regs) {
          unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
          FuncInfo->addLiveInAttr(VReg, Flags);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
          SDValue Store;

          if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
            EVT ObjType = (ObjSize == 1 ? MVT::i8 :
                           (ObjSize == 2 ? MVT::i16 : MVT::i32));
            Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
                                      MachinePointerInfo(&*FuncArg), ObjType);
          } else {
            // For sizes that don't fit a truncating store (3, 5, 6, 7),
            // store the whole register as-is to the parameter save area
            // slot.
            Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                 MachinePointerInfo(&*FuncArg));
          }

          MemOps.push_back(Store);
        }
        // Whether we copied from a register or not, advance the offset
        // into the parameter save area by a full doubleword.
        ArgOffset += PtrByteSize;
        continue;
      }

      // The value of the object is its address, which is the address of
      // its first stack doubleword.
      InVals.push_back(FIN);

      // Store whatever pieces of the object are in registers to memory.
      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
        if (GPR_idx == Num_GPR_Regs)
          break;

        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
        FuncInfo->addLiveInAttr(VReg, Flags);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
        SDValue Addr = FIN;
        if (j) {
          SDValue Off = DAG.getConstant(j, dl, PtrVT);
          Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
        }
        SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr,
                                     MachinePointerInfo(&*FuncArg, j));
        MemOps.push_back(Store);
        ++GPR_idx;
      }
      ArgOffset += ArgSize;
      continue;
    }

    switch (ObjectVT.getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Unhandled argument type!");
    case MVT::i1:
    case MVT::i32:
    case MVT::i64:
      if (Flags.isNest()) {
        // The 'nest' parameter, if any, is passed in R11.
        unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);

        break;
      }

      // These can be scalar arguments or elements of an integer array type
      // passed directly.  Clang may use those instead of "byval" aggregate
      // types to avoid forcing arguments to memory unnecessarily.
      if (GPR_idx != Num_GPR_Regs) {
        unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
        FuncInfo->addLiveInAttr(VReg, Flags);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
          // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
          // value to MVT::i64 and then truncate to the correct register size.
          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
      } else {
        if (CallConv == CallingConv::Fast)
          ComputeArgOffset();

        needsLoad = true;
        ArgSize = PtrByteSize;
      }
      if (CallConv != CallingConv::Fast || needsLoad)
        ArgOffset += 8;
      break;

    case MVT::f32:
    case MVT::f64:
      // These can be scalar arguments or elements of a float array type
      // passed directly.  The latter are used to implement ELFv2 homogenous
      // float aggregates.
      if (FPR_idx != Num_FPR_Regs) {
        unsigned VReg;

        if (ObjectVT == MVT::f32)
          VReg = MF.addLiveIn(FPR[FPR_idx],
                              Subtarget.hasP8Vector()
                                  ? &PPC::VSSRCRegClass
                                  : &PPC::F4RCRegClass);
        else
          VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX()
                                                ? &PPC::VSFRCRegClass
                                                : &PPC::F8RCRegClass);

        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        ++FPR_idx;
      } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) {
        // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
        // once we support fp <-> gpr moves.

        // This can only ever happen in the presence of f32 array types,
        // since otherwise we never run out of FPRs before running out
        // of GPRs.
        unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
        FuncInfo->addLiveInAttr(VReg, Flags);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

        if (ObjectVT == MVT::f32) {
          // Extract the correct 32-bit half of the doubleword, which depends
          // on endianness and the parity of the slot offset.
          if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
            ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,
                                 DAG.getConstant(32, dl, MVT::i32));
          ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
        }

        ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
      } else {
        if (CallConv == CallingConv::Fast)
          ComputeArgOffset();

        needsLoad = true;
      }

      // When passing an array of floats, the array occupies consecutive
      // space in the argument area; only round up to the next doubleword
      // at the end of the array.  Otherwise, each float takes 8 bytes.
      if (CallConv != CallingConv::Fast || needsLoad) {
        ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
        ArgOffset += ArgSize;
        if (Flags.isInConsecutiveRegsLast())
          ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
      }
      break;
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
    case MVT::v2f64:
    case MVT::v2i64:
    case MVT::v1i128:
    case MVT::f128:
      if (!Subtarget.hasQPX()) {
        // These can be scalar arguments or elements of a vector array type
        // passed directly.  The latter are used to implement ELFv2 homogenous
        // vector aggregates.
        if (VR_idx != Num_VR_Regs) {
          unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
          ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
          ++VR_idx;
        } else {
          if (CallConv == CallingConv::Fast)
            ComputeArgOffset();
          needsLoad = true;
        }
        if (CallConv != CallingConv::Fast || needsLoad)
          ArgOffset += 16;
        break;
      } // not QPX

      assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 &&
             "Invalid QPX parameter type");
      LLVM_FALLTHROUGH;

    case MVT::v4f64:
    case MVT::v4i1:
      // QPX vectors are treated like their scalar floating-point subregisters
      // (except that they're larger).
      unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 16 : 32;
      if (QFPR_idx != Num_QFPR_Regs) {
        const TargetRegisterClass *RC;
        switch (ObjectVT.getSimpleVT().SimpleTy) {
        case MVT::v4f64: RC = &PPC::QFRCRegClass; break;
        case MVT::v4f32: RC = &PPC::QSRCRegClass; break;
        default:         RC = &PPC::QBRCRegClass; break;
        }

        unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        ++QFPR_idx;
      } else {
        if (CallConv == CallingConv::Fast)
          ComputeArgOffset();
        needsLoad = true;
      }
      if (CallConv != CallingConv::Fast || needsLoad)
        ArgOffset += Sz;
      break;
    }

    // We need to load the argument to a virtual register if we determined
    // above that we ran out of physical registers of the appropriate type.
    if (needsLoad) {
      // Stack slots on big-endian systems are right justified.
      if (ObjSize < ArgSize && !isLittleEndian)
        CurArgOffset += ArgSize - ObjSize;
      int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
    }

    InVals.push_back(ArgVal);
  }

  // Area that is at least reserved in the caller of this function.
  unsigned MinReservedArea;
  if (HasParameterArea)
    MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
  else
    MinReservedArea = LinkageSize;

  // Set the size that is at least reserved in caller of this function.  Tail
  // call optimized functions' reserved stack space needs to be aligned so that
  // taking the difference between two stack areas will result in an aligned
  // stack.
  MinReservedArea =
      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
  FuncInfo->setMinReservedArea(MinReservedArea);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    int Depth = ArgOffset;

    FuncInfo->setVarArgsFrameIndex(
        MFI.CreateFixedObject(PtrByteSize, Depth, true));
    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

    // If this function is vararg, store any remaining integer argument regs
    // to their spots on the stack so that they may be loaded by dereferencing
    // the result of va_next.
    for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
         GPR_idx < Num_GPR_Regs; ++GPR_idx) {
      unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
      MemOps.push_back(Store);
      // Increment the address by four for the next argument to store
      SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

  return Chain;
}

SDValue PPCTargetLowering::LowerFormalArguments_Darwin(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  // TODO: add description of PPC stack frame format, or at least some docs.
  //
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  bool isPPC64 = PtrVT == MVT::i64;
  // Potential tail calls could cause overwriting of argument stack slots.
  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                       (CallConv == CallingConv::Fast));
  unsigned PtrByteSize = isPPC64 ?
8 : 4; 4072 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); 4073 unsigned ArgOffset = LinkageSize; 4074 // Area that is at least reserved in caller of this function. 4075 unsigned MinReservedArea = ArgOffset; 4076 4077 static const MCPhysReg GPR_32[] = { // 32-bit registers. 4078 PPC::R3, PPC::R4, PPC::R5, PPC::R6, 4079 PPC::R7, PPC::R8, PPC::R9, PPC::R10, 4080 }; 4081 static const MCPhysReg GPR_64[] = { // 64-bit registers. 4082 PPC::X3, PPC::X4, PPC::X5, PPC::X6, 4083 PPC::X7, PPC::X8, PPC::X9, PPC::X10, 4084 }; 4085 static const MCPhysReg VR[] = { 4086 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, 4087 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 4088 }; 4089 4090 const unsigned Num_GPR_Regs = array_lengthof(GPR_32); 4091 const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; 4092 const unsigned Num_VR_Regs = array_lengthof( VR); 4093 4094 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; 4095 4096 const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; 4097 4098 // In 32-bit non-varargs functions, the stack space for vectors is after the 4099 // stack space for non-vectors. We do not use this space unless we have 4100 // too many vectors to fit in registers, something that only occurs in 4101 // constructed examples:), but we have to walk the arglist to figure 4102 // that out...for the pathological case, compute VecArgOffset as the 4103 // start of the vector parameter area. Computing VecArgOffset is the 4104 // entire point of the following loop. 4105 unsigned VecArgOffset = ArgOffset; 4106 if (!isVarArg && !isPPC64) { 4107 for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; 4108 ++ArgNo) { 4109 EVT ObjectVT = Ins[ArgNo].VT; 4110 ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; 4111 4112 if (Flags.isByVal()) { 4113 // ObjSize is the true size, ArgSize rounded up to multiple of regs. 
4114 unsigned ObjSize = Flags.getByValSize(); 4115 unsigned ArgSize = 4116 ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 4117 VecArgOffset += ArgSize; 4118 continue; 4119 } 4120 4121 switch(ObjectVT.getSimpleVT().SimpleTy) { 4122 default: llvm_unreachable("Unhandled argument type!"); 4123 case MVT::i1: 4124 case MVT::i32: 4125 case MVT::f32: 4126 VecArgOffset += 4; 4127 break; 4128 case MVT::i64: // PPC64 4129 case MVT::f64: 4130 // FIXME: We are guaranteed to be !isPPC64 at this point. 4131 // Does MVT::i64 apply? 4132 VecArgOffset += 8; 4133 break; 4134 case MVT::v4f32: 4135 case MVT::v4i32: 4136 case MVT::v8i16: 4137 case MVT::v16i8: 4138 // Nothing to do, we're only looking at Nonvector args here. 4139 break; 4140 } 4141 } 4142 } 4143 // We've found where the vector parameter area in memory is. Skip the 4144 // first 12 parameters; these don't use that memory. 4145 VecArgOffset = ((VecArgOffset+15)/16)*16; 4146 VecArgOffset += 12*16; 4147 4148 // Add DAG nodes to load the arguments or copy them out of registers. On 4149 // entry to a function on PPC, the arguments start after the linkage area, 4150 // although the first ones are often in registers. 4151 4152 SmallVector<SDValue, 8> MemOps; 4153 unsigned nAltivecParamsAtEnd = 0; 4154 Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); 4155 unsigned CurArgIdx = 0; 4156 for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { 4157 SDValue ArgVal; 4158 bool needsLoad = false; 4159 EVT ObjectVT = Ins[ArgNo].VT; 4160 unsigned ObjSize = ObjectVT.getSizeInBits()/8; 4161 unsigned ArgSize = ObjSize; 4162 ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; 4163 if (Ins[ArgNo].isOrigArg()) { 4164 std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); 4165 CurArgIdx = Ins[ArgNo].getOrigArgIndex(); 4166 } 4167 unsigned CurArgOffset = ArgOffset; 4168 4169 // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary. 
4170 if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 || 4171 ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) { 4172 if (isVarArg || isPPC64) { 4173 MinReservedArea = ((MinReservedArea+15)/16)*16; 4174 MinReservedArea += CalculateStackSlotSize(ObjectVT, 4175 Flags, 4176 PtrByteSize); 4177 } else nAltivecParamsAtEnd++; 4178 } else 4179 // Calculate min reserved area. 4180 MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT, 4181 Flags, 4182 PtrByteSize); 4183 4184 // FIXME the codegen can be much improved in some cases. 4185 // We do not have to keep everything in memory. 4186 if (Flags.isByVal()) { 4187 assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); 4188 4189 // ObjSize is the true size, ArgSize rounded up to multiple of registers. 4190 ObjSize = Flags.getByValSize(); 4191 ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 4192 // Objects of size 1 and 2 are right justified, everything else is 4193 // left justified. This means the memory address is adjusted forwards. 4194 if (ObjSize==1 || ObjSize==2) { 4195 CurArgOffset = CurArgOffset + (4 - ObjSize); 4196 } 4197 // The value of the object is its address. 4198 int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, false, true); 4199 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4200 InVals.push_back(FIN); 4201 if (ObjSize==1 || ObjSize==2) { 4202 if (GPR_idx != Num_GPR_Regs) { 4203 unsigned VReg; 4204 if (isPPC64) 4205 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 4206 else 4207 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 4208 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 4209 EVT ObjType = ObjSize == 1 ? 
MVT::i8 : MVT::i16; 4210 SDValue Store = 4211 DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, 4212 MachinePointerInfo(&*FuncArg), ObjType); 4213 MemOps.push_back(Store); 4214 ++GPR_idx; 4215 } 4216 4217 ArgOffset += PtrByteSize; 4218 4219 continue; 4220 } 4221 for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { 4222 // Store whatever pieces of the object are in registers 4223 // to memory. ArgOffset will be the address of the beginning 4224 // of the object. 4225 if (GPR_idx != Num_GPR_Regs) { 4226 unsigned VReg; 4227 if (isPPC64) 4228 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 4229 else 4230 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 4231 int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true); 4232 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4233 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 4234 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, 4235 MachinePointerInfo(&*FuncArg, j)); 4236 MemOps.push_back(Store); 4237 ++GPR_idx; 4238 ArgOffset += PtrByteSize; 4239 } else { 4240 ArgOffset += ArgSize - (ArgOffset-CurArgOffset); 4241 break; 4242 } 4243 } 4244 continue; 4245 } 4246 4247 switch (ObjectVT.getSimpleVT().SimpleTy) { 4248 default: llvm_unreachable("Unhandled argument type!"); 4249 case MVT::i1: 4250 case MVT::i32: 4251 if (!isPPC64) { 4252 if (GPR_idx != Num_GPR_Regs) { 4253 unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 4254 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); 4255 4256 if (ObjectVT == MVT::i1) 4257 ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal); 4258 4259 ++GPR_idx; 4260 } else { 4261 needsLoad = true; 4262 ArgSize = PtrByteSize; 4263 } 4264 // All int arguments reserve stack space in the Darwin ABI. 
4265 ArgOffset += PtrByteSize; 4266 break; 4267 } 4268 LLVM_FALLTHROUGH; 4269 case MVT::i64: // PPC64 4270 if (GPR_idx != Num_GPR_Regs) { 4271 unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 4272 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 4273 4274 if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) 4275 // PPC64 passes i8, i16, and i32 values in i64 registers. Promote 4276 // value to MVT::i64 and then truncate to the correct register size. 4277 ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); 4278 4279 ++GPR_idx; 4280 } else { 4281 needsLoad = true; 4282 ArgSize = PtrByteSize; 4283 } 4284 // All int arguments reserve stack space in the Darwin ABI. 4285 ArgOffset += 8; 4286 break; 4287 4288 case MVT::f32: 4289 case MVT::f64: 4290 // Every 4 bytes of argument space consumes one of the GPRs available for 4291 // argument passing. 4292 if (GPR_idx != Num_GPR_Regs) { 4293 ++GPR_idx; 4294 if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64) 4295 ++GPR_idx; 4296 } 4297 if (FPR_idx != Num_FPR_Regs) { 4298 unsigned VReg; 4299 4300 if (ObjectVT == MVT::f32) 4301 VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass); 4302 else 4303 VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass); 4304 4305 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 4306 ++FPR_idx; 4307 } else { 4308 needsLoad = true; 4309 } 4310 4311 // All FP arguments reserve stack space in the Darwin ABI. 4312 ArgOffset += isPPC64 ? 8 : ObjSize; 4313 break; 4314 case MVT::v4f32: 4315 case MVT::v4i32: 4316 case MVT::v8i16: 4317 case MVT::v16i8: 4318 // Note that vector arguments in registers don't reserve stack space, 4319 // except in varargs functions. 
4320 if (VR_idx != Num_VR_Regs) { 4321 unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); 4322 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 4323 if (isVarArg) { 4324 while ((ArgOffset % 16) != 0) { 4325 ArgOffset += PtrByteSize; 4326 if (GPR_idx != Num_GPR_Regs) 4327 GPR_idx++; 4328 } 4329 ArgOffset += 16; 4330 GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64? 4331 } 4332 ++VR_idx; 4333 } else { 4334 if (!isVarArg && !isPPC64) { 4335 // Vectors go after all the nonvectors. 4336 CurArgOffset = VecArgOffset; 4337 VecArgOffset += 16; 4338 } else { 4339 // Vectors are aligned. 4340 ArgOffset = ((ArgOffset+15)/16)*16; 4341 CurArgOffset = ArgOffset; 4342 ArgOffset += 16; 4343 } 4344 needsLoad = true; 4345 } 4346 break; 4347 } 4348 4349 // We need to load the argument to a virtual register if we determined above 4350 // that we ran out of physical registers of the appropriate type. 4351 if (needsLoad) { 4352 int FI = MFI.CreateFixedObject(ObjSize, 4353 CurArgOffset + (ArgSize - ObjSize), 4354 isImmutable); 4355 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4356 ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo()); 4357 } 4358 4359 InVals.push_back(ArgVal); 4360 } 4361 4362 // Allow for Altivec parameters at the end, if needed. 4363 if (nAltivecParamsAtEnd) { 4364 MinReservedArea = ((MinReservedArea+15)/16)*16; 4365 MinReservedArea += 16*nAltivecParamsAtEnd; 4366 } 4367 4368 // Area that is at least reserved in the caller of this function. 4369 MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize); 4370 4371 // Set the size that is at least reserved in caller of this function. Tail 4372 // call optimized functions' reserved stack space needs to be aligned so that 4373 // taking the difference between two stack areas will result in an aligned 4374 // stack. 
4375 MinReservedArea = 4376 EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); 4377 FuncInfo->setMinReservedArea(MinReservedArea); 4378 4379 // If the function takes variable number of arguments, make a frame index for 4380 // the start of the first vararg value... for expansion of llvm.va_start. 4381 if (isVarArg) { 4382 int Depth = ArgOffset; 4383 4384 FuncInfo->setVarArgsFrameIndex( 4385 MFI.CreateFixedObject(PtrVT.getSizeInBits()/8, 4386 Depth, true)); 4387 SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 4388 4389 // If this function is vararg, store any remaining integer argument regs 4390 // to their spots on the stack so that they may be loaded by dereferencing 4391 // the result of va_next. 4392 for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) { 4393 unsigned VReg; 4394 4395 if (isPPC64) 4396 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 4397 else 4398 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 4399 4400 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 4401 SDValue Store = 4402 DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); 4403 MemOps.push_back(Store); 4404 // Increment the address by four for the next argument to store 4405 SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT); 4406 FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); 4407 } 4408 } 4409 4410 if (!MemOps.empty()) 4411 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 4412 4413 return Chain; 4414 } 4415 4416 /// CalculateTailCallSPDiff - Get the amount the stack pointer has to be 4417 /// adjusted to accommodate the arguments for the tailcall. 
static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
                                   unsigned ParamSize) {

  if (!isTailCall) return 0;

  PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
  unsigned CallerMinReservedArea = FI->getMinReservedArea();
  // Negative means the callee needs more parameter space than the caller
  // reserved; positive means the caller's area already suffices.
  int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
  // Remember only if the new adjustment is bigger.
  if (SPDiff < FI->getTailCallSPDelta())
    FI->setTailCallSPDelta(SPDiff);

  return SPDiff;
}

static bool isFunctionGlobalAddress(SDValue Callee);

/// Return true if the caller and the callee can be assumed to share a TOC
/// base, so a tail call (or a call without a following TOC-restore nop) is
/// safe with respect to the TOC pointer.
static bool
callsShareTOCBase(const Function *Caller, SDValue Callee,
                  const TargetMachine &TM) {
  // If !G, Callee can be an external symbol.
  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
  if (!G)
    return false;

  // The medium and large code models are expected to provide a sufficiently
  // large TOC to provide all data addressing needs of a module with a
  // single TOC. Since each module will be addressed with a single TOC then we
  // only need to check that caller and callee don't cross dso boundaries.
  if (CodeModel::Medium == TM.getCodeModel() ||
      CodeModel::Large == TM.getCodeModel())
    return TM.shouldAssumeDSOLocal(*Caller->getParent(), G->getGlobal());

  // Otherwise we need to ensure callee and caller are in the same section,
  // since the linker may allocate multiple TOCs, and we don't know which
  // sections will belong to the same TOC base.

  const GlobalValue *GV = G->getGlobal();
  if (!GV->isStrongDefinitionForLinker())
    return false;

  // Any explicitly-specified sections and section prefixes must also match.
  // Also, if we're using -ffunction-sections, then each function is always in
  // a different section (the same is true for COMDAT functions).
  if (TM.getFunctionSections() || GV->hasComdat() || Caller->hasComdat() ||
      GV->getSection() != Caller->getSection())
    return false;
  if (const auto *F = dyn_cast<Function>(GV)) {
    if (F->getSectionPrefix() != Caller->getSectionPrefix())
      return false;
  }

  // If the callee might be interposed, then we can't assume the ultimate call
  // target will be in the same section. Even in cases where we can assume that
  // interposition won't happen, in any case where the linker might insert a
  // stub to allow for interposition, we must generate code as though
  // interposition might occur. To understand why this matters, consider a
  // situation where: a -> b -> c where the arrows indicate calls. b and c are
  // in the same section, but a is in a different module (i.e. has a different
  // TOC base pointer). If the linker allows for interposition between b and c,
  // then it will generate a stub for the call edge between b and c which will
  // save the TOC pointer into the designated stack slot allocated by b. If we
  // return true here, and therefore allow a tail call between b and c, that
  // stack slot won't exist and the b -> c stub will end up saving b's TOC base
  // pointer into the stack slot allocated by a (where the a -> b stub saved
  // a's TOC base pointer). If we're not considering a tail call, but rather,
  // whether a nop is needed after the call instruction in b, because the linker
  // will insert a stub, it might complain about a missing nop if we omit it
  // (although many don't complain in this case).
  if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV))
    return false;

  return true;
}

/// Return true if any of the outgoing arguments in \p Outs would require a
/// stack slot (rather than fitting entirely in the 64-bit SVR4 register
/// parameter area). Used to reject tail calls that would need caller stack.
static bool
needStackSlotPassParameters(const PPCSubtarget &Subtarget,
                            const SmallVectorImpl<ISD::OutputArg> &Outs) {
  assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64());

  const unsigned PtrByteSize = 8;
  const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();

  static const MCPhysReg GPR[] = {
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };

  const unsigned NumGPRs = array_lengthof(GPR);
  const unsigned NumFPRs = 13;
  const unsigned NumVRs = array_lengthof(VR);
  const unsigned ParamAreaSize = NumGPRs * PtrByteSize;

  unsigned NumBytes = LinkageSize;
  unsigned AvailableFPRs = NumFPRs;
  unsigned AvailableVRs = NumVRs;

  for (const ISD::OutputArg& Param : Outs) {
    // A 'nest' parameter travels in a dedicated register, never on the stack.
    if (Param.Flags.isNest()) continue;

    if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags,
                               PtrByteSize, LinkageSize, ParamAreaSize,
                               NumBytes, AvailableFPRs, AvailableVRs,
                               Subtarget.hasQPX()))
      return true;
  }
  return false;
}

/// Return true if the call site passes exactly the caller's own arguments
/// (position by position), treating a same-typed undef as a match.
static bool
hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) {
  if (CS.arg_size() != CallerFn->arg_size())
    return false;

  ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin();
  ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end();
  Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();

  for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
    const Value* CalleeArg = *CalleeArgIter;
    const Value* CallerArg = &(*CallerArgIter);
    if (CalleeArg == CallerArg)
      continue;

    // e.g. @caller([4 x i64] %a, [4 x i64] %b) {
    //        tail call @callee([4 x i64] undef, [4 x i64] %b)
    //      }
    // 1st argument of callee is undef and has the same type as caller.
    if (CalleeArg->getType() == CallerArg->getType() &&
        isa<UndefValue>(CalleeArg))
      continue;

    return false;
  }

  return true;
}

// Returns true if TCO is possible between the callers and callees
// calling conventions.
static bool
areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
                                    CallingConv::ID CalleeCC) {
  // Tail calls are possible with fastcc and ccc.
  auto isTailCallableCC  = [] (CallingConv::ID CC){
      return  CC == CallingConv::C || CC == CallingConv::Fast;
  };
  if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC))
    return false;

  // We can safely tail call both fastcc and ccc callees from a c calling
  // convention caller. If the caller is fastcc, we may have less stack space
  // than a non-fastcc caller with the same signature so disable tail-calls in
  // that case.
  return CallerCC == CallingConv::C || CallerCC == CalleeCC;
}

bool
PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
                                    SDValue Callee,
                                    CallingConv::ID CalleeCC,
                                    ImmutableCallSite CS,
                                    bool isVarArg,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    SelectionDAG& DAG) const {
  bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;

  if (DisableSCO && !TailCallOpt) return false;

  // Variadic argument functions are not supported.
  if (isVarArg) return false;

  auto &Caller = DAG.getMachineFunction().getFunction();
  // Check that the calling conventions are compatible for tco.
  if (!areCallingConvEligibleForTCO_64SVR4(Caller.getCallingConv(), CalleeCC))
    return false;

  // A caller that takes any byval parameter is not supported.
  if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
    return false;

  // A callee that takes any byval parameter is not supported either.
  // Note: This is a quick work around, because in some cases, e.g.
  // caller's stack size > callee's stack size, we are still able to apply
  // sibling call optimization. For example, gcc is able to do SCO for caller1
  // in the following example, but not for caller2.
  //   struct test {
  //     long int a;
  //     char ary[56];
  //   } gTest;
  //   __attribute__((noinline)) int callee(struct test v, struct test *b) {
  //     b->a = v.a;
  //     return 0;
  //   }
  //   void caller1(struct test a, struct test c, struct test *b) {
  //     callee(gTest, b); }
  //   void caller2(struct test *b) { callee(gTest, b); }
  if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
    return false;

  // If callee and caller use different calling conventions, we cannot pass
  // parameters on stack since offsets for the parameter area may be different.
  if (Caller.getCallingConv() != CalleeCC &&
      needStackSlotPassParameters(Subtarget, Outs))
    return false;

  // No TCO/SCO on indirect call because Caller have to restore its TOC
  if (!isFunctionGlobalAddress(Callee) &&
      !isa<ExternalSymbolSDNode>(Callee))
    return false;

  // If the caller and callee potentially have different TOC bases then we
  // cannot tail call since we need to restore the TOC pointer after the call.
  // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
  if (!callsShareTOCBase(&Caller, Callee, getTargetMachine()))
    return false;

  // TCO allows altering callee ABI, so we don't have to check further.
  if (CalleeCC == CallingConv::Fast && TailCallOpt)
    return true;

  if (DisableSCO) return false;

  // If callee uses the same argument list that caller is using, then we can
  // apply SCO on this case. If it is not, then we need to check if callee needs
  // stack for passing arguments.
  if (!hasSameArgumentList(&Caller, CS) &&
      needStackSlotPassParameters(Subtarget, Outs)) {
    return false;
  }

  return true;
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG& DAG) const {
  // This (GuaranteedTailCallOpt-gated) path is distinct from the 64-bit SVR4
  // sibling-call path above.
  if (!getTargetMachine().Options.GuaranteedTailCallOpt)
    return false;

  // Variable argument functions are not supported.
  if (isVarArg)
    return false;

  MachineFunction &MF = DAG.getMachineFunction();
  CallingConv::ID CallerCC = MF.getFunction().getCallingConv();
  if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
    // Functions containing by val parameters are not supported.
    for (unsigned i = 0; i != Ins.size(); i++) {
       ISD::ArgFlagsTy Flags = Ins[i].Flags;
       if (Flags.isByVal()) return false;
    }

    // Non-PIC/GOT tail calls are supported.
    if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
      return true;

    // At the moment we can only do local tail calls (in same module, hidden
    // or protected) if we are generating PIC.
    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
      return G->getGlobal()->hasHiddenVisibility()
          || G->getGlobal()->hasProtectedVisibility();
  }

  return false;
}

/// isCallCompatibleAddress - Return the immediate to use if the specified
/// 32-bit value is representable in the immediate field of a BxA instruction.
static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
  if (!C) return nullptr;

  int Addr = C->getZExtValue();
  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
      SignExtend32<26>(Addr) != Addr)
    return nullptr;  // Top 6 bits have to be sext of immediate.

  // Encode as a word offset (address >> 2) in the target pointer type.
  return DAG
      .getConstant(
          (int)C->getZExtValue() >> 2, SDLoc(Op),
          DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()))
      .getNode();
}

namespace {

// Records an outgoing tail-call argument together with the fixed stack slot
// (frame index and its SDValue form) it must eventually be stored to.
struct TailCallArgumentInfo {
  SDValue Arg;
  SDValue FrameIdxOp;
  int FrameIdx = 0;

  TailCallArgumentInfo() = default;
};

} // end anonymous namespace

/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
static void StoreTailCallArgumentsToStackSlot(
    SelectionDAG &DAG, SDValue Chain,
    const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
    SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) {
  for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
    SDValue Arg = TailCallArgs[i].Arg;
    SDValue FIN = TailCallArgs[i].FrameIdxOp;
    int FI = TailCallArgs[i].FrameIdx;
    // Store relative to framepointer.
    MemOpChains.push_back(DAG.getStore(
        Chain, dl, Arg, FIN,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
  }
}

/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
/// the appropriate stack slot for the tail call optimized function call.
static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
                                             SDValue OldRetAddr, SDValue OldFP,
                                             int SPDiff, const SDLoc &dl) {
  // SPDiff == 0 means the stack is not being adjusted, so nothing to move.
  if (SPDiff) {
    // Calculate the new stack slot for the return address.
    MachineFunction &MF = DAG.getMachineFunction();
    const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
    const PPCFrameLowering *FL = Subtarget.getFrameLowering();
    bool isPPC64 = Subtarget.isPPC64();
    int SlotSize = isPPC64 ? 8 : 4;
    int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset();
    int NewRetAddr = MF.getFrameInfo().CreateFixedObject(SlotSize,
                                                         NewRetAddrLoc, true);
    EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
    SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
    Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
                         MachinePointerInfo::getFixedStack(MF, NewRetAddr));

    // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
    // slot as the FP is never overwritten.
    if (Subtarget.isDarwinABI()) {
      int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset();
      int NewFPIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, NewFPLoc,
                                                         true);
      SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
      Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx,
                           MachinePointerInfo::getFixedStack(
                               DAG.getMachineFunction(), NewFPIdx));
    }
  }
  return Chain;
}

/// CalculateTailCallArgDest - Remember Argument for later processing. Calculate
/// the position of the argument.
static void
CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
                         SDValue Arg, int SPDiff, unsigned ArgOffset,
                     SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) {
  // The slot sits at the argument's normal offset, shifted by the tail-call
  // stack adjustment.
  int Offset = ArgOffset + SPDiff;
  uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8;
  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
  EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
  SDValue FIN = DAG.getFrameIndex(FI, VT);
  TailCallArgumentInfo Info;
  Info.Arg = Arg;
  Info.FrameIdxOp = FIN;
  Info.FrameIdx = FI;
  TailCallArguments.push_back(Info);
}

/// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address
/// stack slot. Returns the chain as result and the loaded frame pointers in
/// LROpOut/FPOpout. Used when tail calling.
SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
    SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,
    SDValue &FPOpOut, const SDLoc &dl) const {
  if (SPDiff) {
    // Load the LR and FP stack slot for later adjusting.
    EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
    LROpOut = getReturnAddrFrameIndex(DAG);
    LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo());
    // Chain through the load's output chain (result value 1).
    Chain = SDValue(LROpOut.getNode(), 1);

    // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
    // slot as the FP is never overwritten.
    if (Subtarget.isDarwinABI()) {
      FPOpOut = getFramePointerFrameIndex(DAG);
      FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo());
      Chain = SDValue(FPOpOut.getNode(), 1);
    }
  }
  return Chain;
}

/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" of size "Size".  Alignment information is
/// specified by the specific parameter attribute. The copy will be passed as
/// a byval function parameter.
/// Sometimes what we are copying is the end of a larger object, the part that
/// does not fit in registers.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       false, false, false, MachinePointerInfo(),
                       MachinePointerInfo());
}

/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
/// tail calls.
static void LowerMemOpCallTo(
    SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,
    SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,
    bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
    SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  if (!isTailCall) {
    if (isVector) {
      // Vector arguments are addressed relative to the stack pointer
      // (r1/x1) at ArgOffset rather than through the incoming PtrOff.
      SDValue StackPtr;
      if (isPPC64)
        StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
      else
        StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
                           DAG.getConstant(ArgOffset, dl, PtrVT));
    }
    MemOpChains.push_back(
        DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
    // Calculate and remember argument location.
  } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
                                  TailCallArguments);
}

/// Finish setting up a tail call: flush the recorded tail-call arguments to
/// their stack slots, save FP/return address, and close the call sequence.
static void
PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
                const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
                SDValue FPOp,
                SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
  // Emit a sequence of copyto/copyfrom virtual registers for arguments that
  // might overwrite each other in case of tail call optimization.
  SmallVector<SDValue, 8> MemOpChains2;
  // Do not flag preceding copytoreg stuff together with the following stuff.
  InFlag = SDValue();
  StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
                                    MemOpChains2, dl);
  if (!MemOpChains2.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

  // Store the return address to the appropriate stack slot.
  Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl);

  // Emit callseq_end just before tailcall node.
  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
  InFlag = Chain.getValue(1);
}

// Is this global address that of a function that can be called by name? (as
// opposed to something that must hold a descriptor for an indirect call).
static bool isFunctionGlobalAddress(SDValue Callee) {
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    // TLS addresses are GlobalAddressSDNodes too, but they are not direct
    // function addresses.
    if (Callee.getOpcode() == ISD::GlobalTLSAddress ||
        Callee.getOpcode() == ISD::TargetGlobalTLSAddress)
      return false;

    return G->getGlobal()->getValueType()->isFunctionTy();
  }

  return false;
}

static unsigned
PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
            SDValue CallSeqStart, const SDLoc &dl, int SPDiff, bool isTailCall,
            bool isPatchPoint, bool hasNest,
            SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
            SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
            ImmutableCallSite CS, const PPCSubtarget &Subtarget) {
  bool isPPC64 = Subtarget.isPPC64();
  bool isSVR4ABI = Subtarget.isSVR4ABI();
  bool isELFv2ABI = Subtarget.isELFv2ABI();

  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  NodeTys.push_back(MVT::Other);   // Returns a chain
  NodeTys.push_back(MVT::Glue);    // Returns a flag for retval copy to use.

  unsigned CallOpc = PPCISD::CALL;

  bool needIndirectCall = true;
  if (!isSVR4ABI || !isPPC64)
    if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) {
      // If this is an absolute destination address, use the munged value.
      Callee = SDValue(Dest, 0);
      needIndirectCall = false;
    }

  // PC-relative references to external symbols should go through $stub, unless
  // we're building with the leopard linker or later, which automatically
  // synthesizes these stubs.
4925 const TargetMachine &TM = DAG.getTarget(); 4926 const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); 4927 const GlobalValue *GV = nullptr; 4928 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) 4929 GV = G->getGlobal(); 4930 bool Local = TM.shouldAssumeDSOLocal(*Mod, GV); 4931 bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64; 4932 4933 if (isFunctionGlobalAddress(Callee)) { 4934 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee); 4935 // A call to a TLS address is actually an indirect call to a 4936 // thread-specific pointer. 4937 unsigned OpFlags = 0; 4938 if (UsePlt) 4939 OpFlags = PPCII::MO_PLT; 4940 4941 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, 4942 // every direct call is) turn it into a TargetGlobalAddress / 4943 // TargetExternalSymbol node so that legalize doesn't hack it. 4944 Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, 4945 Callee.getValueType(), 0, OpFlags); 4946 needIndirectCall = false; 4947 } 4948 4949 if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 4950 unsigned char OpFlags = 0; 4951 4952 if (UsePlt) 4953 OpFlags = PPCII::MO_PLT; 4954 4955 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(), 4956 OpFlags); 4957 needIndirectCall = false; 4958 } 4959 4960 if (isPatchPoint) { 4961 // We'll form an invalid direct call when lowering a patchpoint; the full 4962 // sequence for an indirect call is complicated, and many of the 4963 // instructions introduced might have side effects (and, thus, can't be 4964 // removed later). The call itself will be removed as soon as the 4965 // argument/return lowering is complete, so the fact that it has the wrong 4966 // kind of operands should not really matter. 4967 needIndirectCall = false; 4968 } 4969 4970 if (needIndirectCall) { 4971 // Otherwise, this is an indirect call. We have to use a MTCTR/BCTRL pair 4972 // to do the call, we can't use PPCISD::CALL. 
4973 SDValue MTCTROps[] = {Chain, Callee, InFlag}; 4974 4975 if (isSVR4ABI && isPPC64 && !isELFv2ABI) { 4976 // Function pointers in the 64-bit SVR4 ABI do not point to the function 4977 // entry point, but to the function descriptor (the function entry point 4978 // address is part of the function descriptor though). 4979 // The function descriptor is a three doubleword structure with the 4980 // following fields: function entry point, TOC base address and 4981 // environment pointer. 4982 // Thus for a call through a function pointer, the following actions need 4983 // to be performed: 4984 // 1. Save the TOC of the caller in the TOC save area of its stack 4985 // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()). 4986 // 2. Load the address of the function entry point from the function 4987 // descriptor. 4988 // 3. Load the TOC of the callee from the function descriptor into r2. 4989 // 4. Load the environment pointer from the function descriptor into 4990 // r11. 4991 // 5. Branch to the function entry point address. 4992 // 6. On return of the callee, the TOC of the caller needs to be 4993 // restored (this is done in FinishCall()). 4994 // 4995 // The loads are scheduled at the beginning of the call sequence, and the 4996 // register copies are flagged together to ensure that no other 4997 // operations can be scheduled in between. E.g. without flagging the 4998 // copies together, a TOC access in the caller could be scheduled between 4999 // the assignment of the callee TOC and the branch to the callee, which 5000 // results in the TOC access going through the TOC of the callee instead 5001 // of going through the TOC of the caller, which leads to incorrect code. 5002 5003 // Load the address of the function entry point from the function 5004 // descriptor. 
5005 SDValue LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-1); 5006 if (LDChain.getValueType() == MVT::Glue) 5007 LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-2); 5008 5009 auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors() 5010 ? (MachineMemOperand::MODereferenceable | 5011 MachineMemOperand::MOInvariant) 5012 : MachineMemOperand::MONone; 5013 5014 MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr); 5015 SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI, 5016 /* Alignment = */ 8, MMOFlags); 5017 5018 // Load environment pointer into r11. 5019 SDValue PtrOff = DAG.getIntPtrConstant(16, dl); 5020 SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff); 5021 SDValue LoadEnvPtr = 5022 DAG.getLoad(MVT::i64, dl, LDChain, AddPtr, MPI.getWithOffset(16), 5023 /* Alignment = */ 8, MMOFlags); 5024 5025 SDValue TOCOff = DAG.getIntPtrConstant(8, dl); 5026 SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff); 5027 SDValue TOCPtr = 5028 DAG.getLoad(MVT::i64, dl, LDChain, AddTOC, MPI.getWithOffset(8), 5029 /* Alignment = */ 8, MMOFlags); 5030 5031 setUsesTOCBasePtr(DAG); 5032 SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr, 5033 InFlag); 5034 Chain = TOCVal.getValue(0); 5035 InFlag = TOCVal.getValue(1); 5036 5037 // If the function call has an explicit 'nest' parameter, it takes the 5038 // place of the environment pointer. 5039 if (!hasNest) { 5040 SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr, 5041 InFlag); 5042 5043 Chain = EnvVal.getValue(0); 5044 InFlag = EnvVal.getValue(1); 5045 } 5046 5047 MTCTROps[0] = Chain; 5048 MTCTROps[1] = LoadFuncPtr; 5049 MTCTROps[2] = InFlag; 5050 } 5051 5052 Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys, 5053 makeArrayRef(MTCTROps, InFlag.getNode() ? 
3 : 2)); 5054 InFlag = Chain.getValue(1); 5055 5056 NodeTys.clear(); 5057 NodeTys.push_back(MVT::Other); 5058 NodeTys.push_back(MVT::Glue); 5059 Ops.push_back(Chain); 5060 CallOpc = PPCISD::BCTRL; 5061 Callee.setNode(nullptr); 5062 // Add use of X11 (holding environment pointer) 5063 if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest) 5064 Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); 5065 // Add CTR register as callee so a bctr can be emitted later. 5066 if (isTailCall) 5067 Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT)); 5068 } 5069 5070 // If this is a direct call, pass the chain and the callee. 5071 if (Callee.getNode()) { 5072 Ops.push_back(Chain); 5073 Ops.push_back(Callee); 5074 } 5075 // If this is a tail call add stack pointer delta. 5076 if (isTailCall) 5077 Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32)); 5078 5079 // Add argument registers to the end of the list so that they are known live 5080 // into the call. 5081 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 5082 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 5083 RegsToPass[i].second.getValueType())); 5084 5085 // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live 5086 // into the call. 5087 if (isSVR4ABI && isPPC64 && !isPatchPoint) { 5088 setUsesTOCBasePtr(DAG); 5089 Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); 5090 } 5091 5092 return CallOpc; 5093 } 5094 5095 SDValue PPCTargetLowering::LowerCallResult( 5096 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 5097 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 5098 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 5099 SmallVector<CCValAssign, 16> RVLocs; 5100 CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 5101 *DAG.getContext()); 5102 5103 CCRetInfo.AnalyzeCallResult( 5104 Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold) 5105 ? 
                   RetCC_PPC_Cold
               : RetCC_PPC);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Val = DAG.getCopyFromReg(Chain, dl,
                                     VA.getLocReg(), VA.getLocVT(), InFlag);
    Chain = Val.getValue(1);
    InFlag = Val.getValue(2);

    // Undo any promotion the calling convention applied to the value.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}

/// Complete the lowering of a call: build the call node via PrepareCall(),
/// emit it (a TC_RETURN for tail calls, otherwise the chosen PPCISD call
/// opcode plus CALLSEQ_END), and finally copy the results out of their
/// registers with LowerCallResult().
SDValue PPCTargetLowering::FinishCall(
    CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg,
    bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue InFlag,
    SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
    unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
    SmallVectorImpl<SDValue> &InVals, ImmutableCallSite CS) const {
  std::vector<EVT> NodeTys;
  SmallVector<SDValue, 8> Ops;
  unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl,
                                 SPDiff, isTailCall, isPatchPoint, hasNest,
                                 RegsToPass, Ops, NodeTys, CS, Subtarget);

  // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
  if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
    Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));

  // When performing tail call optimization the callee pops its arguments off
  // the stack. Account for this here so these bytes can be pushed back on in
  // PPCFrameLowering::eliminateCallFramePseudoInstr.
  int BytesCalleePops =
    (CallConv == CallingConv::Fast &&
     getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0;

  // Add a register mask operand representing the call-preserved registers.
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const uint32_t *Mask =
      TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  // Emit tail call.
  if (isTailCall) {
    assert(((Callee.getOpcode() == ISD::Register &&
             cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
            Callee.getOpcode() == ISD::TargetExternalSymbol ||
            Callee.getOpcode() == ISD::TargetGlobalAddress ||
            isa<ConstantSDNode>(Callee)) &&
    "Expecting an global address, external symbol, absolute value or register");

    DAG.getMachineFunction().getFrameInfo().setHasTailCall();
    return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops);
  }

  // Add a NOP immediately after the branch instruction when using the 64-bit
  // SVR4 ABI. At link time, if caller and callee are in a different module and
  // thus have a different TOC, the call will be replaced with a call to a stub
  // function which saves the current TOC, loads the TOC of the callee and
  // branches to the callee. The NOP will be replaced with a load instruction
  // which restores the TOC of the caller from the TOC save slot of the current
  // stack frame. If caller and callee belong to the same module (and have the
  // same TOC), the NOP will remain unchanged.

  MachineFunction &MF = DAG.getMachineFunction();
  if (!isTailCall && Subtarget.isSVR4ABI() && Subtarget.isPPC64() &&
      !isPatchPoint) {
    if (CallOpc == PPCISD::BCTRL) {
      // This is a call through a function pointer.
      // Restore the caller TOC from the save area into R2.
      // See PrepareCall() for more information about calls through function
      // pointers in the 64-bit SVR4 ABI.
      // We are using a target-specific load with r2 hard coded, because the
      // result of a target-independent load would never go directly into r2,
      // since r2 is a reserved register (which prevents the register allocator
      // from allocating it), resulting in an additional register being
      // allocated and an unnecessary move instruction being generated.
      CallOpc = PPCISD::BCTRL_LOAD_TOC;

      EVT PtrVT = getPointerTy(DAG.getDataLayout());
      SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
      unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
      SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
      SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);

      // The address needs to go after the chain input but before the flag (or
      // any other variadic arguments).
      Ops.insert(std::next(Ops.begin()), AddTOC);
    } else if (CallOpc == PPCISD::CALL &&
      !callsShareTOCBase(&MF.getFunction(), Callee, DAG.getTarget())) {
      // Otherwise insert NOP for non-local calls.
      CallOpc = PPCISD::CALL_NOP;
    }
  }

  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
  InFlag = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
                             DAG.getIntPtrConstant(BytesCalleePops, dl, true),
                             InFlag, dl);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
                         Ins, dl, DAG, InVals);
}

/// TargetLowering hook for call lowering: decide tail-call eligibility
/// (possibly clearing CLI.IsTailCall), then dispatch to the ABI-specific
/// routine (LowerCall_64SVR4, LowerCall_32SVR4, or LowerCall_Darwin).
SDValue
PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &isTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool isVarArg = CLI.IsVarArg;
  bool isPatchPoint = CLI.IsPatchPoint;
  ImmutableCallSite CS = CLI.CS;

  if (isTailCall) {
    // Long calls force an indirect call, which defeats sibling-call
    // codegen unless the call site is marked musttail.
    if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall()))
      isTailCall = false;
    else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
      isTailCall =
        IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS,
                                                 isVarArg, Outs, Ins, DAG);
    else
      isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
                                                     Ins, DAG);
    if (isTailCall) {
      ++NumTailCalls;
      if (!getTargetMachine().Options.GuaranteedTailCallOpt)
        ++NumSiblingCalls;

      assert(isa<GlobalAddressSDNode>(Callee) &&
             "Callee should be an llvm::Function object.");
      LLVM_DEBUG(
          const GlobalValue *GV =
              cast<GlobalAddressSDNode>(Callee)->getGlobal();
          const unsigned Width =
              80 - strlen("TCO caller: ") - strlen(", callee linkage: 0, 0");
          dbgs() << "TCO caller: "
                 <<
                    left_justify(DAG.getMachineFunction().getName(), Width)
                 << ", callee linkage: " << GV->getVisibility() << ", "
                 << GV->getLinkage() << "\n");
    }
  }

  if (!isTailCall && CS && CS.isMustTailCall())
    report_fatal_error("failed to perform tail call elimination on a call "
                       "site marked musttail");

  // When long calls (i.e. indirect calls) are always used, calls are always
  // made via function pointer. If we have a function name, first translate it
  // into a pointer.
  if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Callee) &&
      !isTailCall)
    Callee = LowerGlobalAddress(Callee, DAG);

  if (Subtarget.isSVR4ABI()) {
    if (Subtarget.isPPC64())
      return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
                              isTailCall, isPatchPoint, Outs, OutVals, Ins,
                              dl, DAG, InVals, CS);
    else
      return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
                              isTailCall, isPatchPoint, Outs, OutVals, Ins,
                              dl, DAG, InVals, CS);
  }

  return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
                          isTailCall, isPatchPoint, Outs, OutVals, Ins,
                          dl, DAG, InVals, CS);
}

/// Lower an outgoing call for the 32-bit SVR4 ABI: assign argument
/// locations (with byval aggregates copied into the caller's local area),
/// emit the CALLSEQ_START, argument copies/stores, and hand off to
/// FinishCall().
SDValue PPCTargetLowering::LowerCall_32SVR4(
    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
    bool isTailCall, bool isPatchPoint,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    ImmutableCallSite CS) const {
  // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
  // of the 32-bit SVR4 ABI stack frame layout.

  assert((CallConv == CallingConv::C ||
          CallConv == CallingConv::Cold ||
          CallConv == CallingConv::Fast) && "Unknown calling convention!");

  unsigned PtrByteSize = 4;

  MachineFunction &MF = DAG.getMachineFunction();

  // Mark this function as potentially containing a function that contains a
  // tail call. As a consequence the frame pointer will be used for dynamicalloc
  // and restoring the callers stack pointer in this functions epilog. This is
  // done because by tail calling the called function might overwrite the value
  // in this function's (MF) stack pointer stack slot 0(SP).
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

  // Count how many bytes are to be pushed on the stack, including the linkage
  // area, parameter list area and the part of the local variable space which
  // contains copies of aggregates which are passed by value.

  // Assign locations to all of the outgoing arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Reserve space for the linkage area on the stack.
  CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(),
                       PtrByteSize);
  if (useSoftFloat())
    CCInfo.PreAnalyzeCallOperands(Outs);

  if (isVarArg) {
    // Handle fixed and variable vector arguments differently.
    // Fixed vector arguments go into registers as long as registers are
    // available. Variable vector arguments always go into memory.
    unsigned NumArgs = Outs.size();

    for (unsigned i = 0; i != NumArgs; ++i) {
      MVT ArgVT = Outs[i].VT;
      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
      bool Result;

      if (Outs[i].IsFixed) {
        Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
                               CCInfo);
      } else {
        Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
                                      ArgFlags, CCInfo);
      }

      if (Result) {
#ifndef NDEBUG
        errs() << "Call operand #" << i << " has unhandled type "
               << EVT(ArgVT).getEVTString() << "\n";
#endif
        llvm_unreachable(nullptr);
      }
    }
  } else {
    // All arguments are treated the same.
    CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);
  }
  CCInfo.clearWasPPCF128();

  // Assign locations to all of the outgoing aggregate by value arguments.
  SmallVector<CCValAssign, 16> ByValArgLocs;
  CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext());

  // Reserve stack space for the allocations in CCInfo.
  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);

  CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);

  // Size of the linkage area, parameter list area and the part of the local
  // space variable where copies of aggregates which are passed by value are
  // stored.
  unsigned NumBytes = CCByValInfo.getNextStackOffset();

  // Calculate by how many bytes the stack has to be adjusted in case of tail
  // call optimization.
  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
  SDValue CallSeqStart = Chain;

  // Load the return address and frame pointer so it can be moved somewhere else
  // later.
  SDValue LROp, FPOp;
  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
  SmallVector<SDValue, 8> MemOpChains;

  bool seenFloatArg = false;
  // Walk the register/memloc assignments, inserting copies/loads.
  // j tracks the byval slot index in ByValArgLocs, separately from i.
  for (unsigned i = 0, j = 0, e = ArgLocs.size();
       i != e;
       ++i) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;

    if (Flags.isByVal()) {
      // Argument is an aggregate which is passed by value, thus we need to
      // create a copy of it in the local variable space of the current stack
      // frame (which is the stack frame of the caller) and pass the address of
      // this copy to the callee.
      assert((j < ByValArgLocs.size()) && "Index out of bounds!");
      CCValAssign &ByValVA = ByValArgLocs[j++];
      assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");

      // Memory reserved in the local variable space of the callers stack frame.
      unsigned LocMemOffset = ByValVA.getLocMemOffset();

      SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
      PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
                           StackPtr, PtrOff);

      // Create a copy of the argument in the local area of the current
      // stack frame.
      SDValue MemcpyCall =
        CreateCopyOfByValArgument(Arg, PtrOff,
                                  CallSeqStart.getNode()->getOperand(0),
                                  Flags, DAG, dl);

      // This must go outside the CALLSEQ_START..END.
      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0,
                                                     SDLoc(MemcpyCall));
      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
                             NewCallSeqStart.getNode());
      Chain = CallSeqStart = NewCallSeqStart;

      // Pass the address of the aggregate copy on the stack either in a
      // physical register or in the parameter list area of the current stack
      // frame to the callee.
      Arg = PtrOff;
    }

    // When useCRBits() is true, there can be i1 arguments.
    // It is because getRegisterType(MVT::i1) => MVT::i1,
    // and for other integer types getRegisterType() => MVT::i32.
    // Extend i1 and ensure callee will get i32.
    if (Arg.getValueType() == MVT::i1)
      Arg = DAG.getNode(Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                        dl, MVT::i32, Arg);

    if (VA.isRegLoc()) {
      seenFloatArg |= VA.getLocVT().isFloatingPoint();
      // Put argument in a physical register.
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else {
      // Put argument in the parameter list area of the current stack frame.
      assert(VA.isMemLoc());
      unsigned LocMemOffset = VA.getLocMemOffset();

      if (!isTailCall) {
        SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
        PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
                             StackPtr, PtrOff);

        MemOpChains.push_back(
            DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
      } else {
        // Calculate and remember argument location.
        CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
                                 TailCallArguments);
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  // Set CR bit 6 to true if this is a vararg call with floating args passed in
  // registers.
  if (isVarArg) {
    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Ops[] = { Chain, InFlag };

    Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
                        dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1));

    InFlag = Chain.getValue(1);
  }

  if (isTailCall)
    PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
                    TailCallArguments);

  return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
                    /* unused except on PPC64 ELFv1 */ false, DAG,
                    RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
                    NumBytes, Ins, InVals, CS);
}

// Copy an argument into memory, being careful to do this outside the
// call sequence for the call to which the argument belongs.
SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
    SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
    SelectionDAG &DAG, const SDLoc &dl) const {
  // Chain the memcpy onto the node *before* CALLSEQ_START (its operand 0).
  SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
                        CallSeqStart.getNode()->getOperand(0),
                        Flags, DAG, dl);
  // The MEMCPY must go outside the CALLSEQ_START..END.
  // Re-create CALLSEQ_START on top of the memcpy, carrying over the frame
  // size recorded in the original node's constant operand 1, and redirect
  // all users to the replacement.
  int64_t FrameSize = CallSeqStart.getConstantOperandVal(1);
  SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0,
                                                 SDLoc(MemcpyCall));
  DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
                         NewCallSeqStart.getNode());
  return NewCallSeqStart;
}

/// Lower an outgoing call for the 64-bit SVR4 (ELFv1/ELFv2) ABI: compute the
/// parameter save area requirements, assign arguments to GPRs/FPRs/VRs or
/// stack slots, and emit the call sequence.
SDValue PPCTargetLowering::LowerCall_64SVR4(
    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
    bool isTailCall, bool isPatchPoint,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    ImmutableCallSite CS) const {
  bool isELFv2ABI = Subtarget.isELFv2ABI();
  bool isLittleEndian = Subtarget.isLittleEndian();
  unsigned NumOps = Outs.size();
  bool hasNest = false;
  bool IsSibCall = false;

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  unsigned PtrByteSize = 8;

  MachineFunction &MF = DAG.getMachineFunction();

  if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
    IsSibCall = true;

  // Mark this function as potentially containing a function that contains a
  // tail call. As a consequence the frame pointer will be used for dynamicalloc
  // and restoring the callers stack pointer in this functions epilog. This is
  // done because by tail calling the called function might overwrite the value
  // in this function's (MF) stack pointer stack slot 0(SP).
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

  assert(!(CallConv == CallingConv::Fast && isVarArg) &&
         "fastcc not supported on varargs functions");

  // Count how many bytes are to be pushed on the stack, including the linkage
  // area, and parameter passing area.  On ELFv1, the linkage area is 48 bytes
  // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
  // area is 32 bytes reserved space for [SP][CR][LR][TOC].
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  unsigned NumBytes = LinkageSize;
  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
  // QPX floating-point registers share the FPR allocation counter.
  unsigned &QFPR_idx = FPR_idx;

  static const MCPhysReg GPR[] = {
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };

  const unsigned NumGPRs = array_lengthof(GPR);
  const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
  const unsigned NumVRs  = array_lengthof(VR);
  const unsigned NumQFPRs = NumFPRs;

  // On ELFv2, we can avoid allocating the parameter area if all the arguments
  // can be passed to the callee in registers.
  // For the fast calling convention, there is another check below.
  // Note: We should keep consistent with LowerFormalArguments_64SVR4()
  bool HasParameterArea = !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast;
  if (!HasParameterArea) {
    unsigned ParamAreaSize = NumGPRs * PtrByteSize;
    unsigned AvailableFPRs = NumFPRs;
    unsigned AvailableVRs = NumVRs;
    unsigned NumBytesTmp = NumBytes;
    for (unsigned i = 0; i != NumOps; ++i) {
      if (Outs[i].Flags.isNest()) continue;
      if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags,
                                 PtrByteSize, LinkageSize, ParamAreaSize,
                                 NumBytesTmp, AvailableFPRs, AvailableVRs,
                                 Subtarget.hasQPX()))
        HasParameterArea = true;
    }
  }

  // When using the fast calling convention, we don't provide backing for
  // arguments that will be in registers.
  unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;

  // Avoid allocating parameter area for fastcc functions if all the arguments
  // can be passed in the registers.
  if (CallConv == CallingConv::Fast)
    HasParameterArea = false;

  // Add up all the space actually used.
  for (unsigned i = 0; i != NumOps; ++i) {
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    EVT ArgVT = Outs[i].VT;
    EVT OrigVT = Outs[i].ArgVT;

    if (Flags.isNest())
      continue;

    if (CallConv == CallingConv::Fast) {
      if (Flags.isByVal()) {
        // Byval aggregates occupy one GPR per 8 bytes, rounded up.
        NumGPRsUsed += (Flags.getByValSize()+7)/8;
        if (NumGPRsUsed > NumGPRs)
          HasParameterArea = true;
      } else {
        switch (ArgVT.getSimpleVT().SimpleTy) {
        default: llvm_unreachable("Unexpected ValueType for argument!");
        case MVT::i1:
        case MVT::i32:
        case MVT::i64:
          if (++NumGPRsUsed <= NumGPRs)
            continue;
          break;
        case MVT::v4i32:
        case MVT::v8i16:
        case MVT::v16i8:
        case MVT::v2f64:
        case MVT::v2i64:
        case MVT::v1i128:
        case MVT::f128:
          if (++NumVRsUsed <= NumVRs)
            continue;
          break;
        case MVT::v4f32:
          // When using QPX, this is handled like a FP register, otherwise, it
          // is an Altivec register.
          if (Subtarget.hasQPX()) {
            if (++NumFPRsUsed <= NumFPRs)
              continue;
          } else {
            if (++NumVRsUsed <= NumVRs)
              continue;
          }
          break;
        case MVT::f32:
        case MVT::f64:
        case MVT::v4f64: // QPX
        case MVT::v4i1:  // QPX
          if (++NumFPRsUsed <= NumFPRs)
            continue;
          break;
        }
        HasParameterArea = true;
      }
    }

    /* Respect alignment of argument on the stack.  */
    unsigned Align =
      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
    NumBytes = ((NumBytes + Align - 1) / Align) * Align;

    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
    if (Flags.isInConsecutiveRegsLast())
      NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
  }

  unsigned NumBytesActuallyUsed = NumBytes;

  // In the old ELFv1 ABI,
  // the prolog code of the callee may store up to 8 GPR argument registers to
  // the stack, allowing va_start to index over them in memory if its varargs.
  // Because we cannot tell if this is needed on the caller side, we have to
  // conservatively assume that it is needed.  As such, make sure we have at
  // least enough stack space for the caller to store the 8 GPRs.
  // In the ELFv2 ABI, we allocate the parameter area iff a callee
  // really requires memory operands, e.g. a vararg function.
  if (HasParameterArea)
    NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
  else
    NumBytes = LinkageSize;

  // Tail call needs the stack to be aligned.
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);

  int SPDiff = 0;

  // Calculate by how many bytes the stack has to be adjusted in case of tail
  // call optimization.
  if (!IsSibCall)
    SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);

  // To protect arguments on the stack from being clobbered in a tail call,
  // force all the loads to happen before doing any other lowering.
  if (isTailCall)
    Chain = DAG.getStackArgumentTokenFactor(Chain);

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  if (!IsSibCall)
    Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
  SDValue CallSeqStart = Chain;

  // Load the return address and frame pointer so it can be move somewhere else
  // later.
  SDValue LROp, FPOp;
  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);

  // Figure out which arguments are going to go in registers, and which in
  // memory.  Also, if this is a vararg function, floating point operations
  // must be stored to our stack, and loaded into integer regs as well, if
  // any integer regs are available for argument passing.
  unsigned ArgOffset = LinkageSize;

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;

  SmallVector<SDValue, 8> MemOpChains;
  for (unsigned i = 0; i != NumOps; ++i) {
    SDValue Arg = OutVals[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    EVT ArgVT = Outs[i].VT;
    EVT OrigVT = Outs[i].ArgVT;

    // PtrOff will be used to store the current argument to the stack if a
    // register cannot be found for it.
    SDValue PtrOff;

    // We re-align the argument offset for each argument, except when using the
    // fast calling convention, when we need to make sure we do that only when
    // we'll actually use a stack slot.
    auto ComputePtrOff = [&]() {
      /* Respect alignment of argument on the stack.  */
      unsigned Align =
        CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
      ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;

      PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());

      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
    };

    if (CallConv != CallingConv::Fast) {
      ComputePtrOff();

      /* Compute GPR index associated with argument offset.  */
      GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
      GPR_idx = std::min(GPR_idx, NumGPRs);
    }

    // Promote integers to 64-bit values.
    if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
    }

    // FIXME memcpy is used way more than necessary.  Correctness first.
    // Note: "by value" is code for passing a structure by value, not
    // basic types.
    if (Flags.isByVal()) {
      // Note: Size includes alignment padding, so
      //   struct x { short a; char b; }
      // will have Size = 4.  With #pragma pack(1), it will have Size = 3.
      // These are the proper values we need for right-justifying the
      // aggregate in a parameter register.
      unsigned Size = Flags.getByValSize();

      // An empty aggregate parameter takes up no storage and no
      // registers.
      if (Size == 0)
        continue;

      if (CallConv == CallingConv::Fast)
        ComputePtrOff();

      // All aggregates smaller than 8 bytes must be passed right-justified.
      if (Size==1 || Size==2 || Size==4) {
        EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
        if (GPR_idx != NumGPRs) {
          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
                                        MachinePointerInfo(), VT);
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));

          ArgOffset += PtrByteSize;
          continue;
        }
      }

      if (GPR_idx == NumGPRs && Size < 8) {
        // Right-justify the copy within its doubleword slot on big-endian.
        SDValue AddPtr = PtrOff;
        if (!isLittleEndian) {
          SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
                                          PtrOff.getValueType());
          AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
        }
        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
                                                          CallSeqStart,
                                                          Flags, DAG, dl);
        ArgOffset += PtrByteSize;
        continue;
      }
      // Copy entire object into memory.  There are cases where gcc-generated
      // code assumes it is there, even if it could be put entirely into
      // registers.  (This is not what the doc says.)

      // FIXME: The above statement is likely due to a misunderstanding of the
      // documents.  All arguments must be copied into the parameter area BY
      // THE CALLEE in the event that the callee takes the address of any
      // formal argument.  That has not yet been implemented.  However, it is
      // reasonable to use the stack area as a staging area for the register
      // load.

      // Skip this for small aggregates, as we will use the same slot for a
      // right-justified copy, below.
      if (Size >= 8)
        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
                                                          CallSeqStart,
                                                          Flags, DAG, dl);

      // When a register is available, pass a small aggregate right-justified.
      if (Size < 8 && GPR_idx != NumGPRs) {
        // The easiest way to get this right-justified in a register
        // is to copy the structure into the rightmost portion of a
        // local variable slot, then load the whole slot into the
        // register.
5873 // FIXME: The memcpy seems to produce pretty awful code for 5874 // small aggregates, particularly for packed ones. 5875 // FIXME: It would be preferable to use the slot in the 5876 // parameter save area instead of a new local variable. 5877 SDValue AddPtr = PtrOff; 5878 if (!isLittleEndian) { 5879 SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType()); 5880 AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); 5881 } 5882 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, 5883 CallSeqStart, 5884 Flags, DAG, dl); 5885 5886 // Load the slot into the register. 5887 SDValue Load = 5888 DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo()); 5889 MemOpChains.push_back(Load.getValue(1)); 5890 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 5891 5892 // Done with this argument. 5893 ArgOffset += PtrByteSize; 5894 continue; 5895 } 5896 5897 // For aggregates larger than PtrByteSize, copy the pieces of the 5898 // object that fit into registers from the parameter save area. 5899 for (unsigned j=0; j<Size; j+=PtrByteSize) { 5900 SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType()); 5901 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 5902 if (GPR_idx != NumGPRs) { 5903 SDValue Load = 5904 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo()); 5905 MemOpChains.push_back(Load.getValue(1)); 5906 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 5907 ArgOffset += PtrByteSize; 5908 } else { 5909 ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize; 5910 break; 5911 } 5912 } 5913 continue; 5914 } 5915 5916 switch (Arg.getSimpleValueType().SimpleTy) { 5917 default: llvm_unreachable("Unexpected ValueType for argument!"); 5918 case MVT::i1: 5919 case MVT::i32: 5920 case MVT::i64: 5921 if (Flags.isNest()) { 5922 // The 'nest' parameter, if any, is passed in R11. 
5923 RegsToPass.push_back(std::make_pair(PPC::X11, Arg)); 5924 hasNest = true; 5925 break; 5926 } 5927 5928 // These can be scalar arguments or elements of an integer array type 5929 // passed directly. Clang may use those instead of "byval" aggregate 5930 // types to avoid forcing arguments to memory unnecessarily. 5931 if (GPR_idx != NumGPRs) { 5932 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); 5933 } else { 5934 if (CallConv == CallingConv::Fast) 5935 ComputePtrOff(); 5936 5937 assert(HasParameterArea && 5938 "Parameter area must exist to pass an argument in memory."); 5939 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 5940 true, isTailCall, false, MemOpChains, 5941 TailCallArguments, dl); 5942 if (CallConv == CallingConv::Fast) 5943 ArgOffset += PtrByteSize; 5944 } 5945 if (CallConv != CallingConv::Fast) 5946 ArgOffset += PtrByteSize; 5947 break; 5948 case MVT::f32: 5949 case MVT::f64: { 5950 // These can be scalar arguments or elements of a float array type 5951 // passed directly. The latter are used to implement ELFv2 homogenous 5952 // float aggregates. 5953 5954 // Named arguments go into FPRs first, and once they overflow, the 5955 // remaining arguments go into GPRs and then the parameter save area. 5956 // Unnamed arguments for vararg functions always go to GPRs and 5957 // then the parameter save area. For now, put all arguments to vararg 5958 // routines always in both locations (FPR *and* GPR or stack slot). 5959 bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs; 5960 bool NeededLoad = false; 5961 5962 // First load the argument into the next available FPR. 5963 if (FPR_idx != NumFPRs) 5964 RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); 5965 5966 // Next, load the argument into GPR or stack slot if needed. 
5967 if (!NeedGPROrStack) 5968 ; 5969 else if (GPR_idx != NumGPRs && CallConv != CallingConv::Fast) { 5970 // FIXME: We may want to re-enable this for CallingConv::Fast on the P8 5971 // once we support fp <-> gpr moves. 5972 5973 // In the non-vararg case, this can only ever happen in the 5974 // presence of f32 array types, since otherwise we never run 5975 // out of FPRs before running out of GPRs. 5976 SDValue ArgVal; 5977 5978 // Double values are always passed in a single GPR. 5979 if (Arg.getValueType() != MVT::f32) { 5980 ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 5981 5982 // Non-array float values are extended and passed in a GPR. 5983 } else if (!Flags.isInConsecutiveRegs()) { 5984 ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); 5985 ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); 5986 5987 // If we have an array of floats, we collect every odd element 5988 // together with its predecessor into one GPR. 5989 } else if (ArgOffset % PtrByteSize != 0) { 5990 SDValue Lo, Hi; 5991 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]); 5992 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); 5993 if (!isLittleEndian) 5994 std::swap(Lo, Hi); 5995 ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 5996 5997 // The final element, if even, goes into the first half of a GPR. 5998 } else if (Flags.isInConsecutiveRegsLast()) { 5999 ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); 6000 ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); 6001 if (!isLittleEndian) 6002 ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal, 6003 DAG.getConstant(32, dl, MVT::i32)); 6004 6005 // Non-final even elements are skipped; they will be handled 6006 // together the with subsequent argument on the next go-around. 
6007 } else 6008 ArgVal = SDValue(); 6009 6010 if (ArgVal.getNode()) 6011 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal)); 6012 } else { 6013 if (CallConv == CallingConv::Fast) 6014 ComputePtrOff(); 6015 6016 // Single-precision floating-point values are mapped to the 6017 // second (rightmost) word of the stack doubleword. 6018 if (Arg.getValueType() == MVT::f32 && 6019 !isLittleEndian && !Flags.isInConsecutiveRegs()) { 6020 SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType()); 6021 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); 6022 } 6023 6024 assert(HasParameterArea && 6025 "Parameter area must exist to pass an argument in memory."); 6026 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6027 true, isTailCall, false, MemOpChains, 6028 TailCallArguments, dl); 6029 6030 NeededLoad = true; 6031 } 6032 // When passing an array of floats, the array occupies consecutive 6033 // space in the argument area; only round up to the next doubleword 6034 // at the end of the array. Otherwise, each float takes 8 bytes. 6035 if (CallConv != CallingConv::Fast || NeededLoad) { 6036 ArgOffset += (Arg.getValueType() == MVT::f32 && 6037 Flags.isInConsecutiveRegs()) ? 4 : 8; 6038 if (Flags.isInConsecutiveRegsLast()) 6039 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 6040 } 6041 break; 6042 } 6043 case MVT::v4f32: 6044 case MVT::v4i32: 6045 case MVT::v8i16: 6046 case MVT::v16i8: 6047 case MVT::v2f64: 6048 case MVT::v2i64: 6049 case MVT::v1i128: 6050 case MVT::f128: 6051 if (!Subtarget.hasQPX()) { 6052 // These can be scalar arguments or elements of a vector array type 6053 // passed directly. The latter are used to implement ELFv2 homogenous 6054 // vector aggregates. 6055 6056 // For a varargs call, named arguments go into VRs or on the stack as 6057 // usual; unnamed arguments always go to the stack or the corresponding 6058 // GPRs when within range. 
For now, we always put the value in both 6059 // locations (or even all three). 6060 if (isVarArg) { 6061 assert(HasParameterArea && 6062 "Parameter area must exist if we have a varargs call."); 6063 // We could elide this store in the case where the object fits 6064 // entirely in R registers. Maybe later. 6065 SDValue Store = 6066 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); 6067 MemOpChains.push_back(Store); 6068 if (VR_idx != NumVRs) { 6069 SDValue Load = 6070 DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo()); 6071 MemOpChains.push_back(Load.getValue(1)); 6072 RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load)); 6073 } 6074 ArgOffset += 16; 6075 for (unsigned i=0; i<16; i+=PtrByteSize) { 6076 if (GPR_idx == NumGPRs) 6077 break; 6078 SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, 6079 DAG.getConstant(i, dl, PtrVT)); 6080 SDValue Load = 6081 DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); 6082 MemOpChains.push_back(Load.getValue(1)); 6083 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6084 } 6085 break; 6086 } 6087 6088 // Non-varargs Altivec params go into VRs or on the stack. 
6089 if (VR_idx != NumVRs) { 6090 RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg)); 6091 } else { 6092 if (CallConv == CallingConv::Fast) 6093 ComputePtrOff(); 6094 6095 assert(HasParameterArea && 6096 "Parameter area must exist to pass an argument in memory."); 6097 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6098 true, isTailCall, true, MemOpChains, 6099 TailCallArguments, dl); 6100 if (CallConv == CallingConv::Fast) 6101 ArgOffset += 16; 6102 } 6103 6104 if (CallConv != CallingConv::Fast) 6105 ArgOffset += 16; 6106 break; 6107 } // not QPX 6108 6109 assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 && 6110 "Invalid QPX parameter type"); 6111 6112 LLVM_FALLTHROUGH; 6113 case MVT::v4f64: 6114 case MVT::v4i1: { 6115 bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32; 6116 if (isVarArg) { 6117 assert(HasParameterArea && 6118 "Parameter area must exist if we have a varargs call."); 6119 // We could elide this store in the case where the object fits 6120 // entirely in R registers. Maybe later. 6121 SDValue Store = 6122 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); 6123 MemOpChains.push_back(Store); 6124 if (QFPR_idx != NumQFPRs) { 6125 SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store, 6126 PtrOff, MachinePointerInfo()); 6127 MemOpChains.push_back(Load.getValue(1)); 6128 RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load)); 6129 } 6130 ArgOffset += (IsF32 ? 16 : 32); 6131 for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) { 6132 if (GPR_idx == NumGPRs) 6133 break; 6134 SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, 6135 DAG.getConstant(i, dl, PtrVT)); 6136 SDValue Load = 6137 DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); 6138 MemOpChains.push_back(Load.getValue(1)); 6139 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6140 } 6141 break; 6142 } 6143 6144 // Non-varargs QPX params go into registers or on the stack. 
6145 if (QFPR_idx != NumQFPRs) { 6146 RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg)); 6147 } else { 6148 if (CallConv == CallingConv::Fast) 6149 ComputePtrOff(); 6150 6151 assert(HasParameterArea && 6152 "Parameter area must exist to pass an argument in memory."); 6153 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6154 true, isTailCall, true, MemOpChains, 6155 TailCallArguments, dl); 6156 if (CallConv == CallingConv::Fast) 6157 ArgOffset += (IsF32 ? 16 : 32); 6158 } 6159 6160 if (CallConv != CallingConv::Fast) 6161 ArgOffset += (IsF32 ? 16 : 32); 6162 break; 6163 } 6164 } 6165 } 6166 6167 assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) && 6168 "mismatch in size of parameter area"); 6169 (void)NumBytesActuallyUsed; 6170 6171 if (!MemOpChains.empty()) 6172 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 6173 6174 // Check if this is an indirect call (MTCTR/BCTRL). 6175 // See PrepareCall() for more information about calls through function 6176 // pointers in the 64-bit SVR4 ABI. 6177 if (!isTailCall && !isPatchPoint && 6178 !isFunctionGlobalAddress(Callee) && 6179 !isa<ExternalSymbolSDNode>(Callee)) { 6180 // Load r2 into a virtual register and store it to the TOC save area. 6181 setUsesTOCBasePtr(DAG); 6182 SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64); 6183 // TOC save area offset. 6184 unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); 6185 SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); 6186 SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); 6187 Chain = DAG.getStore( 6188 Val.getValue(1), dl, Val, AddPtr, 6189 MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset)); 6190 // In the ELFv2 ABI, R12 must contain the address of an indirect callee. 6191 // This does not mean the MTCTR instruction must use R12; it's easier 6192 // to model this as an extra parameter, so do that. 
    // In ELFv2, the indirect callee's address additionally travels in R12.
    if (isELFv2ABI && !isPatchPoint)
      RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  if (isTailCall && !IsSibCall)
    PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
                    TailCallArguments);

  return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest,
                    DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee,
                    SPDiff, NumBytes, Ins, InVals, CS);
}

// Lower an outgoing call using the Darwin PowerPC ABI (32- or 64-bit).
// Computes the size of the parameter save area, assigns each outgoing
// argument to GPRs/FPRs/VRs and/or a stack slot (emitting the stores and
// register copies as it goes), and finishes by emitting the call itself.
// Non-varargs Altivec arguments on 32-bit targets are handled last, since
// their stack space follows all non-Altivec parameters.
SDValue PPCTargetLowering::LowerCall_Darwin(
    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
    bool isTailCall, bool isPatchPoint,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    ImmutableCallSite CS) const {
  unsigned NumOps = Outs.size();

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  bool isPPC64 = PtrVT == MVT::i64;
  unsigned PtrByteSize = isPPC64 ? 8 : 4;

  MachineFunction &MF = DAG.getMachineFunction();

  // Mark this function as potentially containing a function that contains a
  // tail call. As a consequence the frame pointer will be used for dynamicalloc
  // and restoring the callers stack pointer in this functions epilog. This is
  // done because by tail calling the called function might overwrite the value
  // in this function's (MF) stack pointer stack slot 0(SP).
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

  // Count how many bytes are to be pushed on the stack, including the linkage
  // area, and parameter passing area.  We start with 24/48 bytes, which is
  // prereserved space for [SP][CR][LR][3 x unused].
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  unsigned NumBytes = LinkageSize;

  // Add up all the space actually used.
  // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
  // they all go in registers, but we must reserve stack space for them for
  // possible use by the caller.  In varargs or 64-bit calls, parameters are
  // assigned stack space in order, with padding so Altivec parameters are
  // 16-byte aligned.
  unsigned nAltivecParamsAtEnd = 0;
  for (unsigned i = 0; i != NumOps; ++i) {
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    EVT ArgVT = Outs[i].VT;
    // Varargs Altivec parameters are padded to a 16 byte boundary.
    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
      if (!isVarArg && !isPPC64) {
        // Non-varargs Altivec parameters go after all the non-Altivec
        // parameters; handle those later so we know how much padding we need.
        nAltivecParamsAtEnd++;
        continue;
      }
      // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
      NumBytes = ((NumBytes+15)/16)*16;
    }
    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
  }

  // Allow for Altivec parameters at the end, if needed.  They are 16-byte
  // aligned and each occupies 16 bytes.
  if (nAltivecParamsAtEnd) {
    NumBytes = ((NumBytes+15)/16)*16;
    NumBytes += 16*nAltivecParamsAtEnd;
  }

  // The prolog code of the callee may store up to 8 GPR argument registers to
  // the stack, allowing va_start to index over them in memory if its varargs.
  // Because we cannot tell if this is needed on the caller side, we have to
  // conservatively assume that it is needed.  As such, make sure we have at
  // least enough stack space for the caller to store the 8 GPRs.
  NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);

  // Tail call needs the stack to be aligned.
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);

  // Calculate by how many bytes the stack has to be adjusted in case of tail
  // call optimization.
  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);

  // To protect arguments on the stack from being clobbered in a tail call,
  // force all the loads to happen before doing any other lowering.
  if (isTailCall)
    Chain = DAG.getStackArgumentTokenFactor(Chain);

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
  SDValue CallSeqStart = Chain;

  // Load the return address and frame pointer so it can be move somewhere else
  // later.
  SDValue LROp, FPOp;
  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr;
  if (isPPC64)
    StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
  else
    StackPtr = DAG.getRegister(PPC::R1, MVT::i32);

  // Figure out which arguments are going to go in registers, and which in
  // memory.  Also, if this is a vararg function, floating point operations
  // must be stored to our stack, and loaded into integer regs as well, if
  // any integer regs are available for argument passing.
  unsigned ArgOffset = LinkageSize;
  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;

  static const MCPhysReg GPR_32[] = {           // 32-bit registers.
    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
  };
  static const MCPhysReg GPR_64[] = {           // 64-bit registers.
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };
  // NOTE(review): the FPR[] argument-register table used below is presumably
  // declared at file scope (it is not defined in this function) — Darwin
  // passes FP arguments in 13 FPRs, hence NumFPRs = 13.
  const unsigned NumGPRs = array_lengthof(GPR_32);
  const unsigned NumFPRs = 13;
  const unsigned NumVRs = array_lengthof(VR);

  const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;

  SmallVector<SDValue, 8> MemOpChains;
  for (unsigned i = 0; i != NumOps; ++i) {
    SDValue Arg = OutVals[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;

    // PtrOff will be used to store the current argument to the stack if a
    // register cannot be found for it.
    SDValue PtrOff;

    PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());

    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);

    // On PPC64, promote integers to 64-bit values.
    if (isPPC64 && Arg.getValueType() == MVT::i32) {
      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
    }

    // FIXME memcpy is used way more than necessary.  Correctness first.
    // Note: "by value" is code for passing a structure by value, not
    // basic types.
    if (Flags.isByVal()) {
      unsigned Size = Flags.getByValSize();
      // Very small objects are passed right-justified.  Everything else is
      // passed left-justified.
      if (Size==1 || Size==2) {
        EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
        if (GPR_idx != NumGPRs) {
          // Load the 1/2-byte aggregate zero/sign-agnostically and pass it
          // in the next GPR.
          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
                                        MachinePointerInfo(), VT);
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));

          ArgOffset += PtrByteSize;
        } else {
          // No GPR left: memcpy the aggregate right-justified into its
          // stack slot.
          SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
                                          PtrOff.getValueType());
          SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
          Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
                                                            CallSeqStart,
                                                            Flags, DAG, dl);
          ArgOffset += PtrByteSize;
        }
        continue;
      }
      // Copy entire object into memory.  There are cases where gcc-generated
      // code assumes it is there, even if it could be put entirely into
      // registers.  (This is not what the doc says.)
      Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
                                                        CallSeqStart,
                                                        Flags, DAG, dl);

      // For small aggregates (Darwin only) and aggregates >= PtrByteSize,
      // copy the pieces of the object that fit into registers from the
      // parameter save area.
      for (unsigned j=0; j<Size; j+=PtrByteSize) {
        SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
        if (GPR_idx != NumGPRs) {
          SDValue Load =
              DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
          ArgOffset += PtrByteSize;
        } else {
          // Out of GPRs: the remainder stays in memory; account for the
          // rounded-up tail of the aggregate and stop.
          ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
          break;
        }
      }
      continue;
    }

    switch (Arg.getSimpleValueType().SimpleTy) {
    default: llvm_unreachable("Unexpected ValueType for argument!");
    case MVT::i1:
    case MVT::i32:
    case MVT::i64:
      if (GPR_idx != NumGPRs) {
        if (Arg.getValueType() == MVT::i1)
          Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg);

        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
      } else {
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         isPPC64, isTailCall, false, MemOpChains,
                         TailCallArguments, dl);
      }
      // A parameter save slot is reserved even when the value went to a GPR.
      ArgOffset += PtrByteSize;
      break;
    case MVT::f32:
    case MVT::f64:
      if (FPR_idx != NumFPRs) {
        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));

        if (isVarArg) {
          // Varargs FP values are additionally spilled so the callee can
          // re-load them from memory or from the shadowed GPRs.
          SDValue Store =
              DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
          MemOpChains.push_back(Store);

          // Float varargs are always shadowed in available integer registers
          if (GPR_idx != NumGPRs) {
            SDValue Load =
                DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
            MemOpChains.push_back(Load.getValue(1));
            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
          }
          // A 32-bit f64 needs a second GPR for its low word.
          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){
            SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
            PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
            SDValue Load =
                DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
            MemOpChains.push_back(Load.getValue(1));
            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
          }
        } else {
          // If we have any FPRs remaining, we may also have GPRs remaining.
          // Args passed in FPRs consume either 1 (f32) or 2 (f64) available
          // GPRs.
          if (GPR_idx != NumGPRs)
            ++GPR_idx;
          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
              !isPPC64)  // PPC64 has 64-bit GPR's obviously :)
            ++GPR_idx;
        }
      } else
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         isPPC64, isTailCall, false, MemOpChains,
                         TailCallArguments, dl);
      if (isPPC64)
        ArgOffset += 8;
      else
        ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
      break;
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
      if (isVarArg) {
        // These go aligned on the stack, or in the corresponding R registers
        // when within range.  The Darwin PPC ABI doc claims they also go in
        // V registers; in fact gcc does this only for arguments that are
        // prototyped, not for those that match the ...  We do it for all
        // arguments, seems to work.
        while (ArgOffset % 16 !=0) {
          // Padding consumes GPRs too, in lock-step with the offset.
          ArgOffset += PtrByteSize;
          if (GPR_idx != NumGPRs)
            GPR_idx++;
        }
        // We could elide this store in the case where the object fits
        // entirely in R registers.  Maybe later.
        PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
                             DAG.getConstant(ArgOffset, dl, PtrVT));
        SDValue Store =
            DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
        MemOpChains.push_back(Store);
        if (VR_idx != NumVRs) {
          SDValue Load =
              DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
        }
        ArgOffset += 16;
        // Shadow the vector in GPRs as well, word by word, while any remain.
        for (unsigned i=0; i<16; i+=PtrByteSize) {
          if (GPR_idx == NumGPRs)
            break;
          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
                                   DAG.getConstant(i, dl, PtrVT));
          SDValue Load =
              DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
        }
        break;
      }

      // Non-varargs Altivec params generally go in registers, but have
      // stack space allocated at the end.
      if (VR_idx != NumVRs) {
        // Doesn't have GPR space allocated.
        RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
      } else if (nAltivecParamsAtEnd==0) {
        // We are emitting Altivec params in order.
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         isPPC64, isTailCall, true, MemOpChains,
                         TailCallArguments, dl);
        ArgOffset += 16;
      }
      break;
    }
  }
  // If all Altivec parameters fit in registers, as they usually do,
  // they get stack space following the non-Altivec parameters.  We
  // don't track this here because nobody below needs it.
  // If there are more Altivec parameters than fit in registers emit
  // the stores here.
  if (!isVarArg && nAltivecParamsAtEnd > NumVRs) {
    unsigned j = 0;
    // Offset is aligned; skip 1st 12 params which go in V registers.
    ArgOffset = ((ArgOffset+15)/16)*16;
    ArgOffset += 12*16;
    for (unsigned i = 0; i != NumOps; ++i) {
      SDValue Arg = OutVals[i];
      EVT ArgType = Outs[i].VT;
      if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 ||
          ArgType==MVT::v8i16 || ArgType==MVT::v16i8) {
        // Only Altivec params beyond the NumVRs that went to V registers
        // need a store here.
        if (++j > NumVRs) {
          // NOTE(review): PtrOff is deliberately left null here; the slot
          // address is presumably derived from ArgOffset/SPDiff inside
          // LowerMemOpCallTo — confirm against its definition.
          SDValue PtrOff;
          // We are emitting Altivec params in order.
          LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                           isPPC64, isTailCall, true, MemOpChains,
                           TailCallArguments, dl);
          ArgOffset += 16;
        }
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // On Darwin, R12 must contain the address of an indirect callee.  This does
  // not mean the MTCTR instruction must use R12; it's easier to model this as
  // an extra parameter, so do that.
  if (!isTailCall &&
      !isFunctionGlobalAddress(Callee) &&
      !isa<ExternalSymbolSDNode>(Callee) &&
      !isBLACompatibleAddress(Callee, DAG))
    RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
                                                   PPC::R12), Callee));

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
6580 SDValue InFlag; 6581 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 6582 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 6583 RegsToPass[i].second, InFlag); 6584 InFlag = Chain.getValue(1); 6585 } 6586 6587 if (isTailCall) 6588 PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, 6589 TailCallArguments); 6590 6591 return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, 6592 /* unused except on PPC64 ELFv1 */ false, DAG, 6593 RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, 6594 NumBytes, Ins, InVals, CS); 6595 } 6596 6597 bool 6598 PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 6599 MachineFunction &MF, bool isVarArg, 6600 const SmallVectorImpl<ISD::OutputArg> &Outs, 6601 LLVMContext &Context) const { 6602 SmallVector<CCValAssign, 16> RVLocs; 6603 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 6604 return CCInfo.CheckReturn( 6605 Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold) 6606 ? RetCC_PPC_Cold 6607 : RetCC_PPC); 6608 } 6609 6610 SDValue 6611 PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 6612 bool isVarArg, 6613 const SmallVectorImpl<ISD::OutputArg> &Outs, 6614 const SmallVectorImpl<SDValue> &OutVals, 6615 const SDLoc &dl, SelectionDAG &DAG) const { 6616 SmallVector<CCValAssign, 16> RVLocs; 6617 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 6618 *DAG.getContext()); 6619 CCInfo.AnalyzeReturn(Outs, 6620 (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold) 6621 ? RetCC_PPC_Cold 6622 : RetCC_PPC); 6623 6624 SDValue Flag; 6625 SmallVector<SDValue, 4> RetOps(1, Chain); 6626 6627 // Copy the result values into the output registers. 
6628 for (unsigned i = 0; i != RVLocs.size(); ++i) { 6629 CCValAssign &VA = RVLocs[i]; 6630 assert(VA.isRegLoc() && "Can only return in registers!"); 6631 6632 SDValue Arg = OutVals[i]; 6633 6634 switch (VA.getLocInfo()) { 6635 default: llvm_unreachable("Unknown loc info!"); 6636 case CCValAssign::Full: break; 6637 case CCValAssign::AExt: 6638 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 6639 break; 6640 case CCValAssign::ZExt: 6641 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 6642 break; 6643 case CCValAssign::SExt: 6644 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 6645 break; 6646 } 6647 6648 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 6649 Flag = Chain.getValue(1); 6650 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 6651 } 6652 6653 const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); 6654 const MCPhysReg *I = 6655 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 6656 if (I) { 6657 for (; *I; ++I) { 6658 6659 if (PPC::G8RCRegClass.contains(*I)) 6660 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 6661 else if (PPC::F8RCRegClass.contains(*I)) 6662 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 6663 else if (PPC::CRRCRegClass.contains(*I)) 6664 RetOps.push_back(DAG.getRegister(*I, MVT::i1)); 6665 else if (PPC::VRRCRegClass.contains(*I)) 6666 RetOps.push_back(DAG.getRegister(*I, MVT::Other)); 6667 else 6668 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 6669 } 6670 } 6671 6672 RetOps[0] = Chain; // Update chain. 6673 6674 // Add the flag if we have it. 6675 if (Flag.getNode()) 6676 RetOps.push_back(Flag); 6677 6678 return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps); 6679 } 6680 6681 SDValue 6682 PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, 6683 SelectionDAG &DAG) const { 6684 SDLoc dl(Op); 6685 6686 // Get the correct type for integers. 6687 EVT IntVT = Op.getValueType(); 6688 6689 // Get the inputs. 
6690 SDValue Chain = Op.getOperand(0); 6691 SDValue FPSIdx = getFramePointerFrameIndex(DAG); 6692 // Build a DYNAREAOFFSET node. 6693 SDValue Ops[2] = {Chain, FPSIdx}; 6694 SDVTList VTs = DAG.getVTList(IntVT); 6695 return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops); 6696 } 6697 6698 SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, 6699 SelectionDAG &DAG) const { 6700 // When we pop the dynamic allocation we need to restore the SP link. 6701 SDLoc dl(Op); 6702 6703 // Get the correct type for pointers. 6704 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 6705 6706 // Construct the stack pointer operand. 6707 bool isPPC64 = Subtarget.isPPC64(); 6708 unsigned SP = isPPC64 ? PPC::X1 : PPC::R1; 6709 SDValue StackPtr = DAG.getRegister(SP, PtrVT); 6710 6711 // Get the operands for the STACKRESTORE. 6712 SDValue Chain = Op.getOperand(0); 6713 SDValue SaveSP = Op.getOperand(1); 6714 6715 // Load the old link SP. 6716 SDValue LoadLinkSP = 6717 DAG.getLoad(PtrVT, dl, Chain, StackPtr, MachinePointerInfo()); 6718 6719 // Restore the stack pointer. 6720 Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP); 6721 6722 // Store the old link SP. 6723 return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo()); 6724 } 6725 6726 SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const { 6727 MachineFunction &MF = DAG.getMachineFunction(); 6728 bool isPPC64 = Subtarget.isPPC64(); 6729 EVT PtrVT = getPointerTy(MF.getDataLayout()); 6730 6731 // Get current frame pointer save index. The users of this index will be 6732 // primarily DYNALLOC instructions. 6733 PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); 6734 int RASI = FI->getReturnAddrSaveIndex(); 6735 6736 // If the frame pointer save index hasn't been defined yet. 6737 if (!RASI) { 6738 // Find out what the fix offset of the frame pointer save area. 
6739 int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset(); 6740 // Allocate the frame index for frame pointer save area. 6741 RASI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, LROffset, false); 6742 // Save the result. 6743 FI->setReturnAddrSaveIndex(RASI); 6744 } 6745 return DAG.getFrameIndex(RASI, PtrVT); 6746 } 6747 6748 SDValue 6749 PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const { 6750 MachineFunction &MF = DAG.getMachineFunction(); 6751 bool isPPC64 = Subtarget.isPPC64(); 6752 EVT PtrVT = getPointerTy(MF.getDataLayout()); 6753 6754 // Get current frame pointer save index. The users of this index will be 6755 // primarily DYNALLOC instructions. 6756 PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); 6757 int FPSI = FI->getFramePointerSaveIndex(); 6758 6759 // If the frame pointer save index hasn't been defined yet. 6760 if (!FPSI) { 6761 // Find out what the fix offset of the frame pointer save area. 6762 int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset(); 6763 // Allocate the frame index for frame pointer save area. 6764 FPSI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, FPOffset, true); 6765 // Save the result. 6766 FI->setFramePointerSaveIndex(FPSI); 6767 } 6768 return DAG.getFrameIndex(FPSI, PtrVT); 6769 } 6770 6771 SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 6772 SelectionDAG &DAG) const { 6773 // Get the inputs. 6774 SDValue Chain = Op.getOperand(0); 6775 SDValue Size = Op.getOperand(1); 6776 SDLoc dl(Op); 6777 6778 // Get the correct type for pointers. 6779 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 6780 // Negate the size. 6781 SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT, 6782 DAG.getConstant(0, dl, PtrVT), Size); 6783 // Construct a node for the frame pointer save index. 6784 SDValue FPSIdx = getFramePointerFrameIndex(DAG); 6785 // Build a DYNALLOC node. 
6786 SDValue Ops[3] = { Chain, NegSize, FPSIdx }; 6787 SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other); 6788 return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops); 6789 } 6790 6791 SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op, 6792 SelectionDAG &DAG) const { 6793 MachineFunction &MF = DAG.getMachineFunction(); 6794 6795 bool isPPC64 = Subtarget.isPPC64(); 6796 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 6797 6798 int FI = MF.getFrameInfo().CreateFixedObject(isPPC64 ? 8 : 4, 0, false); 6799 return DAG.getFrameIndex(FI, PtrVT); 6800 } 6801 6802 SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, 6803 SelectionDAG &DAG) const { 6804 SDLoc DL(Op); 6805 return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL, 6806 DAG.getVTList(MVT::i32, MVT::Other), 6807 Op.getOperand(0), Op.getOperand(1)); 6808 } 6809 6810 SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, 6811 SelectionDAG &DAG) const { 6812 SDLoc DL(Op); 6813 return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other, 6814 Op.getOperand(0), Op.getOperand(1)); 6815 } 6816 6817 SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 6818 if (Op.getValueType().isVector()) 6819 return LowerVectorLoad(Op, DAG); 6820 6821 assert(Op.getValueType() == MVT::i1 && 6822 "Custom lowering only for i1 loads"); 6823 6824 // First, load 8 bits into 32 bits, then truncate to 1 bit. 
6825 6826 SDLoc dl(Op); 6827 LoadSDNode *LD = cast<LoadSDNode>(Op); 6828 6829 SDValue Chain = LD->getChain(); 6830 SDValue BasePtr = LD->getBasePtr(); 6831 MachineMemOperand *MMO = LD->getMemOperand(); 6832 6833 SDValue NewLD = 6834 DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain, 6835 BasePtr, MVT::i8, MMO); 6836 SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD); 6837 6838 SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) }; 6839 return DAG.getMergeValues(Ops, dl); 6840 } 6841 6842 SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 6843 if (Op.getOperand(1).getValueType().isVector()) 6844 return LowerVectorStore(Op, DAG); 6845 6846 assert(Op.getOperand(1).getValueType() == MVT::i1 && 6847 "Custom lowering only for i1 stores"); 6848 6849 // First, zero extend to 32 bits, then use a truncating store to 8 bits. 6850 6851 SDLoc dl(Op); 6852 StoreSDNode *ST = cast<StoreSDNode>(Op); 6853 6854 SDValue Chain = ST->getChain(); 6855 SDValue BasePtr = ST->getBasePtr(); 6856 SDValue Value = ST->getValue(); 6857 MachineMemOperand *MMO = ST->getMemOperand(); 6858 6859 Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()), 6860 Value); 6861 return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO); 6862 } 6863 6864 // FIXME: Remove this once the ANDI glue bug is fixed: 6865 SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { 6866 assert(Op.getValueType() == MVT::i1 && 6867 "Custom lowering only for i1 results"); 6868 6869 SDLoc DL(Op); 6870 return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, DL, MVT::i1, 6871 Op.getOperand(0)); 6872 } 6873 6874 /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when 6875 /// possible. 6876 SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 6877 // Not FP? Not a fsel. 
6878 if (!Op.getOperand(0).getValueType().isFloatingPoint() || 6879 !Op.getOperand(2).getValueType().isFloatingPoint()) 6880 return Op; 6881 6882 // We might be able to do better than this under some circumstances, but in 6883 // general, fsel-based lowering of select is a finite-math-only optimization. 6884 // For more information, see section F.3 of the 2.06 ISA specification. 6885 if (!DAG.getTarget().Options.NoInfsFPMath || 6886 !DAG.getTarget().Options.NoNaNsFPMath) 6887 return Op; 6888 // TODO: Propagate flags from the select rather than global settings. 6889 SDNodeFlags Flags; 6890 Flags.setNoInfs(true); 6891 Flags.setNoNaNs(true); 6892 6893 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 6894 6895 EVT ResVT = Op.getValueType(); 6896 EVT CmpVT = Op.getOperand(0).getValueType(); 6897 SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); 6898 SDValue TV = Op.getOperand(2), FV = Op.getOperand(3); 6899 SDLoc dl(Op); 6900 6901 // If the RHS of the comparison is a 0.0, we don't need to do the 6902 // subtraction at all. 6903 SDValue Sel1; 6904 if (isFloatingPointZero(RHS)) 6905 switch (CC) { 6906 default: break; // SETUO etc aren't handled by fsel. 
6907 case ISD::SETNE: 6908 std::swap(TV, FV); 6909 LLVM_FALLTHROUGH; 6910 case ISD::SETEQ: 6911 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits 6912 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); 6913 Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); 6914 if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits 6915 Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); 6916 return DAG.getNode(PPCISD::FSEL, dl, ResVT, 6917 DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV); 6918 case ISD::SETULT: 6919 case ISD::SETLT: 6920 std::swap(TV, FV); // fsel is natively setge, swap operands for setlt 6921 LLVM_FALLTHROUGH; 6922 case ISD::SETOGE: 6923 case ISD::SETGE: 6924 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits 6925 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); 6926 return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); 6927 case ISD::SETUGT: 6928 case ISD::SETGT: 6929 std::swap(TV, FV); // fsel is natively setge, swap operands for setlt 6930 LLVM_FALLTHROUGH; 6931 case ISD::SETOLE: 6932 case ISD::SETLE: 6933 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits 6934 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); 6935 return DAG.getNode(PPCISD::FSEL, dl, ResVT, 6936 DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV); 6937 } 6938 6939 SDValue Cmp; 6940 switch (CC) { 6941 default: break; // SETUO etc aren't handled by fsel. 
6942 case ISD::SETNE: 6943 std::swap(TV, FV); 6944 LLVM_FALLTHROUGH; 6945 case ISD::SETEQ: 6946 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); 6947 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6948 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6949 Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); 6950 if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits 6951 Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); 6952 return DAG.getNode(PPCISD::FSEL, dl, ResVT, 6953 DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV); 6954 case ISD::SETULT: 6955 case ISD::SETLT: 6956 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); 6957 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6958 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6959 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); 6960 case ISD::SETOGE: 6961 case ISD::SETGE: 6962 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); 6963 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6964 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6965 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); 6966 case ISD::SETUGT: 6967 case ISD::SETGT: 6968 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); 6969 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6970 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6971 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); 6972 case ISD::SETOLE: 6973 case ISD::SETLE: 6974 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); 6975 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6976 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6977 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); 6978 } 6979 return Op; 6980 } 6981 6982 void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, 6983 SelectionDAG &DAG, 6984 const SDLoc &dl) const { 6985 
assert(Op.getOperand(0).getValueType().isFloatingPoint()); 6986 SDValue Src = Op.getOperand(0); 6987 if (Src.getValueType() == MVT::f32) 6988 Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); 6989 6990 SDValue Tmp; 6991 switch (Op.getSimpleValueType().SimpleTy) { 6992 default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); 6993 case MVT::i32: 6994 Tmp = DAG.getNode( 6995 Op.getOpcode() == ISD::FP_TO_SINT 6996 ? PPCISD::FCTIWZ 6997 : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), 6998 dl, MVT::f64, Src); 6999 break; 7000 case MVT::i64: 7001 assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && 7002 "i64 FP_TO_UINT is supported only with FPCVT"); 7003 Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : 7004 PPCISD::FCTIDUZ, 7005 dl, MVT::f64, Src); 7006 break; 7007 } 7008 7009 // Convert the FP value to an int value through memory. 7010 bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() && 7011 (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); 7012 SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); 7013 int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex(); 7014 MachinePointerInfo MPI = 7015 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); 7016 7017 // Emit a store to the stack slot. 7018 SDValue Chain; 7019 if (i32Stack) { 7020 MachineFunction &MF = DAG.getMachineFunction(); 7021 MachineMemOperand *MMO = 7022 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4); 7023 SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr }; 7024 Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, 7025 DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO); 7026 } else 7027 Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI); 7028 7029 // Result is a load from the stack slot. If loading 4 bytes, make sure to 7030 // add in a bias on big endian. 
7031 if (Op.getValueType() == MVT::i32 && !i32Stack) { 7032 FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, 7033 DAG.getConstant(4, dl, FIPtr.getValueType())); 7034 MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4); 7035 } 7036 7037 RLI.Chain = Chain; 7038 RLI.Ptr = FIPtr; 7039 RLI.MPI = MPI; 7040 } 7041 7042 /// Custom lowers floating point to integer conversions to use 7043 /// the direct move instructions available in ISA 2.07 to avoid the 7044 /// need for load/store combinations. 7045 SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op, 7046 SelectionDAG &DAG, 7047 const SDLoc &dl) const { 7048 assert(Op.getOperand(0).getValueType().isFloatingPoint()); 7049 SDValue Src = Op.getOperand(0); 7050 7051 if (Src.getValueType() == MVT::f32) 7052 Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); 7053 7054 SDValue Tmp; 7055 switch (Op.getSimpleValueType().SimpleTy) { 7056 default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); 7057 case MVT::i32: 7058 Tmp = DAG.getNode( 7059 Op.getOpcode() == ISD::FP_TO_SINT 7060 ? PPCISD::FCTIWZ 7061 : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), 7062 dl, MVT::f64, Src); 7063 Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp); 7064 break; 7065 case MVT::i64: 7066 assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && 7067 "i64 FP_TO_UINT is supported only with FPCVT"); 7068 Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : 7069 PPCISD::FCTIDUZ, 7070 dl, MVT::f64, Src); 7071 Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp); 7072 break; 7073 } 7074 return Tmp; 7075 } 7076 7077 SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, 7078 const SDLoc &dl) const { 7079 7080 // FP to INT conversions are legal for f128. 
7081 if (EnableQuadPrecision && (Op->getOperand(0).getValueType() == MVT::f128)) 7082 return Op; 7083 7084 // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on 7085 // PPC (the libcall is not available). 7086 if (Op.getOperand(0).getValueType() == MVT::ppcf128) { 7087 if (Op.getValueType() == MVT::i32) { 7088 if (Op.getOpcode() == ISD::FP_TO_SINT) { 7089 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, 7090 MVT::f64, Op.getOperand(0), 7091 DAG.getIntPtrConstant(0, dl)); 7092 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, 7093 MVT::f64, Op.getOperand(0), 7094 DAG.getIntPtrConstant(1, dl)); 7095 7096 // Add the two halves of the long double in round-to-zero mode. 7097 SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi); 7098 7099 // Now use a smaller FP_TO_SINT. 7100 return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res); 7101 } 7102 if (Op.getOpcode() == ISD::FP_TO_UINT) { 7103 const uint64_t TwoE31[] = {0x41e0000000000000LL, 0}; 7104 APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31)); 7105 SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128); 7106 // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X 7107 // FIXME: generated code sucks. 7108 // TODO: Are there fast-math-flags to propagate to this FSUB? 
7109 SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128, 7110 Op.getOperand(0), Tmp); 7111 True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True); 7112 True = DAG.getNode(ISD::ADD, dl, MVT::i32, True, 7113 DAG.getConstant(0x80000000, dl, MVT::i32)); 7114 SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, 7115 Op.getOperand(0)); 7116 return DAG.getSelectCC(dl, Op.getOperand(0), Tmp, True, False, 7117 ISD::SETGE); 7118 } 7119 } 7120 7121 return SDValue(); 7122 } 7123 7124 if (Subtarget.hasDirectMove() && Subtarget.isPPC64()) 7125 return LowerFP_TO_INTDirectMove(Op, DAG, dl); 7126 7127 ReuseLoadInfo RLI; 7128 LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); 7129 7130 return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI, 7131 RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges); 7132 } 7133 7134 // We're trying to insert a regular store, S, and then a load, L. If the 7135 // incoming value, O, is a load, we might just be able to have our load use the 7136 // address used by O. However, we don't know if anything else will store to 7137 // that address before we can load from it. To prevent this situation, we need 7138 // to insert our load, L, into the chain as a peer of O. To do this, we give L 7139 // the same chain operand as O, we create a token factor from the chain results 7140 // of O and L, and we replace all uses of O's chain result with that token 7141 // factor (see spliceIntoChain below for this last part). 
bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
                                            ReuseLoadInfo &RLI,
                                            SelectionDAG &DAG,
                                            ISD::LoadExtType ET) const {
  SDLoc dl(Op);
  // Case 1: the incoming value is itself an FP-to-int conversion that we
  // custom-lower through a stack temporary (see LowerFP_TO_INTForReuse).
  // In that case we can emit the store half here and let the caller issue
  // the reload from the same slot described by RLI.
  if (ET == ISD::NON_EXTLOAD &&
      (Op.getOpcode() == ISD::FP_TO_UINT ||
       Op.getOpcode() == ISD::FP_TO_SINT) &&
      isOperationLegalOrCustom(Op.getOpcode(),
                               Op.getOperand(0).getValueType())) {

    LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
    return true;
  }

  // Case 2: the incoming value is a load. Only reuse its address if it is a
  // load of the requested extension type and memory VT, and is neither
  // volatile nor non-temporal (reordering/duplicating those would be unsafe).
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op);
  if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
      LD->isNonTemporal())
    return false;
  if (LD->getMemoryVT() != MemVT)
    return false;

  RLI.Ptr = LD->getBasePtr();
  if (LD->isIndexed() && !LD->getOffset().isUndef()) {
    // PPC only forms pre-increment loads, so the effective address of the
    // memory access is base + offset; materialize that explicitly.
    assert(LD->getAddressingMode() == ISD::PRE_INC &&
           "Non-pre-inc AM on PPC?");
    RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr,
                          LD->getOffset());
  }

  // Copy everything the caller needs to emit an equivalent load: chain,
  // pointer info, and the memory-operand attributes of the original load.
  RLI.Chain = LD->getChain();
  RLI.MPI = LD->getPointerInfo();
  RLI.IsDereferenceable = LD->isDereferenceable();
  RLI.IsInvariant = LD->isInvariant();
  RLI.Alignment = LD->getAlignment();
  RLI.AAInfo = LD->getAAInfo();
  RLI.Ranges = LD->getRanges();

  // The chain result of an indexed load is result #2 (result #1 is the
  // incremented pointer); for a normal load it is result #1. The caller
  // splices its new load next to this chain (see spliceIntoChain).
  RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);
  return true;
}

// Given the head of the old chain, ResChain, insert a token factor containing
// it and NewResChain, and make users of ResChain now be users of that token
// factor.
// TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead.
void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
                                        SDValue NewResChain,
                                        SelectionDAG &DAG) const {
  // ResChain is only set when the load address was actually reused
  // (see canReuseLoadAddress); nothing to splice otherwise.
  if (!ResChain)
    return;

  SDLoc dl(NewResChain);

  // Build the token factor with a temporary UNDEF in place of ResChain.
  // If ResChain were an operand now, the RAUW below would rewrite the token
  // factor's own operand to point at itself, creating a cycle. Instead we
  // first redirect all users of ResChain to the token factor, and only then
  // patch in the real operands.
  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                           NewResChain, DAG.getUNDEF(MVT::Other));
  assert(TF.getNode() != NewResChain.getNode() &&
         "A new TF really is required here");

  DAG.ReplaceAllUsesOfValueWith(ResChain, TF);
  DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain);
}

/// Analyze profitability of direct move
/// prefer float load to int load plus direct move
/// when there is no integer use of int load
bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
  SDNode *Origin = Op.getOperand(0).getNode();
  // If the conversion source is not a load, there is no FP-load alternative,
  // so a direct move is always the right choice.
  if (Origin->getOpcode() != ISD::LOAD)
    return true;

  // If there is no LXSIBZX/LXSIHZX, like Power8,
  // prefer direct move if the memory size is 1 or 2 bytes.
  MachineMemOperand *MMO = cast<LoadSDNode>(Origin)->getMemOperand();
  if (!Subtarget.hasP9Vector() && MMO->getSize() <= 2)
    return true;

  for (SDNode::use_iterator UI = Origin->use_begin(),
                            UE = Origin->use_end();
       UI != UE; ++UI) {

    // Only look at the users of the loaded value.
    if (UI.getUse().get().getResNo() != 0)
      continue;

    // Some user needs the integer value itself, so the integer load must be
    // emitted anyway; converting via direct move costs nothing extra.
    if (UI->getOpcode() != ISD::SINT_TO_FP &&
        UI->getOpcode() != ISD::UINT_TO_FP)
      return true;
  }

  // Every user is an int-to-FP conversion: better to load straight into an
  // FP register (e.g. LFIWAX/LFIWZX) than to do int load + direct move.
  return false;
}

/// Custom lowers integer to floating point conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
7238 SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op, 7239 SelectionDAG &DAG, 7240 const SDLoc &dl) const { 7241 assert((Op.getValueType() == MVT::f32 || 7242 Op.getValueType() == MVT::f64) && 7243 "Invalid floating point type as target of conversion"); 7244 assert(Subtarget.hasFPCVT() && 7245 "Int to FP conversions with direct moves require FPCVT"); 7246 SDValue FP; 7247 SDValue Src = Op.getOperand(0); 7248 bool SinglePrec = Op.getValueType() == MVT::f32; 7249 bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32; 7250 bool Signed = Op.getOpcode() == ISD::SINT_TO_FP; 7251 unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) : 7252 (SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU); 7253 7254 if (WordInt) { 7255 FP = DAG.getNode(Signed ? PPCISD::MTVSRA : PPCISD::MTVSRZ, 7256 dl, MVT::f64, Src); 7257 FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); 7258 } 7259 else { 7260 FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src); 7261 FP = DAG.getNode(ConvOp, dl, SinglePrec ? 
MVT::f32 : MVT::f64, FP); 7262 } 7263 7264 return FP; 7265 } 7266 7267 static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) { 7268 7269 EVT VecVT = Vec.getValueType(); 7270 assert(VecVT.isVector() && "Expected a vector type."); 7271 assert(VecVT.getSizeInBits() < 128 && "Vector is already full width."); 7272 7273 EVT EltVT = VecVT.getVectorElementType(); 7274 unsigned WideNumElts = 128 / EltVT.getSizeInBits(); 7275 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts); 7276 7277 unsigned NumConcat = WideNumElts / VecVT.getVectorNumElements(); 7278 SmallVector<SDValue, 16> Ops(NumConcat); 7279 Ops[0] = Vec; 7280 SDValue UndefVec = DAG.getUNDEF(VecVT); 7281 for (unsigned i = 1; i < NumConcat; ++i) 7282 Ops[i] = UndefVec; 7283 7284 return DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Ops); 7285 } 7286 7287 SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, 7288 SelectionDAG &DAG, 7289 const SDLoc &dl) const { 7290 7291 unsigned Opc = Op.getOpcode(); 7292 assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP) && 7293 "Unexpected conversion type"); 7294 assert(Op.getValueType() == MVT::v2f64 && "Supports v2f64 only."); 7295 7296 // CPU's prior to P9 don't have a way to sign-extend in vectors. 7297 bool SignedConv = Opc == ISD::SINT_TO_FP; 7298 if (SignedConv && !Subtarget.hasP9Altivec()) 7299 return SDValue(); 7300 7301 SDValue Wide = widenVec(DAG, Op.getOperand(0), dl); 7302 EVT WideVT = Wide.getValueType(); 7303 unsigned WideNumElts = WideVT.getVectorNumElements(); 7304 7305 SmallVector<int, 16> ShuffV; 7306 for (unsigned i = 0; i < WideNumElts; ++i) 7307 ShuffV.push_back(i + WideNumElts); 7308 7309 if (Subtarget.isLittleEndian()) { 7310 ShuffV[0] = 0; 7311 ShuffV[WideNumElts / 2] = 1; 7312 } 7313 else { 7314 ShuffV[WideNumElts / 2 - 1] = 0; 7315 ShuffV[WideNumElts - 1] = 1; 7316 } 7317 7318 SDValue ShuffleSrc2 = SignedConv ? 
DAG.getUNDEF(WideVT) : 7319 DAG.getConstant(0, dl, WideVT); 7320 SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV); 7321 unsigned ExtendOp = SignedConv ? (unsigned) PPCISD::SExtVElems : 7322 (unsigned) ISD::BITCAST; 7323 SDValue Extend = DAG.getNode(ExtendOp, dl, MVT::v2i64, Arrange); 7324 7325 return DAG.getNode(Opc, dl, Op.getValueType(), Extend); 7326 } 7327 7328 SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, 7329 SelectionDAG &DAG) const { 7330 SDLoc dl(Op); 7331 7332 if (Op.getValueType() == MVT::v2f64 && 7333 Op.getOperand(0).getValueType() == MVT::v2i16) 7334 return LowerINT_TO_FPVector(Op, DAG, dl); 7335 7336 // Conversions to f128 are legal. 7337 if (EnableQuadPrecision && (Op.getValueType() == MVT::f128)) 7338 return Op; 7339 7340 if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) { 7341 if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64) 7342 return SDValue(); 7343 7344 SDValue Value = Op.getOperand(0); 7345 // The values are now known to be -1 (false) or 1 (true). To convert this 7346 // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). 7347 // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 7348 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); 7349 7350 SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); 7351 7352 Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); 7353 7354 if (Op.getValueType() != MVT::v4f64) 7355 Value = DAG.getNode(ISD::FP_ROUND, dl, 7356 Op.getValueType(), Value, 7357 DAG.getIntPtrConstant(1, dl)); 7358 return Value; 7359 } 7360 7361 // Don't handle ppc_fp128 here; let it be lowered to a libcall. 
7362 if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) 7363 return SDValue(); 7364 7365 if (Op.getOperand(0).getValueType() == MVT::i1) 7366 return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0), 7367 DAG.getConstantFP(1.0, dl, Op.getValueType()), 7368 DAG.getConstantFP(0.0, dl, Op.getValueType())); 7369 7370 // If we have direct moves, we can do all the conversion, skip the store/load 7371 // however, without FPCVT we can't do most conversions. 7372 if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) && 7373 Subtarget.isPPC64() && Subtarget.hasFPCVT()) 7374 return LowerINT_TO_FPDirectMove(Op, DAG, dl); 7375 7376 assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && 7377 "UINT_TO_FP is supported only with FPCVT"); 7378 7379 // If we have FCFIDS, then use it when converting to single-precision. 7380 // Otherwise, convert to double-precision and then round. 7381 unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 7382 ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS 7383 : PPCISD::FCFIDS) 7384 : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU 7385 : PPCISD::FCFID); 7386 MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 7387 ? MVT::f32 7388 : MVT::f64; 7389 7390 if (Op.getOperand(0).getValueType() == MVT::i64) { 7391 SDValue SINT = Op.getOperand(0); 7392 // When converting to single-precision, we actually need to convert 7393 // to double-precision first and then round to single-precision. 7394 // To avoid double-rounding effects during that operation, we have 7395 // to prepare the input operand. Bits that might be truncated when 7396 // converting to double-precision are replaced by a bit that won't 7397 // be lost at this stage, but is below the single-precision rounding 7398 // position. 7399 // 7400 // However, if -enable-unsafe-fp-math is in effect, accept double 7401 // rounding to avoid the extra overhead. 
7402 if (Op.getValueType() == MVT::f32 && 7403 !Subtarget.hasFPCVT() && 7404 !DAG.getTarget().Options.UnsafeFPMath) { 7405 7406 // Twiddle input to make sure the low 11 bits are zero. (If this 7407 // is the case, we are guaranteed the value will fit into the 53 bit 7408 // mantissa of an IEEE double-precision value without rounding.) 7409 // If any of those low 11 bits were not zero originally, make sure 7410 // bit 12 (value 2048) is set instead, so that the final rounding 7411 // to single-precision gets the correct result. 7412 SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64, 7413 SINT, DAG.getConstant(2047, dl, MVT::i64)); 7414 Round = DAG.getNode(ISD::ADD, dl, MVT::i64, 7415 Round, DAG.getConstant(2047, dl, MVT::i64)); 7416 Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT); 7417 Round = DAG.getNode(ISD::AND, dl, MVT::i64, 7418 Round, DAG.getConstant(-2048, dl, MVT::i64)); 7419 7420 // However, we cannot use that value unconditionally: if the magnitude 7421 // of the input value is small, the bit-twiddling we did above might 7422 // end up visibly changing the output. Fortunately, in that case, we 7423 // don't need to twiddle bits since the original input will convert 7424 // exactly to double-precision floating-point already. Therefore, 7425 // construct a conditional to use the original value if the top 11 7426 // bits are all sign-bit copies, and use the rounded value computed 7427 // above otherwise. 
7428 SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64, 7429 SINT, DAG.getConstant(53, dl, MVT::i32)); 7430 Cond = DAG.getNode(ISD::ADD, dl, MVT::i64, 7431 Cond, DAG.getConstant(1, dl, MVT::i64)); 7432 Cond = DAG.getSetCC(dl, MVT::i32, 7433 Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT); 7434 7435 SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT); 7436 } 7437 7438 ReuseLoadInfo RLI; 7439 SDValue Bits; 7440 7441 MachineFunction &MF = DAG.getMachineFunction(); 7442 if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) { 7443 Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI, 7444 RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges); 7445 spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); 7446 } else if (Subtarget.hasLFIWAX() && 7447 canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) { 7448 MachineMemOperand *MMO = 7449 MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, 7450 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7451 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7452 Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl, 7453 DAG.getVTList(MVT::f64, MVT::Other), 7454 Ops, MVT::i32, MMO); 7455 spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); 7456 } else if (Subtarget.hasFPCVT() && 7457 canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) { 7458 MachineMemOperand *MMO = 7459 MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, 7460 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7461 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7462 Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl, 7463 DAG.getVTList(MVT::f64, MVT::Other), 7464 Ops, MVT::i32, MMO); 7465 spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); 7466 } else if (((Subtarget.hasLFIWAX() && 7467 SINT.getOpcode() == ISD::SIGN_EXTEND) || 7468 (Subtarget.hasFPCVT() && 7469 SINT.getOpcode() == ISD::ZERO_EXTEND)) && 7470 SINT.getOperand(0).getValueType() == MVT::i32) { 7471 MachineFrameInfo &MFI = MF.getFrameInfo(); 7472 EVT PtrVT = 
getPointerTy(DAG.getDataLayout()); 7473 7474 int FrameIdx = MFI.CreateStackObject(4, 4, false); 7475 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 7476 7477 SDValue Store = 7478 DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, 7479 MachinePointerInfo::getFixedStack( 7480 DAG.getMachineFunction(), FrameIdx)); 7481 7482 assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && 7483 "Expected an i32 store"); 7484 7485 RLI.Ptr = FIdx; 7486 RLI.Chain = Store; 7487 RLI.MPI = 7488 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 7489 RLI.Alignment = 4; 7490 7491 MachineMemOperand *MMO = 7492 MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, 7493 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7494 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7495 Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ? 7496 PPCISD::LFIWZX : PPCISD::LFIWAX, 7497 dl, DAG.getVTList(MVT::f64, MVT::Other), 7498 Ops, MVT::i32, MMO); 7499 } else 7500 Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); 7501 7502 SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); 7503 7504 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) 7505 FP = DAG.getNode(ISD::FP_ROUND, dl, 7506 MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); 7507 return FP; 7508 } 7509 7510 assert(Op.getOperand(0).getValueType() == MVT::i32 && 7511 "Unhandled INT_TO_FP type in custom expander!"); 7512 // Since we only generate this in 64-bit mode, we can take advantage of 7513 // 64-bit registers. In particular, sign extend the input value into the 7514 // 64-bit register with extsw, store the WHOLE 64-bit value into the stack 7515 // then lfd it and fcfid it. 
7516 MachineFunction &MF = DAG.getMachineFunction(); 7517 MachineFrameInfo &MFI = MF.getFrameInfo(); 7518 EVT PtrVT = getPointerTy(MF.getDataLayout()); 7519 7520 SDValue Ld; 7521 if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { 7522 ReuseLoadInfo RLI; 7523 bool ReusingLoad; 7524 if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI, 7525 DAG))) { 7526 int FrameIdx = MFI.CreateStackObject(4, 4, false); 7527 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 7528 7529 SDValue Store = 7530 DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, 7531 MachinePointerInfo::getFixedStack( 7532 DAG.getMachineFunction(), FrameIdx)); 7533 7534 assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && 7535 "Expected an i32 store"); 7536 7537 RLI.Ptr = FIdx; 7538 RLI.Chain = Store; 7539 RLI.MPI = 7540 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 7541 RLI.Alignment = 4; 7542 } 7543 7544 MachineMemOperand *MMO = 7545 MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, 7546 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7547 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7548 Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ? 7549 PPCISD::LFIWZX : PPCISD::LFIWAX, 7550 dl, DAG.getVTList(MVT::f64, MVT::Other), 7551 Ops, MVT::i32, MMO); 7552 if (ReusingLoad) 7553 spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG); 7554 } else { 7555 assert(Subtarget.isPPC64() && 7556 "i32->FP without LFIWAX supported only on PPC64"); 7557 7558 int FrameIdx = MFI.CreateStackObject(8, 8, false); 7559 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 7560 7561 SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, 7562 Op.getOperand(0)); 7563 7564 // STD the extended value into the stack slot. 7565 SDValue Store = DAG.getStore( 7566 DAG.getEntryNode(), dl, Ext64, FIdx, 7567 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); 7568 7569 // Load the value as a double. 
7570 Ld = DAG.getLoad( 7571 MVT::f64, dl, Store, FIdx, 7572 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); 7573 } 7574 7575 // FCFID it and return it. 7576 SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld); 7577 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) 7578 FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, 7579 DAG.getIntPtrConstant(0, dl)); 7580 return FP; 7581 } 7582 7583 SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 7584 SelectionDAG &DAG) const { 7585 SDLoc dl(Op); 7586 /* 7587 The rounding mode is in bits 30:31 of FPSR, and has the following 7588 settings: 7589 00 Round to nearest 7590 01 Round to 0 7591 10 Round to +inf 7592 11 Round to -inf 7593 7594 FLT_ROUNDS, on the other hand, expects the following: 7595 -1 Undefined 7596 0 Round to 0 7597 1 Round to nearest 7598 2 Round to +inf 7599 3 Round to -inf 7600 7601 To perform the conversion, we do: 7602 ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1)) 7603 */ 7604 7605 MachineFunction &MF = DAG.getMachineFunction(); 7606 EVT VT = Op.getValueType(); 7607 EVT PtrVT = getPointerTy(MF.getDataLayout()); 7608 7609 // Save FP Control Word to register 7610 EVT NodeTys[] = { 7611 MVT::f64, // return register 7612 MVT::Glue // unused in this context 7613 }; 7614 SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None); 7615 7616 // Save FP register to stack slot 7617 int SSFI = MF.getFrameInfo().CreateStackObject(8, 8, false); 7618 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); 7619 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot, 7620 MachinePointerInfo()); 7621 7622 // Load FP Control Word from low 32 bits of stack slot. 
7623 SDValue Four = DAG.getConstant(4, dl, PtrVT); 7624 SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four); 7625 SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo()); 7626 7627 // Transform as necessary 7628 SDValue CWD1 = 7629 DAG.getNode(ISD::AND, dl, MVT::i32, 7630 CWD, DAG.getConstant(3, dl, MVT::i32)); 7631 SDValue CWD2 = 7632 DAG.getNode(ISD::SRL, dl, MVT::i32, 7633 DAG.getNode(ISD::AND, dl, MVT::i32, 7634 DAG.getNode(ISD::XOR, dl, MVT::i32, 7635 CWD, DAG.getConstant(3, dl, MVT::i32)), 7636 DAG.getConstant(3, dl, MVT::i32)), 7637 DAG.getConstant(1, dl, MVT::i32)); 7638 7639 SDValue RetVal = 7640 DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2); 7641 7642 return DAG.getNode((VT.getSizeInBits() < 16 ? 7643 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 7644 } 7645 7646 SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { 7647 EVT VT = Op.getValueType(); 7648 unsigned BitWidth = VT.getSizeInBits(); 7649 SDLoc dl(Op); 7650 assert(Op.getNumOperands() == 3 && 7651 VT == Op.getOperand(1).getValueType() && 7652 "Unexpected SHL!"); 7653 7654 // Expand into a bunch of logical ops. Note that these ops 7655 // depend on the PPC behavior for oversized shift amounts. 
7656 SDValue Lo = Op.getOperand(0); 7657 SDValue Hi = Op.getOperand(1); 7658 SDValue Amt = Op.getOperand(2); 7659 EVT AmtVT = Amt.getValueType(); 7660 7661 SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, 7662 DAG.getConstant(BitWidth, dl, AmtVT), Amt); 7663 SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt); 7664 SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1); 7665 SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3); 7666 SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, 7667 DAG.getConstant(-BitWidth, dl, AmtVT)); 7668 SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5); 7669 SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); 7670 SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt); 7671 SDValue OutOps[] = { OutLo, OutHi }; 7672 return DAG.getMergeValues(OutOps, dl); 7673 } 7674 7675 SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { 7676 EVT VT = Op.getValueType(); 7677 SDLoc dl(Op); 7678 unsigned BitWidth = VT.getSizeInBits(); 7679 assert(Op.getNumOperands() == 3 && 7680 VT == Op.getOperand(1).getValueType() && 7681 "Unexpected SRL!"); 7682 7683 // Expand into a bunch of logical ops. Note that these ops 7684 // depend on the PPC behavior for oversized shift amounts. 
7685 SDValue Lo = Op.getOperand(0); 7686 SDValue Hi = Op.getOperand(1); 7687 SDValue Amt = Op.getOperand(2); 7688 EVT AmtVT = Amt.getValueType(); 7689 7690 SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, 7691 DAG.getConstant(BitWidth, dl, AmtVT), Amt); 7692 SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); 7693 SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); 7694 SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); 7695 SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, 7696 DAG.getConstant(-BitWidth, dl, AmtVT)); 7697 SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5); 7698 SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); 7699 SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt); 7700 SDValue OutOps[] = { OutLo, OutHi }; 7701 return DAG.getMergeValues(OutOps, dl); 7702 } 7703 7704 SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { 7705 SDLoc dl(Op); 7706 EVT VT = Op.getValueType(); 7707 unsigned BitWidth = VT.getSizeInBits(); 7708 assert(Op.getNumOperands() == 3 && 7709 VT == Op.getOperand(1).getValueType() && 7710 "Unexpected SRA!"); 7711 7712 // Expand into a bunch of logical ops, followed by a select_cc. 
7713 SDValue Lo = Op.getOperand(0); 7714 SDValue Hi = Op.getOperand(1); 7715 SDValue Amt = Op.getOperand(2); 7716 EVT AmtVT = Amt.getValueType(); 7717 7718 SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, 7719 DAG.getConstant(BitWidth, dl, AmtVT), Amt); 7720 SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); 7721 SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); 7722 SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); 7723 SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, 7724 DAG.getConstant(-BitWidth, dl, AmtVT)); 7725 SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5); 7726 SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt); 7727 SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT), 7728 Tmp4, Tmp6, ISD::SETLE); 7729 SDValue OutOps[] = { OutLo, OutHi }; 7730 return DAG.getMergeValues(OutOps, dl); 7731 } 7732 7733 //===----------------------------------------------------------------------===// 7734 // Vector related lowering. 7735 // 7736 7737 /// BuildSplatI - Build a canonical splati of Val with an element size of 7738 /// SplatSize. Cast the result to VT. 7739 static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT, 7740 SelectionDAG &DAG, const SDLoc &dl) { 7741 assert(Val >= -16 && Val <= 15 && "vsplti is out of range!"); 7742 7743 static const MVT VTys[] = { // canonical VT to use for each size. 7744 MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32 7745 }; 7746 7747 EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1]; 7748 7749 // Force vspltis[hw] -1 to vspltisb -1 to canonicalize. 7750 if (Val == -1) 7751 SplatSize = 1; 7752 7753 EVT CanonicalVT = VTys[SplatSize-1]; 7754 7755 // Build a canonical splat for this value. 7756 return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT)); 7757 } 7758 7759 /// BuildIntrinsicOp - Return a unary operator intrinsic node with the 7760 /// specified intrinsic ID. 
7761 static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG, 7762 const SDLoc &dl, EVT DestVT = MVT::Other) { 7763 if (DestVT == MVT::Other) DestVT = Op.getValueType(); 7764 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, 7765 DAG.getConstant(IID, dl, MVT::i32), Op); 7766 } 7767 7768 /// BuildIntrinsicOp - Return a binary operator intrinsic node with the 7769 /// specified intrinsic ID. 7770 static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS, 7771 SelectionDAG &DAG, const SDLoc &dl, 7772 EVT DestVT = MVT::Other) { 7773 if (DestVT == MVT::Other) DestVT = LHS.getValueType(); 7774 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, 7775 DAG.getConstant(IID, dl, MVT::i32), LHS, RHS); 7776 } 7777 7778 /// BuildIntrinsicOp - Return a ternary operator intrinsic node with the 7779 /// specified intrinsic ID. 7780 static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1, 7781 SDValue Op2, SelectionDAG &DAG, const SDLoc &dl, 7782 EVT DestVT = MVT::Other) { 7783 if (DestVT == MVT::Other) DestVT = Op0.getValueType(); 7784 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, 7785 DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2); 7786 } 7787 7788 /// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified 7789 /// amount. The result has the specified value type. 7790 static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT, 7791 SelectionDAG &DAG, const SDLoc &dl) { 7792 // Force LHS/RHS to be the right type. 7793 LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS); 7794 RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS); 7795 7796 int Ops[16]; 7797 for (unsigned i = 0; i != 16; ++i) 7798 Ops[i] = i + Amt; 7799 SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops); 7800 return DAG.getNode(ISD::BITCAST, dl, VT, T); 7801 } 7802 7803 /// Do we have an efficient pattern in a .td file for this node? 
7804 /// 7805 /// \param V - pointer to the BuildVectorSDNode being matched 7806 /// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves? 7807 /// 7808 /// There are some patterns where it is beneficial to keep a BUILD_VECTOR 7809 /// node as a BUILD_VECTOR node rather than expanding it. The patterns where 7810 /// the opposite is true (expansion is beneficial) are: 7811 /// - The node builds a vector out of integers that are not 32 or 64-bits 7812 /// - The node builds a vector out of constants 7813 /// - The node is a "load-and-splat" 7814 /// In all other cases, we will choose to keep the BUILD_VECTOR. 7815 static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V, 7816 bool HasDirectMove, 7817 bool HasP8Vector) { 7818 EVT VecVT = V->getValueType(0); 7819 bool RightType = VecVT == MVT::v2f64 || 7820 (HasP8Vector && VecVT == MVT::v4f32) || 7821 (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32)); 7822 if (!RightType) 7823 return false; 7824 7825 bool IsSplat = true; 7826 bool IsLoad = false; 7827 SDValue Op0 = V->getOperand(0); 7828 7829 // This function is called in a block that confirms the node is not a constant 7830 // splat. So a constant BUILD_VECTOR here means the vector is built out of 7831 // different constants. 7832 if (V->isConstant()) 7833 return false; 7834 for (int i = 0, e = V->getNumOperands(); i < e; ++i) { 7835 if (V->getOperand(i).isUndef()) 7836 return false; 7837 // We want to expand nodes that represent load-and-splat even if the 7838 // loaded value is a floating point truncation or conversion to int. 
7839 if (V->getOperand(i).getOpcode() == ISD::LOAD || 7840 (V->getOperand(i).getOpcode() == ISD::FP_ROUND && 7841 V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) || 7842 (V->getOperand(i).getOpcode() == ISD::FP_TO_SINT && 7843 V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) || 7844 (V->getOperand(i).getOpcode() == ISD::FP_TO_UINT && 7845 V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD)) 7846 IsLoad = true; 7847 // If the operands are different or the input is not a load and has more 7848 // uses than just this BV node, then it isn't a splat. 7849 if (V->getOperand(i) != Op0 || 7850 (!IsLoad && !V->isOnlyUserOf(V->getOperand(i).getNode()))) 7851 IsSplat = false; 7852 } 7853 return !(IsSplat && IsLoad); 7854 } 7855 7856 // Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128. 7857 SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { 7858 7859 SDLoc dl(Op); 7860 SDValue Op0 = Op->getOperand(0); 7861 7862 if (!EnableQuadPrecision || 7863 (Op.getValueType() != MVT::f128 ) || 7864 (Op0.getOpcode() != ISD::BUILD_PAIR) || 7865 (Op0.getOperand(0).getValueType() != MVT::i64) || 7866 (Op0.getOperand(1).getValueType() != MVT::i64)) 7867 return SDValue(); 7868 7869 return DAG.getNode(PPCISD::BUILD_FP128, dl, MVT::f128, Op0.getOperand(0), 7870 Op0.getOperand(1)); 7871 } 7872 7873 // If this is a case we can't handle, return null and let the default 7874 // expansion code take care of it. If we CAN select this case, and if it 7875 // selects to a single instruction, return Op. Otherwise, if we can codegen 7876 // this case more efficiently than a constant pool load, lower it to the 7877 // sequence of ops that should be used. 
SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc dl(Op);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");

  if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) {
    // We first build an i32 vector, load it into a QPX register,
    // then convert it to a floating-point vector and compare it
    // to a zero vector to get the boolean result.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    // 16-byte, 16-aligned stack slot used to assemble the vector in memory.
    int FrameIdx = MFI.CreateStackObject(16, 16, false);
    MachinePointerInfo PtrInfo =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

    assert(BVN->getNumOperands() == 4 &&
           "BUILD_VECTOR for v4i1 does not have 4 operands");

    // Is every defined element a ConstantSDNode? If so we can take the
    // constant-pool path below instead of going through the stack slot.
    bool IsConst = true;
    for (unsigned i = 0; i < 4; ++i) {
      if (BVN->getOperand(i).isUndef()) continue;
      if (!isa<ConstantSDNode>(BVN->getOperand(i))) {
        IsConst = false;
        break;
      }
    }

    if (IsConst) {
      // Encode each lane as a float constant in the constant pool:
      // zero -> -1.0, non-zero -> 1.0, undef -> undef float; then load the
      // vector in one QVLFSb operation.
      Constant *One =
        ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0);
      Constant *NegOne =
        ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0);

      Constant *CV[4];
      for (unsigned i = 0; i < 4; ++i) {
        if (BVN->getOperand(i).isUndef())
          CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext()));
        else if (isNullConstant(BVN->getOperand(i)))
          CV[i] = NegOne;
        else
          CV[i] = One;
      }

      Constant *CP = ConstantVector::get(CV);
      SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()),
                                          16 /* alignment */);

      SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
      SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other});
      return DAG.getMemIntrinsicNode(
          PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    }

    // Non-constant elements: store each defined lane into the stack slot as
    // an i32 (truncating wider values, any-extending narrower ones), then
    // reload the slot as a vector below.
    SmallVector<SDValue, 4> Stores;
    for (unsigned i = 0; i < 4; ++i) {
      if (BVN->getOperand(i).isUndef()) continue;

      unsigned Offset = 4*i;
      SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
      Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);

      unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize();
      if (StoreSize > 4) {
        // Element wider than i32: use a truncating store down to i32.
        Stores.push_back(
            DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx,
                              PtrInfo.getWithOffset(Offset), MVT::i32));
      } else {
        SDValue StoreValue = BVN->getOperand(i);
        if (StoreSize < 4)
          StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue);

        Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx,
                                      PtrInfo.getWithOffset(Offset)));
      }
    }

    // Combine the lane stores into a single chain (all lanes undef leaves
    // nothing to store, so fall back to the entry node).
    SDValue StoreChain;
    if (!Stores.empty())
      StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
    else
      StoreChain = DAG.getEntryNode();

    // Now load from v4i32 into the QPX register; this will extend it to
    // v4i64 but not yet convert it to a floating point. Nevertheless, this
    // is typed as v4f64 because the QPX register integer states are not
    // explicitly represented.

    SDValue Ops[] = {StoreChain,
                     DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32),
                     FIdx};
    SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other});

    SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN,
      dl, VTs, Ops, MVT::v4i32, PtrInfo);
    // Convert the loaded (unsigned) integers to floating point.
    LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
      DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32),
      LoadedVect);

    SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64);

    // Compare against zero to produce the final v4i1 boolean vector.
    return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ);
  }

  // All other QPX vectors are handled by generic code.
  if (Subtarget.hasQPX())
    return SDValue();

  // Check if this is a splat of a constant value.
  APInt APSplatBits, APSplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                             HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
      SplatBitSize > 32) {
    // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be
    // lowered to VSX instructions under certain conditions.
    // Without VSX, there is no pattern more efficient than expanding the node.
    if (Subtarget.hasVSX() &&
        haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(),
                                        Subtarget.hasP8Vector()))
      return Op;
    return SDValue();
  }

  unsigned SplatBits = APSplatBits.getZExtValue();
  unsigned SplatUndef = APSplatUndef.getZExtValue();
  unsigned SplatSize = SplatBitSize / 8; // splat element size in bytes

  // First, handle single instruction cases.

  // All zeros?
  if (SplatBits == 0) {
    // Canonicalize all zero vectors to be v4i32.
    if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
      SDValue Z = DAG.getConstant(0, dl, MVT::v4i32);
      Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);
    }
    return Op;
  }

  // We have XXSPLTIB for constant splats one byte wide.
  if (Subtarget.hasP9Vector() && SplatSize == 1) {
    // This is a splat of 1-byte elements with some elements potentially undef.
    // Rather than trying to match undef in the SDAG patterns, ensure that all
    // elements are the same constant.
    if (HasAnyUndefs || ISD::isBuildVectorAllOnes(BVN)) {
      SmallVector<SDValue, 16> Ops(16, DAG.getConstant(SplatBits,
                                                       dl, MVT::i32));
      SDValue NewBV = DAG.getBuildVector(MVT::v16i8, dl, Ops);
      if (Op.getValueType() != MVT::v16i8)
        return DAG.getBitcast(Op.getValueType(), NewBV);
      return NewBV;
    }

    // BuildVectorSDNode::isConstantSplat() is actually pretty smart. It'll
    // detect that constant splats like v8i16: 0xABAB are really just splats
    // of a 1-byte constant. In this case, we need to convert the node to a
    // splat of v16i8 and a bitcast.
    if (Op.getValueType() != MVT::v16i8)
      return DAG.getBitcast(Op.getValueType(),
                            DAG.getConstant(SplatBits, dl, MVT::v16i8));

    return Op;
  }

  // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
  int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
                    (32-SplatBitSize));
  if (SextVal >= -16 && SextVal <= 15)
    return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);

  // Two instruction sequences.

  // If this value is in the range [-32,30] and is even, use:
  //     VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
  // If this value is in the range [17,31] and is odd, use:
  //     VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
  // If this value is in the range [-31,-17] and is odd, use:
  //     VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
  // Note the last two are three-instruction sequences.
  if (SextVal >= -32 && SextVal <= 31) {
    // To avoid having these optimizations undone by constant folding,
    // we convert to a pseudo that will be expanded later into one of
    // the above forms.
    SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32);
    EVT VT = (SplatSize == 1 ? MVT::v16i8 :
              (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
    SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32);
    SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
    if (VT == Op.getValueType())
      return RetVal;
    else
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal);
  }

  // If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is
  // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important
  // for fneg/fabs.
  if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
    // Make -1 and vspltisw -1:
    SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);

    // Make the VSLW intrinsic, computing 0x8000_0000.
    SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
                                   OnesV, DAG, dl);

    // xor by OnesV to invert it.
    Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
  }

  // Check to see if this is a wide variety of vsplti*, binop self cases.
  // NOTE(review): 14 precedes -14 here, unlike every other (+/-) pair in the
  // table; this favors matching +14 first. Presumably intentional, but worth
  // confirming.
  static const signed char SplatCsts[] = {
    -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
    -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
  };

  for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) {
    // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
    // cases which are ambiguous (e.g. formation of 0x8000_0000).
    int i = SplatCsts[idx];

    // Figure out what shift amount will be used by altivec if shifted by i in
    // this splat size.
    unsigned TypeShiftAmt = i & (SplatBitSize-1);

    // vsplti + shl self.
    if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
        Intrinsic::ppc_altivec_vslw
      };
      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
    }

    // vsplti + srl self.
    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
        Intrinsic::ppc_altivec_vsrw
      };
      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
    }

    // vsplti + sra self.
    // NOTE(review): this condition is textually identical to the 'srl self'
    // check above (a logical shift of (unsigned)i, not an arithmetic one),
    // so the earlier 'if' already returned for any value this would match
    // and this branch appears unreachable. An arithmetic-shift comparison
    // was presumably intended -- confirm before relying on vsra lowering
    // from this path.
    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
        Intrinsic::ppc_altivec_vsraw
      };
      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
    }

    // vsplti + rol self.
    if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
                         ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
        Intrinsic::ppc_altivec_vrlw
      };
      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
    }

    // t = vsplti c, result = vsldoi t, t, 1
    if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
    }
    // t = vsplti c, result = vsldoi t, t, 2
    if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
    }
    // t = vsplti c, result = vsldoi t, t, 3
    if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
    }
  }

  // No efficient lowering found; let the default expansion handle it.
  return SDValue();
}

/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      const SDLoc &dl) {
  // Decode the table entry: bits [29:26] are the operation, and the two
  // 13-bit fields are the IDs of the operand shuffles (themselves indices
  // into PerfectShuffleTable, expanded recursively below).
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);

  enum {
    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
    OP_VMRGHW,
    OP_VMRGLW,
    OP_VSPLTISW0,
    OP_VSPLTISW1,
    OP_VSPLTISW2,
    OP_VSPLTISW3,
    OP_VSLDOI4,
    OP_VSLDOI8,
    OP_VSLDOI12
  };

  if (OpNum == OP_COPY) {
    // For OP_COPY the LHSID is a base-9 encoding of the element list:
    // <0,1,2,3> means LHS unchanged; the only other legal value encodes
    // <4,5,6,7>, i.e. a plain copy of RHS.
    if (LHSID == (1*9+2)*9+3) return LHS;
    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
    return RHS;
  }

  // Recursively materialize the two operand shuffles first.
  SDValue OpLHS, OpRHS;
  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);

  // Express the selected operation as a v16i8 byte-shuffle mask (the vsldoi
  // cases are emitted directly and return early instead).
  int ShufIdxs[16];
  switch (OpNum) {
  default: llvm_unreachable("Unknown i32 permute!");
  case OP_VMRGHW:
    ShufIdxs[ 0] =  0; ShufIdxs[ 1] =  1; ShufIdxs[ 2] =  2; ShufIdxs[ 3] =  3;
    ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
    ShufIdxs[ 8] =  4; ShufIdxs[ 9] =  5; ShufIdxs[10] =  6; ShufIdxs[11] =  7;
    ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
    break;
  case OP_VMRGLW:
    ShufIdxs[ 0] =  8; ShufIdxs[ 1] =  9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
    ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
    ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
    ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
    break;
  case OP_VSPLTISW0:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+0;
    break;
  case OP_VSPLTISW1:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+4;
    break;
  case OP_VSPLTISW2:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+8;
    break;
  case OP_VSPLTISW3:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+12;
    break;
  case OP_VSLDOI4:
    return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl);
  case OP_VSLDOI8:
    return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl);
  case OP_VSLDOI12:
    return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);
  }
  // Emit the shuffle on v16i8 and bitcast back to the operands' type.
  EVT VT = OpLHS.getValueType();
  OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS);
  OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS);
  SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
  return DAG.getNode(ISD::BITCAST, dl, VT, T);
}

/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
/// SDValue.
SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
                                           SelectionDAG &DAG) const {
  const unsigned BytesInVector = 16;
  bool IsLE = Subtarget.isLittleEndian();
  SDLoc dl(N);
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned ShiftElts = 0, InsertAtByte = 0;
  bool Swap = false;

  // Shifts required to get the byte we want at element 7.
  unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1,
                                   0, 15, 14, 13, 12, 11, 10, 9};
  unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
                                1, 2, 3, 4, 5, 6, 7, 8};

  ArrayRef<int> Mask = N->getMask();
  int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  // For each mask element, find out if we're just inserting something
  // from V2 into V1 or vice versa.
  // Possible permutations inserting an element from V2 into V1:
  //   X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  //   0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  //   ...
  //   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
  // Inserting from V1 into V2 will be similar, except mask range will be
  // [16,31].

  bool FoundCandidate = false;
  // If both vector operands for the shuffle are the same vector, the mask
  // will contain only elements from the first one and the second one will be
  // undef.
  unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
  // Go through the mask of bytes to find an element that's being moved
  // from one vector to the other.
  for (unsigned i = 0; i < BytesInVector; ++i) {
    unsigned CurrentElement = Mask[i];
    // If 2nd operand is undefined, we should only look for element 7 in the
    // Mask.
    if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
      continue;

    bool OtherElementsInOrder = true;
    // Examine the other elements in the Mask to see if they're in original
    // order.
    for (unsigned j = 0; j < BytesInVector; ++j) {
      if (j == i)
        continue;
      // If CurrentElement is from V1 [0,15], then we expect the rest of the
      // Mask to be from V2 [16,31] and vice versa. Unless the 2nd operand is
      // undefined, in which case we always assume we're picking from the 1st
      // operand.
      int MaskOffset =
          (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
      if (Mask[j] != OriginalOrder[j] + MaskOffset) {
        OtherElementsInOrder = false;
        break;
      }
    }
    // If other elements are in original order, we record the number of shifts
    // we need to get the element we want into element 7. Also record which byte
    // in the vector we should insert into.
    if (OtherElementsInOrder) {
      // If 2nd operand is undefined, we assume no shifts and no swapping.
      if (V2.isUndef()) {
        ShiftElts = 0;
        Swap = false;
      } else {
        // Only need the last 4 bits for shifts because operands will be
        // swapped if CurrentElement is >= 2^4.
        ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
                         : BigEndianShifts[CurrentElement & 0xF];
        Swap = CurrentElement < BytesInVector;
      }
      InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
      FoundCandidate = true;
      break;
    }
  }

  if (!FoundCandidate)
    return SDValue();

  // Candidate found, construct the proper SDAG sequence with VINSERTB,
  // optionally with VECSHL if shift is required.
  if (Swap)
    std::swap(V1, V2);
  if (V2.isUndef())
    V2 = V1;
  if (ShiftElts) {
    // Rotate the source so the byte to insert lands in the right lane,
    // then insert it at the recorded byte position.
    SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
                              DAG.getConstant(ShiftElts, dl, MVT::i32));
    return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl,
                       DAG.getConstant(InsertAtByte, dl, MVT::i32));
  }
  return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2,
                     DAG.getConstant(InsertAtByte, dl, MVT::i32));
}

/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
/// by the VINSERTH instruction introduced in ISA 3.0, else just return default
/// SDValue.
SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
                                           SelectionDAG &DAG) const {
  const unsigned NumHalfWords = 8;
  const unsigned BytesInVector = NumHalfWords * 2;
  // Check that the shuffle is on half-words.
  if (!isNByteElemShuffleMask(N, 2, 1))
    return SDValue();

  bool IsLE = Subtarget.isLittleEndian();
  SDLoc dl(N);
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned ShiftElts = 0, InsertAtByte = 0;
  bool Swap = false;

  // Shifts required to get the half-word we want at element 3.
  unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};
  unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};

  uint32_t Mask = 0;
  uint32_t OriginalOrderLow = 0x1234567;
  uint32_t OriginalOrderHigh = 0x89ABCDEF;
  // Now we look at mask elements 0,2,4,6,8,10,12,14.  Pack the mask into a
  // 32-bit space, only need 4-bit nibbles per element (one nibble per
  // half-word index, most-significant nibble = element 0).
  for (unsigned i = 0; i < NumHalfWords; ++i) {
    unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
    Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift);
  }

  // For each mask element, find out if we're just inserting something
  // from V2 into V1 or vice versa.  Possible permutations inserting an element
  // from V2 into V1:
  //   X, 1, 2, 3, 4, 5, 6, 7
  //   0, X, 2, 3, 4, 5, 6, 7
  //   0, 1, X, 3, 4, 5, 6, 7
  //   0, 1, 2, X, 4, 5, 6, 7
  //   0, 1, 2, 3, X, 5, 6, 7
  //   0, 1, 2, 3, 4, X, 6, 7
  //   0, 1, 2, 3, 4, 5, X, 7
  //   0, 1, 2, 3, 4, 5, 6, X
  // Inserting from V1 into V2 will be similar, except mask range will be [8,15].

  bool FoundCandidate = false;
  // Go through the mask of half-words to find an element that's being moved
  // from one vector to the other.
  for (unsigned i = 0; i < NumHalfWords; ++i) {
    unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
    uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
    uint32_t MaskOtherElts = ~(0xF << MaskShift);
    uint32_t TargetOrder = 0x0;

    // If both vector operands for the shuffle are the same vector, the mask
    // will contain only elements from the first one and the second one will be
    // undef.
    if (V2.isUndef()) {
      ShiftElts = 0;
      unsigned VINSERTHSrcElem = IsLE ? 4 : 3;
      TargetOrder = OriginalOrderLow;
      Swap = false;
      // Skip if this isn't the correct element, or the mask of the other
      // elements doesn't match our expected order.
      if (MaskOneElt == VINSERTHSrcElem &&
          (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
        InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
        FoundCandidate = true;
        break;
      }
    } else { // If both operands are defined.
      // Target order is [8,15] if the current mask is between [0,7].
      TargetOrder =
          (MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;
      // Skip if mask of other elements doesn't match our expected order.
      if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
        // We only need the last 3 bits for the number of shifts.
        ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]
                         : BigEndianShifts[MaskOneElt & 0x7];
        InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
        Swap = MaskOneElt < NumHalfWords;
        FoundCandidate = true;
        break;
      }
    }
  }

  if (!FoundCandidate)
    return SDValue();

  // Candidate found, construct the proper SDAG sequence with VINSERTH,
  // optionally with VECSHL if shift is required.
  if (Swap)
    std::swap(V1, V2);
  if (V2.isUndef())
    V2 = V1;
  SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
  if (ShiftElts) {
    // Double ShiftElts because we're left shifting on v16i8 type.
    SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
                              DAG.getConstant(2 * ShiftElts, dl, MVT::i32));
    SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl);
    SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
                              DAG.getConstant(InsertAtByte, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
  }
  SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
  SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
                            DAG.getConstant(InsertAtByte, dl, MVT::i32));
  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}

/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE.  If this
/// is a shuffle we can handle in a single instruction, return it.  Otherwise,
/// return the code it can be lowered into.  Worst case, it can always be
/// lowered into a vperm.
SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  EVT VT = Op.getValueType();
  bool isLittleEndian = Subtarget.isLittleEndian();

  // Try the single-instruction forms first, from most to least specific:
  // xxinsertw, vinserth/vinsertb, xxsldwi, xxpermdi, xxbr*, xxspltw.
  unsigned ShiftElts, InsertAtByte;
  bool Swap = false;
  if (Subtarget.hasP9Vector() &&
      PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
                           isLittleEndian)) {
    if (Swap)
      std::swap(V1, V2);
    SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
    SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2);
    if (ShiftElts) {
      // Rotate the word to insert into position, then insert it.
      SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2,
                                DAG.getConstant(ShiftElts, dl, MVT::i32));
      SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl,
                                DAG.getConstant(InsertAtByte, dl, MVT::i32));
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
    }
    SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2,
                              DAG.getConstant(InsertAtByte, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
  }

  if (Subtarget.hasP9Altivec()) {
    SDValue NewISDNode;
    if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
      return NewISDNode;

    if ((NewISDNode = lowerToVINSERTB(SVOp, DAG)))
      return NewISDNode;
  }

  if (Subtarget.hasVSX() &&
      PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
    if (Swap)
      std::swap(V1, V2);
    SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
    SDValue Conv2 =
        DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2);

    SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2,
                              DAG.getConstant(ShiftElts, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl);
  }

  if (Subtarget.hasVSX() &&
      PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
    if (Swap)
      std::swap(V1, V2);
    SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
    SDValue Conv2 =
        DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2);

    SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2,
                                 DAG.getConstant(ShiftElts, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI);
  }

  if (Subtarget.hasP9Vector()) {
    // Byte-reverse shuffles at half-word/word/double-word/quad-word
    // granularity map directly onto the XXBR* family.
    if (PPC::isXXBRHShuffleMask(SVOp)) {
      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
      SDValue ReveHWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v8i16, Conv);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord);
    } else if (PPC::isXXBRWShuffleMask(SVOp)) {
      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
      SDValue ReveWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v4i32, Conv);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord);
    } else if (PPC::isXXBRDShuffleMask(SVOp)) {
      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
      SDValue ReveDWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Conv);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord);
    } else if (PPC::isXXBRQShuffleMask(SVOp)) {
      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1);
      SDValue ReveQWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v1i128, Conv);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord);
    }
  }

  if (Subtarget.hasVSX()) {
    if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
      int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);

      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
      SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
                                  DAG.getConstant(SplatIdx, dl, MVT::i32));
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat);
    }

    // Left shifts of 8 bytes are actually swaps. Convert accordingly.
    if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) {
      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
      SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap);
    }
  }

  if (Subtarget.hasQPX()) {
    if (VT.getVectorNumElements() != 4)
      return SDValue();

    if (V2.isUndef()) V2 = V1;

    int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp);
    if (AlignIdx != -1) {
      return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2,
                         DAG.getConstant(AlignIdx, dl, MVT::i32));
    } else if (SVOp->isSplat()) {
      int SplatIdx = SVOp->getSplatIndex();
      // Splat indices >= 4 refer to V2; normalize to an index into V2.
      if (SplatIdx >= 4) {
        std::swap(V1, V2);
        SplatIdx -= 4;
      }

      return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1,
                         DAG.getConstant(SplatIdx, dl, MVT::i32));
    }

    // Lower this into a qvgpci/qvfperm pair.

    // Compute the qvgpci literal: 3 bits per lane, undef lanes default to
    // the identity mapping.
    unsigned idx = 0;
    for (unsigned i = 0; i < 4; ++i) {
      int m = SVOp->getMaskElt(i);
      unsigned mm = m >= 0 ? (unsigned) m : i;
      idx |= mm << (3-i)*3;
    }

    SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64,
                             DAG.getConstant(idx, dl, MVT::i32));
    return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3);
  }

  // Cases that are handled by instructions that take permute immediates
  // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
  // selected by the instruction selector.
  if (V2.isUndef()) {
    if (PPC::isSplatShuffleMask(SVOp, 1) ||
        PPC::isSplatShuffleMask(SVOp, 2) ||
        PPC::isSplatShuffleMask(SVOp, 4) ||
        PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) ||
        PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) ||
        PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 ||
        PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) ||
        PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) ||
        PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) ||
        PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) ||
        PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) ||
        PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) ||
        (Subtarget.hasP8Altivec() && (
         PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) ||
         PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) ||
         PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) {
      return Op;
    }
  }

  // Altivec has a variety of "shuffle immediates" that take two vector inputs
  // and produce a fixed permutation.  If any of these match, do not lower to
  // VPERM.
  unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
  if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) ||
      PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) ||
      PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 ||
      PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
      PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
      PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
      PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
      PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
      PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
      (Subtarget.hasP8Altivec() && (
       PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) ||
       PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) ||
       PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG))))
    return Op;

  // Check to see if this is a shuffle of 4-byte values.  If so, we can use our
  // perfect shuffle table to emit an optimal matching sequence.
  ArrayRef<int> PermMask = SVOp->getMask();

  unsigned PFIndexes[4];
  bool isFourElementShuffle = true;
  for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
    unsigned EltNo = 8;   // Start out undef.
    for (unsigned j = 0; j != 4; ++j) {  // Intra-element byte.
      if (PermMask[i*4+j] < 0)
        continue;   // Undef, ignore it.

      unsigned ByteSource = PermMask[i*4+j];
      // All four bytes of a word element must come in order from the same
      // source word, otherwise this is not a word-granularity shuffle.
      if ((ByteSource & 3) != j) {
        isFourElementShuffle = false;
        break;
      }

      if (EltNo == 8) {
        EltNo = ByteSource/4;
      } else if (EltNo != ByteSource/4) {
        isFourElementShuffle = false;
        break;
      }
    }
    PFIndexes[i] = EltNo;
  }

  // If this shuffle can be expressed as a shuffle of 4-byte elements, use the
  // perfect shuffle vector to determine if it is cost effective to do this as
  // discrete instructions, or whether we should use a vperm.
  // For now, we skip this for little endian until such time as we have a
  // little-endian perfect shuffle table.
  if (isFourElementShuffle && !isLittleEndian) {
    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex =
      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];

    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    // Determining when to avoid vperm is tricky. Many things affect the cost
    // of vperm, particularly how many times the perm mask needs to be computed.
    // For example, if the perm mask can be hoisted out of a loop or is already
    // used (perhaps because there are multiple permutes with the same shuffle
    // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of
    // the loop requires an extra register.
    //
    // As a compromise, we only emit discrete instructions if the shuffle can be
    // generated in 3 or fewer operations.  When we have loop information
    // available, if this block is within a loop, we should avoid using vperm
    // for 3-operation perms and use a constant pool load instead.
    if (Cost < 3)
      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
  }

  // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
  // vector that will get spilled to the constant pool.
  if (V2.isUndef()) V2 = V1;

  // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
  // that it is in input element units, not in bytes.  Convert now.

  // For little endian, the order of the input vectors is reversed, and
  // the permutation mask is complemented with respect to 31.  This is
  // necessary to produce proper semantics with the big-endian-biased vperm
  // instruction.
  EVT EltVT = V1.getValueType().getVectorElementType();
  unsigned BytesPerElement = EltVT.getSizeInBits()/8;

  SmallVector<SDValue, 16> ResultMask;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
    unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];

    for (unsigned j = 0; j != BytesPerElement; ++j)
      if (isLittleEndian)
        ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j),
                                             dl, MVT::i32));
      else
        ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl,
                                             MVT::i32));
  }

  SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask);
  if (isLittleEndian)
    return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
                       V2, V1, VPermMask);
  else
    return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
                       V1, V2, VPermMask);
}

/// getVectorCompareInfo - Given an intrinsic, return false if it is not a
/// vector comparison.  If it is, return true and fill in Opc/isDot with
/// information about the intrinsic.
static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
                                 bool &isDot, const PPCSubtarget &Subtarget) {
  unsigned IntrinsicID =
    cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
  // CompareOpc receives the raw instruction opcode value (the XO field used
  // by the VCMP* encodings); isDot is set for the record-form (".") variants
  // that also set CR6.
  CompareOpc = -1;
  isDot = false;
  switch (IntrinsicID) {
  default:
    return false;
  // Comparison predicates.
  case Intrinsic::ppc_altivec_vcmpbfp_p:
    CompareOpc = 966;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpeqfp_p:
    CompareOpc = 198;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpequb_p:
    CompareOpc = 6;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpequh_p:
    CompareOpc = 70;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpequw_p:
    CompareOpc = 134;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpequd_p:
    // Doubleword compares require POWER8 Altivec.
    if (Subtarget.hasP8Altivec()) {
      CompareOpc = 199;
      isDot = true;
    } else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpneb_p:
  case Intrinsic::ppc_altivec_vcmpneh_p:
  case Intrinsic::ppc_altivec_vcmpnew_p:
  case Intrinsic::ppc_altivec_vcmpnezb_p:
  case Intrinsic::ppc_altivec_vcmpnezh_p:
  case Intrinsic::ppc_altivec_vcmpnezw_p:
    // Not-equal (and not-equal-or-zero) compares require POWER9 Altivec.
    if (Subtarget.hasP9Altivec()) {
      switch (IntrinsicID) {
      default:
        llvm_unreachable("Unknown comparison intrinsic.");
      case Intrinsic::ppc_altivec_vcmpneb_p:
        CompareOpc = 7;
        break;
      case Intrinsic::ppc_altivec_vcmpneh_p:
        CompareOpc = 71;
        break;
      case Intrinsic::ppc_altivec_vcmpnew_p:
        CompareOpc = 135;
        break;
      case Intrinsic::ppc_altivec_vcmpnezb_p:
        CompareOpc = 263;
        break;
      case Intrinsic::ppc_altivec_vcmpnezh_p:
        CompareOpc = 327;
        break;
      case Intrinsic::ppc_altivec_vcmpnezw_p:
        CompareOpc = 391;
        break;
      }
      isDot = true;
    } else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgefp_p:
    CompareOpc = 454;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtfp_p:
    CompareOpc = 710;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsb_p:
    CompareOpc = 774;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsh_p:
    CompareOpc = 838;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsw_p:
    CompareOpc = 902;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsd_p:
    if (Subtarget.hasP8Altivec()) {
      CompareOpc = 967;
      isDot = true;
    } else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgtub_p:
    CompareOpc = 518;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuh_p:
    CompareOpc = 582;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuw_p:
    CompareOpc = 646;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtud_p:
    if (Subtarget.hasP8Altivec()) {
      CompareOpc = 711;
      isDot = true;
    } else
      return false;
    break;

  // VSX predicate comparisons use the same infrastructure
  case Intrinsic::ppc_vsx_xvcmpeqdp_p:
  case Intrinsic::ppc_vsx_xvcmpgedp_p:
  case Intrinsic::ppc_vsx_xvcmpgtdp_p:
  case Intrinsic::ppc_vsx_xvcmpeqsp_p:
  case Intrinsic::ppc_vsx_xvcmpgesp_p:
  case Intrinsic::ppc_vsx_xvcmpgtsp_p:
    if (Subtarget.hasVSX()) {
      switch (IntrinsicID) {
      case Intrinsic::ppc_vsx_xvcmpeqdp_p:
        CompareOpc = 99;
        break;
      case Intrinsic::ppc_vsx_xvcmpgedp_p:
        CompareOpc = 115;
        break;
      case Intrinsic::ppc_vsx_xvcmpgtdp_p:
        CompareOpc = 107;
        break;
      case Intrinsic::ppc_vsx_xvcmpeqsp_p:
        CompareOpc = 67;
        break;
      case Intrinsic::ppc_vsx_xvcmpgesp_p:
        CompareOpc = 83;
        break;
      case Intrinsic::ppc_vsx_xvcmpgtsp_p:
        CompareOpc = 75;
        break;
      }
      isDot = true;
    } else
      return false;
    break;

  // Normal Comparisons.
  case Intrinsic::ppc_altivec_vcmpbfp:
    CompareOpc = 966;
    break;
  case Intrinsic::ppc_altivec_vcmpeqfp:
    CompareOpc = 198;
    break;
  case Intrinsic::ppc_altivec_vcmpequb:
    CompareOpc = 6;
    break;
  case Intrinsic::ppc_altivec_vcmpequh:
    CompareOpc = 70;
    break;
  case Intrinsic::ppc_altivec_vcmpequw:
    CompareOpc = 134;
    break;
  case Intrinsic::ppc_altivec_vcmpequd:
    if (Subtarget.hasP8Altivec())
      CompareOpc = 199;
    else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpneb:
  case Intrinsic::ppc_altivec_vcmpneh:
  case Intrinsic::ppc_altivec_vcmpnew:
  case Intrinsic::ppc_altivec_vcmpnezb:
  case Intrinsic::ppc_altivec_vcmpnezh:
  case Intrinsic::ppc_altivec_vcmpnezw:
    if (Subtarget.hasP9Altivec())
      switch (IntrinsicID) {
      default:
        llvm_unreachable("Unknown comparison intrinsic.");
      case Intrinsic::ppc_altivec_vcmpneb:
        CompareOpc = 7;
        break;
      case Intrinsic::ppc_altivec_vcmpneh:
        CompareOpc = 71;
        break;
      case Intrinsic::ppc_altivec_vcmpnew:
        CompareOpc = 135;
        break;
      case Intrinsic::ppc_altivec_vcmpnezb:
        CompareOpc = 263;
        break;
      case Intrinsic::ppc_altivec_vcmpnezh:
        CompareOpc = 327;
        break;
      case Intrinsic::ppc_altivec_vcmpnezw:
        CompareOpc = 391;
        break;
      }
    else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgefp:
    CompareOpc = 454;
    break;
  case Intrinsic::ppc_altivec_vcmpgtfp:
    CompareOpc = 710;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsb:
    CompareOpc = 774;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsh:
    CompareOpc = 838;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsw:
    CompareOpc = 902;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsd:
    if (Subtarget.hasP8Altivec())
      CompareOpc = 967;
    else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgtub:
    CompareOpc = 518;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuh:
    CompareOpc = 582;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuw:
    CompareOpc = 646;
    break;
  case Intrinsic::ppc_altivec_vcmpgtud:
    if (Subtarget.hasP8Altivec())
      CompareOpc = 711;
    else
      return false;
    break;
  }
  return true;
}

/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
/// lower, do it, otherwise return null.
SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
  unsigned IntrinsicID =
    cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  SDLoc dl(Op);

  if (IntrinsicID == Intrinsic::thread_pointer) {
    // Reads the thread pointer register, used for __builtin_thread_pointer.
    if (Subtarget.isPPC64())
      return DAG.getRegister(PPC::X13, MVT::i64);
    return DAG.getRegister(PPC::R2, MVT::i32);
  }

  // We are looking for absolute values here.
  // The idea is to try to fit one of two patterns:
  //  max (a, (0-a))  OR  max ((0-a), a)
  if (Subtarget.hasP9Vector() &&
      (IntrinsicID == Intrinsic::ppc_altivec_vmaxsw ||
       IntrinsicID == Intrinsic::ppc_altivec_vmaxsh ||
       IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) {
    SDValue V1 = Op.getOperand(1);
    SDValue V2 = Op.getOperand(2);
    if (V1.getSimpleValueType() == V2.getSimpleValueType() &&
        (V1.getSimpleValueType() == MVT::v4i32 ||
         V1.getSimpleValueType() == MVT::v8i16 ||
         V1.getSimpleValueType() == MVT::v16i8)) {
      if ( V1.getOpcode() == ISD::SUB &&
           ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
           V1.getOperand(1) == V2 ) {
        // Generate the abs instruction with the operands
        return DAG.getNode(ISD::ABS, dl, V2.getValueType(),V2);
      }

      if ( V2.getOpcode() == ISD::SUB &&
           ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
           V2.getOperand(1) == V1 ) {
        // Generate the abs instruction with the operands
        return DAG.getNode(ISD::ABS, dl, V1.getValueType(),V1);
      }
    }
  }

  // If this is a lowered altivec predicate compare, CompareOpc is set to the
  // opcode number of the comparison.
  int CompareOpc;
  bool isDot;
  if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget))
    return SDValue();    // Don't custom lower most intrinsics.

  // If this is a non-dot comparison, make the VCMP node and we are done.
  if (!isDot) {
    SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),
                              Op.getOperand(1), Op.getOperand(2),
                              DAG.getConstant(CompareOpc, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp);
  }

  // Create the PPCISD altivec 'dot' comparison node.  For predicate forms,
  // operand 1 is the predicate selector and operands 2/3 are the vectors.
  SDValue Ops[] = {
    Op.getOperand(2),  // LHS
    Op.getOperand(3),  // RHS
    DAG.getConstant(CompareOpc, dl, MVT::i32)
  };
  EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };
  SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);

  // Now that we have the comparison, emit a copy from the CR to a GPR.
  // This is flagged to the above dot comparison.
  SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32,
                              DAG.getRegister(PPC::CR6, MVT::i32),
                              CompNode.getValue(1));

  // Unpack the result based on how the target uses it.
  unsigned BitNo;   // Bit # of CR6.
  bool InvertBit;   // Invert result?
  switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
  default:  // Can't happen, don't crash on invalid number though.
  case 0:   // Return the value of the EQ bit of CR6.
    BitNo = 0; InvertBit = false;
    break;
  case 1:   // Return the inverted value of the EQ bit of CR6.
    BitNo = 0; InvertBit = true;
    break;
  case 2:   // Return the value of the LT bit of CR6.
    BitNo = 2; InvertBit = false;
    break;
  case 3:   // Return the inverted value of the LT bit of CR6.
    BitNo = 2; InvertBit = true;
    break;
  }

  // Shift the bit into the low position.  CR6's four bits occupy bits
  // [8,11] of the MFOCRF result, so bit BitNo sits at position 8 - (3 - BitNo)
  // counting from the LSB.
  Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
                      DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32));
  // Isolate the bit.
  Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
                      DAG.getConstant(1, dl, MVT::i32));

  // If we are supposed to, toggle the bit.
  if (InvertBit)
    Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
                        DAG.getConstant(1, dl, MVT::i32));
  return Flags;
}

SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                               SelectionDAG &DAG) const {
  // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to
  // the beginning of the argument list.
  int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1;
  SDLoc DL(Op);
  switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) {
  case Intrinsic::ppc_cfence: {
    // llvm.ppc.cfence lowers to the CFENCE8 pseudo; its i32 operand is
    // widened to i64 first (64-bit only for now).
    assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");
    assert(Subtarget.isPPC64() && "Only 64-bit is supported for now.");
    return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other,
                                      DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64,
                                                  Op.getOperand(ArgStart + 1)),
                                      Op.getOperand(0)),
                   0);
  }
  default:
    break;
  }
  return SDValue();
}

// Leave SREM/UREM expanded (return Op) unless a matching DIV with the same
// operands exists, in which case return SDValue() so the rem can later be
// computed from the div (avoiding a second divide).
SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const {
  // Check for a DIV with the same operands as this REM.
  for (auto UI : Op.getOperand(1)->uses()) {
    if ((Op.getOpcode() == ISD::SREM && UI->getOpcode() == ISD::SDIV) ||
        (Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV))
      if (UI->getOperand(0) == Op.getOperand(0) &&
          UI->getOperand(1) == Op.getOperand(1))
        return SDValue();
  }
  return Op;
}

// Lower scalar BSWAP64 to xxbrd.
SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // MTVSRDD: splat the scalar into both doublewords of a v2i64.
  Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0),
                   Op.getOperand(0));
  // XXBRD: byte-reverse each doubleword.
  Op = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Op);
  // MFVSRD: move the reversed doubleword back to a GPR; on little endian the
  // desired element is at index 1.
  int VectorIndex = 0;
  if (Subtarget.isLittleEndian())
    VectorIndex = 1;
  Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
                   DAG.getTargetConstant(VectorIndex, dl, MVT::i32));
  return Op;
}

// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
// compared to a value that is atomically loaded (atomic loads zero-extend).
/// For i8/i16 compare-and-swap, force the compare operand to be zero-extended
/// so it matches the zero-extended atomically-loaded value; i32 and wider are
/// returned unchanged.
SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
         "Expecting an atomic compare-and-swap here.");
  SDLoc dl(Op);
  auto *AtomicNode = cast<AtomicSDNode>(Op.getNode());
  EVT MemVT = AtomicNode->getMemoryVT();
  if (MemVT.getSizeInBits() >= 32)
    return Op;

  SDValue CmpOp = Op.getOperand(2);
  // If this is already correctly zero-extended, leave it alone.
  auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits());
  if (DAG.MaskedValueIsZero(CmpOp, HighBits))
    return Op;

  // Clear the high bits of the compare operand.
  unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
  SDValue NewCmpOp =
    DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp,
                DAG.getConstant(MaskVal, dl, MVT::i32));

  // Replace the existing compare operand with the properly zero-extended one.
  SmallVector<SDValue, 4> Ops;
  for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
    Ops.push_back(AtomicNode->getOperand(i));
  Ops[2] = NewCmpOp;
  MachineMemOperand *MMO = AtomicNode->getMemOperand();
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other);
  auto NodeTy =
    (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16;
  return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
}

/// Lower SCALAR_TO_VECTOR via a store/reload through an aligned stack slot.
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // Create a stack slot that is 16-byte aligned.
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  int FrameIdx = MFI.CreateStackObject(16, 16, false);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

  // Store the input value into Value#0 of the stack slot.
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
                               MachinePointerInfo());
  // Load it out.
  return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
}

SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
         "Should only be called for ISD::INSERT_VECTOR_ELT");

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));
  // We have legal lowering for constant indices but not for variable ones.
  if (!C)
    return SDValue();

  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
    SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2);
    unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
    unsigned InsertAtElement = C->getZExtValue();
    unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
    // VECINSERT takes a big-endian byte offset; mirror it for little endian.
    if (Subtarget.isLittleEndian()) {
      InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
    }
    return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz,
                       DAG.getConstant(InsertAtByte, dl, MVT::i32));
  }
  return Op;
}

/// QPX-only: extract an element of a v4i1 by converting the mask to FP,
/// storing it as integers to a stack slot, and reloading the requested word.
SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDNode *N = Op.getNode();

  assert(N->getOperand(0).getValueType() == MVT::v4i1 &&
         "Unknown extract_vector_elt type");

  SDValue Value = N->getOperand(0);

  // The first part of this is like the store lowering except that we don't
  // need to track the chain.

  // The values are now known to be -1 (false) or 1 (true). To convert this
  // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
  // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
  Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);

  // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
  // understand how to form the extending load.
  SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);

  Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);

  // Now convert to an integer and store.
  Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
    DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
    Value);

  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  int FrameIdx = MFI.CreateStackObject(16, 16, false);
  MachinePointerInfo PtrInfo =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

  SDValue StoreChain = DAG.getEntryNode();
  SDValue Ops[] = {StoreChain,
                   DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
                   Value, FIdx};
  SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);

  StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
    dl, VTs, Ops, MVT::v4i32, PtrInfo);

  // Extract the value requested.
  unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
  Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);

  SDValue IntVal =
      DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset));

  if (!Subtarget.useCRBits())
    return IntVal;

  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal);
}

/// Lowering for QPX v4i1 loads
SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
  SDValue LoadChain = LN->getChain();
  SDValue BasePtr = LN->getBasePtr();

  if (Op.getValueType() == MVT::v4f64 ||
      Op.getValueType() == MVT::v4f32) {
    EVT MemVT = LN->getMemoryVT();
    unsigned Alignment = LN->getAlignment();

    // If this load is properly aligned, then it is legal.
    if (Alignment >= MemVT.getStoreSize())
      return Op;

    // Under-aligned vector load: split into four scalar loads.
    EVT ScalarVT = Op.getValueType().getScalarType(),
        ScalarMemVT = MemVT.getScalarType();
    unsigned Stride = ScalarMemVT.getStoreSize();

    SDValue Vals[4], LoadChains[4];
    for (unsigned Idx = 0; Idx < 4; ++Idx) {
      SDValue Load;
      if (ScalarVT != ScalarMemVT)
        Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain,
                              BasePtr,
                              LN->getPointerInfo().getWithOffset(Idx * Stride),
                              ScalarMemVT, MinAlign(Alignment, Idx * Stride),
                              LN->getMemOperand()->getFlags(), LN->getAAInfo());
      else
        Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr,
                           LN->getPointerInfo().getWithOffset(Idx * Stride),
                           MinAlign(Alignment, Idx * Stride),
                           LN->getMemOperand()->getFlags(), LN->getAAInfo());

      if (Idx == 0 && LN->isIndexed()) {
        assert(LN->getAddressingMode() == ISD::PRE_INC &&
               "Unknown addressing mode on vector load");
        Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(),
                                  LN->getAddressingMode());
      }

      Vals[Idx] = Load;
      LoadChains[Idx] = Load.getValue(1);

      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                            DAG.getConstant(Stride, dl,
                                            BasePtr.getValueType()));
    }

    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
    SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals);

    if (LN->isIndexed()) {
      SDValue RetOps[] = { Value, Vals[0].getValue(1), TF };
      return DAG.getMergeValues(RetOps, dl);
    }

    SDValue RetOps[] = { Value, TF };
    return DAG.getMergeValues(RetOps, dl);
  }

  assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower");
  assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported");

  // To lower v4i1 from a byte array, we load the byte elements of the
  // vector and then reuse the BUILD_VECTOR logic.

  SDValue VectElmts[4], VectElmtChains[4];
  for (unsigned i = 0; i < 4; ++i) {
    SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
    Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);

    VectElmts[i] = DAG.getExtLoad(
        ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx,
        LN->getPointerInfo().getWithOffset(i), MVT::i8,
        /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo());
    VectElmtChains[i] = VectElmts[i].getValue(1);
  }

  LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains);
  SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts);

  SDValue RVals[] = { Value, LoadChain };
  return DAG.getMergeValues(RVals, dl);
}

/// Lowering for QPX v4i1 stores
SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc dl(Op);
  StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
  SDValue StoreChain = SN->getChain();
  SDValue BasePtr =
      SN->getBasePtr();
  SDValue Value = SN->getValue();

  if (Value.getValueType() == MVT::v4f64 ||
      Value.getValueType() == MVT::v4f32) {
    EVT MemVT = SN->getMemoryVT();
    unsigned Alignment = SN->getAlignment();

    // If this store is properly aligned, then it is legal.
    if (Alignment >= MemVT.getStoreSize())
      return Op;

    // Under-aligned vector store: split into four scalar stores.
    EVT ScalarVT = Value.getValueType().getScalarType(),
        ScalarMemVT = MemVT.getScalarType();
    unsigned Stride = ScalarMemVT.getStoreSize();

    SDValue Stores[4];
    for (unsigned Idx = 0; Idx < 4; ++Idx) {
      SDValue Ex = DAG.getNode(
          ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
          DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout())));
      SDValue Store;
      if (ScalarVT != ScalarMemVT)
        Store =
            DAG.getTruncStore(StoreChain, dl, Ex, BasePtr,
                              SN->getPointerInfo().getWithOffset(Idx * Stride),
                              ScalarMemVT, MinAlign(Alignment, Idx * Stride),
                              SN->getMemOperand()->getFlags(), SN->getAAInfo());
      else
        Store = DAG.getStore(StoreChain, dl, Ex, BasePtr,
                             SN->getPointerInfo().getWithOffset(Idx * Stride),
                             MinAlign(Alignment, Idx * Stride),
                             SN->getMemOperand()->getFlags(), SN->getAAInfo());

      if (Idx == 0 && SN->isIndexed()) {
        assert(SN->getAddressingMode() == ISD::PRE_INC &&
               "Unknown addressing mode on vector store");
        Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(),
                                    SN->getAddressingMode());
      }

      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                            DAG.getConstant(Stride, dl,
                                            BasePtr.getValueType()));
      Stores[Idx] = Store;
    }

    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);

    if (SN->isIndexed()) {
      SDValue RetOps[] = { TF, Stores[0].getValue(1) };
      return DAG.getMergeValues(RetOps, dl);
    }

    return TF;
  }

  assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported");
  assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower");

  // The values are now known to be -1 (false) or 1 (true). To convert this
  // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
  // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
  Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);

  // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
  // understand how to form the extending load.
  SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);

  Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);

  // Now convert to an integer and store.
  Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
    DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
    Value);

  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  int FrameIdx = MFI.CreateStackObject(16, 16, false);
  MachinePointerInfo PtrInfo =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

  SDValue Ops[] = {StoreChain,
                   DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
                   Value, FIdx};
  SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);

  StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
    dl, VTs, Ops, MVT::v4i32, PtrInfo);

  // Move data into the byte array.
  SDValue Loads[4], LoadChains[4];
  for (unsigned i = 0; i < 4; ++i) {
    unsigned Offset = 4*i;
    SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
    Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);

    Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx,
                           PtrInfo.getWithOffset(Offset));
    LoadChains[i] = Loads[i].getValue(1);
  }

  StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);

  // Truncate each reloaded word to a byte and store to the destination.
  SDValue Stores[4];
  for (unsigned i = 0; i < 4; ++i) {
    SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
    Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);

    Stores[i] = DAG.getTruncStore(
        StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i),
        MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(),
        SN->getAAInfo());
  }

  StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);

  return StoreChain;
}

/// Custom lowering for vector multiplies (no single vmul instruction for
/// these types on altivec): v4i32, v8i16 and v16i8 are built from the
/// even/odd partial-product intrinsics.
SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  if (Op.getValueType() == MVT::v4i32) {
    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);

    SDValue Zero  = BuildSplatI(  0, 1, MVT::v4i32, DAG, dl);
    SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt.

    SDValue RHSSwap =   // = vrlw RHS, 16
      BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);

    // Shrinkify inputs to v8i16.
    LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS);
    RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS);
    RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap);

    // Low parts multiplied together, generating 32-bit results (we ignore the
    // top parts).
    SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
                                      LHS, RHS, DAG, dl, MVT::v4i32);

    SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
                                      LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
    // Shift the high parts up 16 bits.
    HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
                              Neg16, DAG, dl);
    return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
  } else if (Op.getValueType() == MVT::v8i16) {
    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);

    // vmladduhm with a zero addend gives a plain halfword multiply.
    SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);

    return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
                            LHS, RHS, Zero, DAG, dl);
  } else if (Op.getValueType() == MVT::v16i8) {
    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
    bool isLittleEndian = Subtarget.isLittleEndian();

    // Multiply the even 8-bit parts, producing 16-bit sums.
    SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
                                         LHS, RHS, DAG, dl, MVT::v8i16);
    EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);

    // Multiply the odd 8-bit parts, producing 16-bit sums.
    SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
                                        LHS, RHS, DAG, dl, MVT::v8i16);
    OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts);

    // Merge the results together.  Because vmuleub and vmuloub are
    // instructions with a big-endian bias, we must reverse the
    // element numbering and reverse the meaning of "odd" and "even"
    // when generating little endian code.
    int Ops[16];
    for (unsigned i = 0; i != 8; ++i) {
      if (isLittleEndian) {
        Ops[i*2  ] = 2*i;
        Ops[i*2+1] = 2*i+16;
      } else {
        Ops[i*2  ] = 2*i+1;
        Ops[i*2+1] = 2*i+1+16;
      }
    }
    if (isLittleEndian)
      return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops);
    else
      return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
  } else {
    llvm_unreachable("Unknown mul to lower!");
  }
}

/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Wasn't expecting to be able to lower this!");
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);

  // Variable argument lowering.
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);

  case ISD::STACKRESTORE:       return LowerSTACKRESTORE(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::GET_DYNAMIC_AREA_OFFSET:
    return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);

  // Exception handling lowering.
  case ISD::EH_DWARF_CFA:       return LowerEH_DWARF_CFA(Op, DAG);
  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);

  case ISD::LOAD:               return LowerLOAD(Op, DAG);
  case ISD::STORE:              return LowerSTORE(Op, DAG);
  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
  case ISD::FP_TO_UINT:
  case ISD::FP_TO_SINT:         return LowerFP_TO_INT(Op, DAG, SDLoc(Op));
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:         return LowerINT_TO_FP(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);

  // Lower 64-bit shifts.
  case ISD::SHL_PARTS:          return LowerSHL_PARTS(Op, DAG);
  case ISD::SRL_PARTS:          return LowerSRL_PARTS(Op, DAG);
  case ISD::SRA_PARTS:          return LowerSRA_PARTS(Op, DAG);

  // Vector-related lowering.
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::MUL:                return LowerMUL(Op, DAG);

  // For counter-based loop handling.
  case ISD::INTRINSIC_W_CHAIN:  return SDValue();

  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);

  // Frame & Return address.
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);

  case ISD::INTRINSIC_VOID:
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::SREM:
  case ISD::UREM:
    return LowerREM(Op, DAG);
  case ISD::BSWAP:
    return LowerBSWAP(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP:
    return LowerATOMIC_CMP_SWAP(Op, DAG);
  }
}

/// Custom type-legalization: replace a node with illegal result types by one
/// or more nodes with legal results, pushed onto Results.
void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) const {
  SDLoc dl(N);
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Do not know how to custom type legalize this operation!");
  case ISD::READCYCLECOUNTER: {
    // READ_TIME_BASE yields two i32 halves plus a chain.
    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
    SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));

    Results.push_back(RTB);
    Results.push_back(RTB.getValue(1));
    Results.push_back(RTB.getValue(2));
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
        Intrinsic::ppc_is_decremented_ctr_nonzero)
      break;

    assert(N->getValueType(0) == MVT::i1 &&
           "Unexpected result type for CTR decrement intrinsic");
    // Re-emit with the legal setcc result type, then truncate back to i1.
    EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                 N->getValueType(0));
    SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
    SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
                                 N->getOperand(1));

    Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewInt));
    Results.push_back(NewInt.getValue(1));
    break;
  }
  case ISD::VAARG: {
    if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())
      return;

    EVT VT = N->getValueType(0);

    if (VT == MVT::i64) {
      // NOTE(review): SDValue(N, 1) passes the chain result to LowerVAARG —
      // presumably intentional for this 32-bit SVR4 path; confirm upstream.
      SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG);

      Results.push_back(NewNode);
      Results.push_back(NewNode.getValue(1));
    }
    return;
  }
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    // LowerFP_TO_INT() can only handle f32 and f64.
    if (N->getOperand(0).getValueType() == MVT::ppcf128)
      return;
    Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
    return;
  case ISD::BITCAST:
    // Don't handle bitcast here.
    return;
  }
}

//===----------------------------------------------------------------------===//
//  Other Lowering Code
//===----------------------------------------------------------------------===//

/// Emit a call to the named no-argument PPC intrinsic at the builder's
/// current insertion point.
static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *Func = Intrinsic::getDeclaration(M, Id);
  return Builder.CreateCall(Func, {});
}

// The mappings for emitLeading/TrailingFence is taken from
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
  if (Ord == AtomicOrdering::SequentiallyConsistent)
    return callIntrinsic(Builder, Intrinsic::ppc_sync);
  if (isReleaseOrStronger(Ord))
    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
  return nullptr;
}

Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
  if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
    // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
    // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
    // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
    // 64-bit acquire loads can use the lighter-weight cfence sequence.
    if (isa<LoadInst>(Inst) && Subtarget.isPPC64())
      return Builder.CreateCall(
          Intrinsic::getDeclaration(
              Builder.GetInsertBlock()->getParent()->getParent(),
              Intrinsic::ppc_cfence, {Inst->getType()}),
          {Inst});
    // FIXME: Can use isync for rmw operation.
    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
  }
  return nullptr;
}

/// Expand a pseudo atomic read-modify-write into a l[bhwd]arx/st[bhwd]cx.
/// retry loop.  BinOpcode==0 means ATOMIC_SWAP; a nonzero CmpOpcode adds the
/// min/max early-exit comparison using CmpPred.
MachineBasicBlock *
PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
                                    unsigned AtomicSize,
                                    unsigned BinOpcode,
                                    unsigned CmpOpcode,
                                    unsigned CmpPred) const {
  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  auto LoadMnemonic = PPC::LDARX;
  auto StoreMnemonic = PPC::STDCX;
  switch (AtomicSize) {
  default:
    llvm_unreachable("Unexpected size of atomic entity");
  case 1:
    LoadMnemonic = PPC::LBARX;
    StoreMnemonic = PPC::STBCX;
    assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
    break;
  case 2:
    LoadMnemonic = PPC::LHARX;
    StoreMnemonic = PPC::STHCX;
    assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
    break;
  case 4:
    LoadMnemonic = PPC::LWARX;
    StoreMnemonic = PPC::STWCX;
    break;
  case 8:
    LoadMnemonic = PPC::LDARX;
    StoreMnemonic = PPC::STDCX;
    break;
  }

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction *F = BB->getParent();
  MachineFunction::iterator It = ++BB->getIterator();

  unsigned dest = MI.getOperand(0).getReg();
  unsigned ptrA = MI.getOperand(1).getReg();
  unsigned ptrB = MI.getOperand(2).getReg();
  unsigned incr = MI.getOperand(3).getReg();
  DebugLoc dl = MI.getDebugLoc();

  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *loop2MBB =
    CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, loopMBB);
  if (CmpOpcode)
    F->insert(It, loop2MBB);
  F->insert(It, exitMBB);
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  MachineRegisterInfo &RegInfo = F->getRegInfo();
  // With no binary op the incoming value is stored directly (swap).
  unsigned TmpReg = (!BinOpcode) ? incr :
    RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass
                                                   : &PPC::GPRCRegClass);

  //  thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(loopMBB);

  //  loopMBB:
  //   l[wd]arx dest, ptr
  //   add r0, dest, incr
  //   st[wd]cx. r0, ptr
  //   bne- loopMBB
  //   fallthrough --> exitMBB

  //  For max/min...
  //  loopMBB:
  //   l[wd]arx dest, ptr
  //   cmpl?[wd] incr, dest
  //   bgt exitMBB
  //  loop2MBB:
  //   st[wd]cx. dest, ptr
  //   bne- loopMBB
  //   fallthrough --> exitMBB

  BB = loopMBB;
  BuildMI(BB, dl, TII->get(LoadMnemonic), dest)
    .addReg(ptrA).addReg(ptrB);
  if (BinOpcode)
    BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);
  if (CmpOpcode) {
    // Signed comparisons of byte or halfword values must be sign-extended.
    if (CmpOpcode == PPC::CMPW && AtomicSize < 4) {
      unsigned ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
      BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH),
              ExtReg).addReg(dest);
      BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
        .addReg(incr).addReg(ExtReg);
    } else
      BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
        .addReg(incr).addReg(dest);

    BuildMI(BB, dl, TII->get(PPC::BCC))
      .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB);
    BB->addSuccessor(loop2MBB);
    BB->addSuccessor(exitMBB);
    BB = loop2MBB;
  }
  BuildMI(BB, dl, TII->get(StoreMnemonic))
    .addReg(TmpReg).addReg(ptrA).addReg(ptrB);
  BuildMI(BB, dl, TII->get(PPC::BCC))
    .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  //  exitMBB:
  //   ...
  BB = exitMBB;
  return BB;
}

MachineBasicBlock *
PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr &MI,
                                            MachineBasicBlock *BB,
                                            bool is8bit, // operation
                                            unsigned BinOpcode,
                                            unsigned CmpOpcode,
                                            unsigned CmpPred) const {
  // If we support part-word atomic mnemonics, just use them
  if (Subtarget.hasPartwordAtomics())
    return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode,
                            CmpOpcode, CmpPred);

  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  // In 64 bit mode we have to use 64 bits for addresses, even though the
  // lwarx/stwcx are 32 bits.  With the 32-bit atomics we can use address
  // registers without caring whether they're 32 or 64, but here we're
  // doing actual arithmetic on the addresses.
  bool is64bit = Subtarget.isPPC64();
  bool isLittleEndian = Subtarget.isLittleEndian();
  unsigned ZeroReg = is64bit ?
PPC::ZERO8 : PPC::ZERO; 9890 9891 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9892 MachineFunction *F = BB->getParent(); 9893 MachineFunction::iterator It = ++BB->getIterator(); 9894 9895 unsigned dest = MI.getOperand(0).getReg(); 9896 unsigned ptrA = MI.getOperand(1).getReg(); 9897 unsigned ptrB = MI.getOperand(2).getReg(); 9898 unsigned incr = MI.getOperand(3).getReg(); 9899 DebugLoc dl = MI.getDebugLoc(); 9900 9901 MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); 9902 MachineBasicBlock *loop2MBB = 9903 CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr; 9904 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); 9905 F->insert(It, loopMBB); 9906 if (CmpOpcode) 9907 F->insert(It, loop2MBB); 9908 F->insert(It, exitMBB); 9909 exitMBB->splice(exitMBB->begin(), BB, 9910 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9911 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 9912 9913 MachineRegisterInfo &RegInfo = F->getRegInfo(); 9914 const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass 9915 : &PPC::GPRCRegClass; 9916 unsigned PtrReg = RegInfo.createVirtualRegister(RC); 9917 unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); 9918 unsigned ShiftReg = 9919 isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC); 9920 unsigned Incr2Reg = RegInfo.createVirtualRegister(RC); 9921 unsigned MaskReg = RegInfo.createVirtualRegister(RC); 9922 unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); 9923 unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); 9924 unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); 9925 unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC); 9926 unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); 9927 unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); 9928 unsigned Ptr1Reg; 9929 unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC); 9930 9931 // thisMBB: 9932 // ... 
9933 // fallthrough --> loopMBB 9934 BB->addSuccessor(loopMBB); 9935 9936 // The 4-byte load must be aligned, while a char or short may be 9937 // anywhere in the word. Hence all this nasty bookkeeping code. 9938 // add ptr1, ptrA, ptrB [copy if ptrA==0] 9939 // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] 9940 // xori shift, shift1, 24 [16] 9941 // rlwinm ptr, ptr1, 0, 0, 29 9942 // slw incr2, incr, shift 9943 // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] 9944 // slw mask, mask2, shift 9945 // loopMBB: 9946 // lwarx tmpDest, ptr 9947 // add tmp, tmpDest, incr2 9948 // andc tmp2, tmpDest, mask 9949 // and tmp3, tmp, mask 9950 // or tmp4, tmp3, tmp2 9951 // stwcx. tmp4, ptr 9952 // bne- loopMBB 9953 // fallthrough --> exitMBB 9954 // srw dest, tmpDest, shift 9955 if (ptrA != ZeroReg) { 9956 Ptr1Reg = RegInfo.createVirtualRegister(RC); 9957 BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) 9958 .addReg(ptrA).addReg(ptrB); 9959 } else { 9960 Ptr1Reg = ptrB; 9961 } 9962 BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) 9963 .addImm(3).addImm(27).addImm(is8bit ? 28 : 27); 9964 if (!isLittleEndian) 9965 BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) 9966 .addReg(Shift1Reg).addImm(is8bit ? 
24 : 16); 9967 if (is64bit) 9968 BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) 9969 .addReg(Ptr1Reg).addImm(0).addImm(61); 9970 else 9971 BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) 9972 .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); 9973 BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg) 9974 .addReg(incr).addReg(ShiftReg); 9975 if (is8bit) 9976 BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); 9977 else { 9978 BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); 9979 BuildMI(BB, dl, TII->get(PPC::ORI),Mask2Reg).addReg(Mask3Reg).addImm(65535); 9980 } 9981 BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) 9982 .addReg(Mask2Reg).addReg(ShiftReg); 9983 9984 BB = loopMBB; 9985 BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) 9986 .addReg(ZeroReg).addReg(PtrReg); 9987 if (BinOpcode) 9988 BuildMI(BB, dl, TII->get(BinOpcode), TmpReg) 9989 .addReg(Incr2Reg).addReg(TmpDestReg); 9990 BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg) 9991 .addReg(TmpDestReg).addReg(MaskReg); 9992 BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg) 9993 .addReg(TmpReg).addReg(MaskReg); 9994 if (CmpOpcode) { 9995 // For unsigned comparisons, we can directly compare the shifted values. 9996 // For signed comparisons we shift and sign extend. 9997 unsigned SReg = RegInfo.createVirtualRegister(RC); 9998 BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), SReg) 9999 .addReg(TmpDestReg).addReg(MaskReg); 10000 unsigned ValueReg = SReg; 10001 unsigned CmpReg = Incr2Reg; 10002 if (CmpOpcode == PPC::CMPW) { 10003 ValueReg = RegInfo.createVirtualRegister(RC); 10004 BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg) 10005 .addReg(SReg).addReg(ShiftReg); 10006 unsigned ValueSReg = RegInfo.createVirtualRegister(RC); 10007 BuildMI(BB, dl, TII->get(is8bit ? 
PPC::EXTSB : PPC::EXTSH), ValueSReg) 10008 .addReg(ValueReg); 10009 ValueReg = ValueSReg; 10010 CmpReg = incr; 10011 } 10012 BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) 10013 .addReg(CmpReg).addReg(ValueReg); 10014 BuildMI(BB, dl, TII->get(PPC::BCC)) 10015 .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB); 10016 BB->addSuccessor(loop2MBB); 10017 BB->addSuccessor(exitMBB); 10018 BB = loop2MBB; 10019 } 10020 BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg) 10021 .addReg(Tmp3Reg).addReg(Tmp2Reg); 10022 BuildMI(BB, dl, TII->get(PPC::STWCX)) 10023 .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg); 10024 BuildMI(BB, dl, TII->get(PPC::BCC)) 10025 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); 10026 BB->addSuccessor(loopMBB); 10027 BB->addSuccessor(exitMBB); 10028 10029 // exitMBB: 10030 // ... 10031 BB = exitMBB; 10032 BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg) 10033 .addReg(ShiftReg); 10034 return BB; 10035 } 10036 10037 llvm::MachineBasicBlock * 10038 PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, 10039 MachineBasicBlock *MBB) const { 10040 DebugLoc DL = MI.getDebugLoc(); 10041 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 10042 const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); 10043 10044 MachineFunction *MF = MBB->getParent(); 10045 MachineRegisterInfo &MRI = MF->getRegInfo(); 10046 10047 const BasicBlock *BB = MBB->getBasicBlock(); 10048 MachineFunction::iterator I = ++MBB->getIterator(); 10049 10050 unsigned DstReg = MI.getOperand(0).getReg(); 10051 const TargetRegisterClass *RC = MRI.getRegClass(DstReg); 10052 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); 10053 unsigned mainDstReg = MRI.createVirtualRegister(RC); 10054 unsigned restoreDstReg = MRI.createVirtualRegister(RC); 10055 10056 MVT PVT = getPointerTy(MF->getDataLayout()); 10057 assert((PVT == MVT::i64 || PVT == MVT::i32) && 10058 "Invalid Pointer Size!"); 10059 // For v = setjmp(buf), we generate 
  //
  // thisMBB:
  //  SjLjSetup mainMBB
  //  bl mainMBB
  //  v_restore = 1
  //  b sinkMBB
  //
  // mainMBB:
  //  buf[LabelOffset] = LR
  //  v_main = 0
  //
  // sinkMBB:
  //  v = phi(main, restore)
  //

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);

  MachineInstrBuilder MIB;

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Note that the structure of the jmp_buf used here is not compatible
  // with that used by libc, and is not designed to be. Specifically, it
  // stores only those 'reserved' registers that LLVM does not otherwise
  // understand how to spill. Also, by convention, by the time this
  // intrinsic is called, Clang has already stored the frame address in the
  // first slot of the buffer and stack address in the third. Following the
  // X86 target code, we'll store the jump address in the second slot. We also
  // need to save the TOC pointer (R2) to handle jumps between shared
  // libraries, and that will be stored in the fourth slot. The thread
  // identifier (R13) is not affected.

  // thisMBB:
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t TOCOffset   = 3 * PVT.getStoreSize();
  const int64_t BPOffset    = 4 * PVT.getStoreSize();

  // Prepare IP either in reg.
  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
  unsigned LabelReg = MRI.createVirtualRegister(PtrRC);
  unsigned BufReg = MI.getOperand(1).getReg();

  // Save the TOC pointer into the buffer (64-bit ELF only).
  if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) {
    setUsesTOCBasePtr(*MBB->getParent());
    MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
            .addReg(PPC::X2)
            .addImm(TOCOffset)
            .addReg(BufReg)
            .cloneMemRefs(MI);
  }

  // Naked functions never have a base pointer, and so we use r1. For all
  // other functions, this decision must be delayed until during PEI.
  unsigned BaseReg;
  if (MF->getFunction().hasFnAttribute(Attribute::Naked))
    BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
  else
    BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;

  MIB = BuildMI(*thisMBB, MI, DL,
                TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW))
            .addReg(BaseReg)
            .addImm(BPOffset)
            .addReg(BufReg)
            .cloneMemRefs(MI);

  // Setup
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
  MIB.addRegMask(TRI->getNoPreservedMask());

  // Longjmp resumes here: report "returned via longjmp" (v = 1).
  BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);

  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup))
          .addMBB(mainMBB);
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB);

  thisMBB->addSuccessor(mainMBB, BranchProbability::getZero());
  thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne());

  // mainMBB:
  //  mainDstReg = 0
  // The bcl above put the resume address in LR; read it back.
  MIB =
      BuildMI(mainMBB, DL,
              TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);

  // Store IP
  if (Subtarget.isPPC64()) {
    MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD))
            .addReg(LabelReg)
            .addImm(LabelOffset)
            .addReg(BufReg);
  } else {
    MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW))
            .addReg(LabelReg)
            .addImm(LabelOffset)
            .addReg(BufReg);
  }
  MIB.cloneMemRefs(MI);

  // Normal (direct) path: v = 0.
  BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(PPC::PHI), DstReg)
    .addReg(mainDstReg).addMBB(mainMBB)
    .addReg(restoreDstReg).addMBB(thisMBB);

  MI.eraseFromParent();
  return sinkMBB;
}

// Lower the EH_SjLj_LongJmp pseudo: restore FP/SP/BP (and TOC on 64-bit
// ELF) from the buffer, then branch to the saved resume address via CTR.
MachineBasicBlock *
PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");

  const TargetRegisterClass *RC =
    (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
  unsigned Tmp = MRI.createVirtualRegister(RC);
  // Since FP is only updated here but NOT referenced, it's treated as GPR.
  unsigned FP  = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
  unsigned SP  = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
  unsigned BP =
      (PVT == MVT::i64)
          ? PPC::X30
          : (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29
                                                              : PPC::R30);

  MachineInstrBuilder MIB;

  // Buffer slot layout (must match emitEHSjLjSetJmp above).
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t SPOffset    = 2 * PVT.getStoreSize();
  const int64_t TOCOffset   = 3 * PVT.getStoreSize();
  const int64_t BPOffset    = 4 * PVT.getStoreSize();

  unsigned BufReg = MI.getOperand(0).getReg();

  // Reload FP (the jumped-to function may not have had a
  // frame pointer, and if so, then its r31 will be restored
  // as necessary).
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP)
            .addImm(0)
            .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP)
            .addImm(0)
            .addReg(BufReg);
  }
  MIB.cloneMemRefs(MI);

  // Reload IP
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp)
            .addImm(LabelOffset)
            .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp)
            .addImm(LabelOffset)
            .addReg(BufReg);
  }
  MIB.cloneMemRefs(MI);

  // Reload SP
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP)
            .addImm(SPOffset)
            .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP)
            .addImm(SPOffset)
            .addReg(BufReg);
  }
  MIB.cloneMemRefs(MI);

  // Reload BP
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP)
            .addImm(BPOffset)
            .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP)
            .addImm(BPOffset)
            .addReg(BufReg);
  }
  MIB.cloneMemRefs(MI);

  // Reload TOC
  if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) {
    setUsesTOCBasePtr(*MBB->getParent());
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2)
            .addImm(TOCOffset)
            .addReg(BufReg)
            .cloneMemRefs(MI);
  }
10271 10272 // Jump 10273 BuildMI(*MBB, MI, DL, 10274 TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp); 10275 BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR)); 10276 10277 MI.eraseFromParent(); 10278 return MBB; 10279 } 10280 10281 MachineBasicBlock * 10282 PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 10283 MachineBasicBlock *BB) const { 10284 if (MI.getOpcode() == TargetOpcode::STACKMAP || 10285 MI.getOpcode() == TargetOpcode::PATCHPOINT) { 10286 if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() && 10287 MI.getOpcode() == TargetOpcode::PATCHPOINT) { 10288 // Call lowering should have added an r2 operand to indicate a dependence 10289 // on the TOC base pointer value. It can't however, because there is no 10290 // way to mark the dependence as implicit there, and so the stackmap code 10291 // will confuse it with a regular operand. Instead, add the dependence 10292 // here. 10293 setUsesTOCBasePtr(*BB->getParent()); 10294 MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true)); 10295 } 10296 10297 return emitPatchPoint(MI, BB); 10298 } 10299 10300 if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 || 10301 MI.getOpcode() == PPC::EH_SjLj_SetJmp64) { 10302 return emitEHSjLjSetJmp(MI, BB); 10303 } else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 || 10304 MI.getOpcode() == PPC::EH_SjLj_LongJmp64) { 10305 return emitEHSjLjLongJmp(MI, BB); 10306 } 10307 10308 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 10309 10310 // To "insert" these instructions we actually have to insert their 10311 // control-flow patterns. 
10312 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10313 MachineFunction::iterator It = ++BB->getIterator(); 10314 10315 MachineFunction *F = BB->getParent(); 10316 10317 if (MI.getOpcode() == PPC::SELECT_CC_I4 || 10318 MI.getOpcode() == PPC::SELECT_CC_I8 || 10319 MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8) { 10320 SmallVector<MachineOperand, 2> Cond; 10321 if (MI.getOpcode() == PPC::SELECT_CC_I4 || 10322 MI.getOpcode() == PPC::SELECT_CC_I8) 10323 Cond.push_back(MI.getOperand(4)); 10324 else 10325 Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET)); 10326 Cond.push_back(MI.getOperand(1)); 10327 10328 DebugLoc dl = MI.getDebugLoc(); 10329 TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond, 10330 MI.getOperand(2).getReg(), MI.getOperand(3).getReg()); 10331 } else if (MI.getOpcode() == PPC::SELECT_CC_I4 || 10332 MI.getOpcode() == PPC::SELECT_CC_I8 || 10333 MI.getOpcode() == PPC::SELECT_CC_F4 || 10334 MI.getOpcode() == PPC::SELECT_CC_F8 || 10335 MI.getOpcode() == PPC::SELECT_CC_F16 || 10336 MI.getOpcode() == PPC::SELECT_CC_QFRC || 10337 MI.getOpcode() == PPC::SELECT_CC_QSRC || 10338 MI.getOpcode() == PPC::SELECT_CC_QBRC || 10339 MI.getOpcode() == PPC::SELECT_CC_VRRC || 10340 MI.getOpcode() == PPC::SELECT_CC_VSFRC || 10341 MI.getOpcode() == PPC::SELECT_CC_VSSRC || 10342 MI.getOpcode() == PPC::SELECT_CC_VSRC || 10343 MI.getOpcode() == PPC::SELECT_CC_SPE4 || 10344 MI.getOpcode() == PPC::SELECT_CC_SPE || 10345 MI.getOpcode() == PPC::SELECT_I4 || 10346 MI.getOpcode() == PPC::SELECT_I8 || 10347 MI.getOpcode() == PPC::SELECT_F4 || 10348 MI.getOpcode() == PPC::SELECT_F8 || 10349 MI.getOpcode() == PPC::SELECT_F16 || 10350 MI.getOpcode() == PPC::SELECT_QFRC || 10351 MI.getOpcode() == PPC::SELECT_QSRC || 10352 MI.getOpcode() == PPC::SELECT_QBRC || 10353 MI.getOpcode() == PPC::SELECT_SPE || 10354 MI.getOpcode() == PPC::SELECT_SPE4 || 10355 MI.getOpcode() == PPC::SELECT_VRRC || 10356 MI.getOpcode() == PPC::SELECT_VSFRC || 10357 
MI.getOpcode() == PPC::SELECT_VSSRC || 10358 MI.getOpcode() == PPC::SELECT_VSRC) { 10359 // The incoming instruction knows the destination vreg to set, the 10360 // condition code register to branch on, the true/false values to 10361 // select between, and a branch opcode to use. 10362 10363 // thisMBB: 10364 // ... 10365 // TrueVal = ... 10366 // cmpTY ccX, r1, r2 10367 // bCC copy1MBB 10368 // fallthrough --> copy0MBB 10369 MachineBasicBlock *thisMBB = BB; 10370 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 10371 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 10372 DebugLoc dl = MI.getDebugLoc(); 10373 F->insert(It, copy0MBB); 10374 F->insert(It, sinkMBB); 10375 10376 // Transfer the remainder of BB and its successor edges to sinkMBB. 10377 sinkMBB->splice(sinkMBB->begin(), BB, 10378 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10379 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 10380 10381 // Next, add the true and fallthrough blocks as its successors. 
10382 BB->addSuccessor(copy0MBB); 10383 BB->addSuccessor(sinkMBB); 10384 10385 if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 || 10386 MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || 10387 MI.getOpcode() == PPC::SELECT_F16 || 10388 MI.getOpcode() == PPC::SELECT_SPE4 || 10389 MI.getOpcode() == PPC::SELECT_SPE || 10390 MI.getOpcode() == PPC::SELECT_QFRC || 10391 MI.getOpcode() == PPC::SELECT_QSRC || 10392 MI.getOpcode() == PPC::SELECT_QBRC || 10393 MI.getOpcode() == PPC::SELECT_VRRC || 10394 MI.getOpcode() == PPC::SELECT_VSFRC || 10395 MI.getOpcode() == PPC::SELECT_VSSRC || 10396 MI.getOpcode() == PPC::SELECT_VSRC) { 10397 BuildMI(BB, dl, TII->get(PPC::BC)) 10398 .addReg(MI.getOperand(1).getReg()) 10399 .addMBB(sinkMBB); 10400 } else { 10401 unsigned SelectPred = MI.getOperand(4).getImm(); 10402 BuildMI(BB, dl, TII->get(PPC::BCC)) 10403 .addImm(SelectPred) 10404 .addReg(MI.getOperand(1).getReg()) 10405 .addMBB(sinkMBB); 10406 } 10407 10408 // copy0MBB: 10409 // %FalseValue = ... 10410 // # fallthrough to sinkMBB 10411 BB = copy0MBB; 10412 10413 // Update machine-CFG edges 10414 BB->addSuccessor(sinkMBB); 10415 10416 // sinkMBB: 10417 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 10418 // ... 10419 BB = sinkMBB; 10420 BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg()) 10421 .addReg(MI.getOperand(3).getReg()) 10422 .addMBB(copy0MBB) 10423 .addReg(MI.getOperand(2).getReg()) 10424 .addMBB(thisMBB); 10425 } else if (MI.getOpcode() == PPC::ReadTB) { 10426 // To read the 64-bit time-base register on a 32-bit target, we read the 10427 // two halves. Should the counter have wrapped while it was being read, we 10428 // need to try again. 10429 // ... 
10430 // readLoop: 10431 // mfspr Rx,TBU # load from TBU 10432 // mfspr Ry,TB # load from TB 10433 // mfspr Rz,TBU # load from TBU 10434 // cmpw crX,Rx,Rz # check if 'old'='new' 10435 // bne readLoop # branch if they're not equal 10436 // ... 10437 10438 MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB); 10439 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 10440 DebugLoc dl = MI.getDebugLoc(); 10441 F->insert(It, readMBB); 10442 F->insert(It, sinkMBB); 10443 10444 // Transfer the remainder of BB and its successor edges to sinkMBB. 10445 sinkMBB->splice(sinkMBB->begin(), BB, 10446 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10447 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 10448 10449 BB->addSuccessor(readMBB); 10450 BB = readMBB; 10451 10452 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10453 unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); 10454 unsigned LoReg = MI.getOperand(0).getReg(); 10455 unsigned HiReg = MI.getOperand(1).getReg(); 10456 10457 BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269); 10458 BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268); 10459 BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269); 10460 10461 unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); 10462 10463 BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg) 10464 .addReg(HiReg).addReg(ReadAgainReg); 10465 BuildMI(BB, dl, TII->get(PPC::BCC)) 10466 .addImm(PPC::PRED_NE).addReg(CmpReg).addMBB(readMBB); 10467 10468 BB->addSuccessor(readMBB); 10469 BB->addSuccessor(sinkMBB); 10470 } else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8) 10471 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4); 10472 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16) 10473 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4); 10474 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32) 10475 BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4); 10476 else if (MI.getOpcode() == 
PPC::ATOMIC_LOAD_ADD_I64) 10477 BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8); 10478 10479 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8) 10480 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND); 10481 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16) 10482 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND); 10483 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32) 10484 BB = EmitAtomicBinary(MI, BB, 4, PPC::AND); 10485 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64) 10486 BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8); 10487 10488 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8) 10489 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR); 10490 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16) 10491 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR); 10492 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32) 10493 BB = EmitAtomicBinary(MI, BB, 4, PPC::OR); 10494 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64) 10495 BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8); 10496 10497 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8) 10498 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR); 10499 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16) 10500 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR); 10501 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32) 10502 BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR); 10503 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64) 10504 BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8); 10505 10506 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8) 10507 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND); 10508 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16) 10509 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND); 10510 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32) 10511 BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND); 10512 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64) 10513 BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8); 10514 10515 else if (MI.getOpcode() == 
PPC::ATOMIC_LOAD_SUB_I8) 10516 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF); 10517 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16) 10518 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF); 10519 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32) 10520 BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF); 10521 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64) 10522 BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8); 10523 10524 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8) 10525 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GE); 10526 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16) 10527 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GE); 10528 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32) 10529 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GE); 10530 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64) 10531 BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GE); 10532 10533 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8) 10534 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LE); 10535 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16) 10536 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LE); 10537 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32) 10538 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LE); 10539 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64) 10540 BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LE); 10541 10542 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8) 10543 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GE); 10544 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16) 10545 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GE); 10546 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32) 10547 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GE); 10548 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64) 10549 BB = 
EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GE); 10550 10551 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8) 10552 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LE); 10553 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16) 10554 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LE); 10555 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32) 10556 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LE); 10557 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64) 10558 BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LE); 10559 10560 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8) 10561 BB = EmitPartwordAtomicBinary(MI, BB, true, 0); 10562 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16) 10563 BB = EmitPartwordAtomicBinary(MI, BB, false, 0); 10564 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32) 10565 BB = EmitAtomicBinary(MI, BB, 4, 0); 10566 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64) 10567 BB = EmitAtomicBinary(MI, BB, 8, 0); 10568 else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 || 10569 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 || 10570 (Subtarget.hasPartwordAtomics() && 10571 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) || 10572 (Subtarget.hasPartwordAtomics() && 10573 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) { 10574 bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64; 10575 10576 auto LoadMnemonic = PPC::LDARX; 10577 auto StoreMnemonic = PPC::STDCX; 10578 switch (MI.getOpcode()) { 10579 default: 10580 llvm_unreachable("Compare and swap of unknown size"); 10581 case PPC::ATOMIC_CMP_SWAP_I8: 10582 LoadMnemonic = PPC::LBARX; 10583 StoreMnemonic = PPC::STBCX; 10584 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); 10585 break; 10586 case PPC::ATOMIC_CMP_SWAP_I16: 10587 LoadMnemonic = PPC::LHARX; 10588 StoreMnemonic = PPC::STHCX; 10589 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); 10590 break; 10591 case 
PPC::ATOMIC_CMP_SWAP_I32: 10592 LoadMnemonic = PPC::LWARX; 10593 StoreMnemonic = PPC::STWCX; 10594 break; 10595 case PPC::ATOMIC_CMP_SWAP_I64: 10596 LoadMnemonic = PPC::LDARX; 10597 StoreMnemonic = PPC::STDCX; 10598 break; 10599 } 10600 unsigned dest = MI.getOperand(0).getReg(); 10601 unsigned ptrA = MI.getOperand(1).getReg(); 10602 unsigned ptrB = MI.getOperand(2).getReg(); 10603 unsigned oldval = MI.getOperand(3).getReg(); 10604 unsigned newval = MI.getOperand(4).getReg(); 10605 DebugLoc dl = MI.getDebugLoc(); 10606 10607 MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); 10608 MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); 10609 MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); 10610 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); 10611 F->insert(It, loop1MBB); 10612 F->insert(It, loop2MBB); 10613 F->insert(It, midMBB); 10614 F->insert(It, exitMBB); 10615 exitMBB->splice(exitMBB->begin(), BB, 10616 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10617 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 10618 10619 // thisMBB: 10620 // ... 10621 // fallthrough --> loopMBB 10622 BB->addSuccessor(loop1MBB); 10623 10624 // loop1MBB: 10625 // l[bhwd]arx dest, ptr 10626 // cmp[wd] dest, oldval 10627 // bne- midMBB 10628 // loop2MBB: 10629 // st[bhwd]cx. newval, ptr 10630 // bne- loopMBB 10631 // b exitBB 10632 // midMBB: 10633 // st[bhwd]cx. dest, ptr 10634 // exitBB: 10635 BB = loop1MBB; 10636 BuildMI(BB, dl, TII->get(LoadMnemonic), dest) 10637 .addReg(ptrA).addReg(ptrB); 10638 BuildMI(BB, dl, TII->get(is64bit ? 
PPC::CMPD : PPC::CMPW), PPC::CR0) 10639 .addReg(oldval).addReg(dest); 10640 BuildMI(BB, dl, TII->get(PPC::BCC)) 10641 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); 10642 BB->addSuccessor(loop2MBB); 10643 BB->addSuccessor(midMBB); 10644 10645 BB = loop2MBB; 10646 BuildMI(BB, dl, TII->get(StoreMnemonic)) 10647 .addReg(newval).addReg(ptrA).addReg(ptrB); 10648 BuildMI(BB, dl, TII->get(PPC::BCC)) 10649 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); 10650 BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); 10651 BB->addSuccessor(loop1MBB); 10652 BB->addSuccessor(exitMBB); 10653 10654 BB = midMBB; 10655 BuildMI(BB, dl, TII->get(StoreMnemonic)) 10656 .addReg(dest).addReg(ptrA).addReg(ptrB); 10657 BB->addSuccessor(exitMBB); 10658 10659 // exitMBB: 10660 // ... 10661 BB = exitMBB; 10662 } else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 || 10663 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) { 10664 // We must use 64-bit registers for addresses when targeting 64-bit, 10665 // since we're actually doing arithmetic on them. Other registers 10666 // can be 32-bit. 
10667 bool is64bit = Subtarget.isPPC64(); 10668 bool isLittleEndian = Subtarget.isLittleEndian(); 10669 bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; 10670 10671 unsigned dest = MI.getOperand(0).getReg(); 10672 unsigned ptrA = MI.getOperand(1).getReg(); 10673 unsigned ptrB = MI.getOperand(2).getReg(); 10674 unsigned oldval = MI.getOperand(3).getReg(); 10675 unsigned newval = MI.getOperand(4).getReg(); 10676 DebugLoc dl = MI.getDebugLoc(); 10677 10678 MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); 10679 MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); 10680 MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); 10681 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); 10682 F->insert(It, loop1MBB); 10683 F->insert(It, loop2MBB); 10684 F->insert(It, midMBB); 10685 F->insert(It, exitMBB); 10686 exitMBB->splice(exitMBB->begin(), BB, 10687 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10688 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 10689 10690 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10691 const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass 10692 : &PPC::GPRCRegClass; 10693 unsigned PtrReg = RegInfo.createVirtualRegister(RC); 10694 unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); 10695 unsigned ShiftReg = 10696 isLittleEndian ? 
Shift1Reg : RegInfo.createVirtualRegister(RC); 10697 unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC); 10698 unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC); 10699 unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC); 10700 unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC); 10701 unsigned MaskReg = RegInfo.createVirtualRegister(RC); 10702 unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); 10703 unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); 10704 unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); 10705 unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); 10706 unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); 10707 unsigned Ptr1Reg; 10708 unsigned TmpReg = RegInfo.createVirtualRegister(RC); 10709 unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; 10710 // thisMBB: 10711 // ... 10712 // fallthrough --> loopMBB 10713 BB->addSuccessor(loop1MBB); 10714 10715 // The 4-byte load must be aligned, while a char or short may be 10716 // anywhere in the word. Hence all this nasty bookkeeping code. 10717 // add ptr1, ptrA, ptrB [copy if ptrA==0] 10718 // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] 10719 // xori shift, shift1, 24 [16] 10720 // rlwinm ptr, ptr1, 0, 0, 29 10721 // slw newval2, newval, shift 10722 // slw oldval2, oldval,shift 10723 // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] 10724 // slw mask, mask2, shift 10725 // and newval3, newval2, mask 10726 // and oldval3, oldval2, mask 10727 // loop1MBB: 10728 // lwarx tmpDest, ptr 10729 // and tmp, tmpDest, mask 10730 // cmpw tmp, oldval3 10731 // bne- midMBB 10732 // loop2MBB: 10733 // andc tmp2, tmpDest, mask 10734 // or tmp4, tmp2, newval3 10735 // stwcx. tmp4, ptr 10736 // bne- loop1MBB 10737 // b exitBB 10738 // midMBB: 10739 // stwcx. tmpDest, ptr 10740 // exitBB: 10741 // srw dest, tmpDest, shift 10742 if (ptrA != ZeroReg) { 10743 Ptr1Reg = RegInfo.createVirtualRegister(RC); 10744 BuildMI(BB, dl, TII->get(is64bit ? 
PPC::ADD8 : PPC::ADD4), Ptr1Reg) 10745 .addReg(ptrA).addReg(ptrB); 10746 } else { 10747 Ptr1Reg = ptrB; 10748 } 10749 BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) 10750 .addImm(3).addImm(27).addImm(is8bit ? 28 : 27); 10751 if (!isLittleEndian) 10752 BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) 10753 .addReg(Shift1Reg).addImm(is8bit ? 24 : 16); 10754 if (is64bit) 10755 BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) 10756 .addReg(Ptr1Reg).addImm(0).addImm(61); 10757 else 10758 BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) 10759 .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); 10760 BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg) 10761 .addReg(newval).addReg(ShiftReg); 10762 BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg) 10763 .addReg(oldval).addReg(ShiftReg); 10764 if (is8bit) 10765 BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); 10766 else { 10767 BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); 10768 BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg) 10769 .addReg(Mask3Reg).addImm(65535); 10770 } 10771 BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) 10772 .addReg(Mask2Reg).addReg(ShiftReg); 10773 BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg) 10774 .addReg(NewVal2Reg).addReg(MaskReg); 10775 BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg) 10776 .addReg(OldVal2Reg).addReg(MaskReg); 10777 10778 BB = loop1MBB; 10779 BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) 10780 .addReg(ZeroReg).addReg(PtrReg); 10781 BuildMI(BB, dl, TII->get(PPC::AND),TmpReg) 10782 .addReg(TmpDestReg).addReg(MaskReg); 10783 BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0) 10784 .addReg(TmpReg).addReg(OldVal3Reg); 10785 BuildMI(BB, dl, TII->get(PPC::BCC)) 10786 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); 10787 BB->addSuccessor(loop2MBB); 10788 BB->addSuccessor(midMBB); 10789 10790 BB = loop2MBB; 10791 BuildMI(BB, dl, TII->get(PPC::ANDC),Tmp2Reg) 10792 .addReg(TmpDestReg).addReg(MaskReg); 10793 BuildMI(BB, dl, 
TII->get(PPC::OR),Tmp4Reg) 10794 .addReg(Tmp2Reg).addReg(NewVal3Reg); 10795 BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg) 10796 .addReg(ZeroReg).addReg(PtrReg); 10797 BuildMI(BB, dl, TII->get(PPC::BCC)) 10798 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); 10799 BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); 10800 BB->addSuccessor(loop1MBB); 10801 BB->addSuccessor(exitMBB); 10802 10803 BB = midMBB; 10804 BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg) 10805 .addReg(ZeroReg).addReg(PtrReg); 10806 BB->addSuccessor(exitMBB); 10807 10808 // exitMBB: 10809 // ... 10810 BB = exitMBB; 10811 BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg) 10812 .addReg(ShiftReg); 10813 } else if (MI.getOpcode() == PPC::FADDrtz) { 10814 // This pseudo performs an FADD with rounding mode temporarily forced 10815 // to round-to-zero. We emit this via custom inserter since the FPSCR 10816 // is not modeled at the SelectionDAG level. 10817 unsigned Dest = MI.getOperand(0).getReg(); 10818 unsigned Src1 = MI.getOperand(1).getReg(); 10819 unsigned Src2 = MI.getOperand(2).getReg(); 10820 DebugLoc dl = MI.getDebugLoc(); 10821 10822 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10823 unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); 10824 10825 // Save FPSCR value. 10826 BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg); 10827 10828 // Set rounding mode to round-to-zero. 10829 BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31); 10830 BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30); 10831 10832 // Perform addition. 10833 BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2); 10834 10835 // Restore FPSCR value. 
10836 BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg); 10837 } else if (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || 10838 MI.getOpcode() == PPC::ANDIo_1_GT_BIT || 10839 MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || 10840 MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) { 10841 unsigned Opcode = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || 10842 MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) 10843 ? PPC::ANDIo8 10844 : PPC::ANDIo; 10845 bool isEQ = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || 10846 MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8); 10847 10848 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10849 unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ? 10850 &PPC::GPRCRegClass : 10851 &PPC::G8RCRegClass); 10852 10853 DebugLoc dl = MI.getDebugLoc(); 10854 BuildMI(*BB, MI, dl, TII->get(Opcode), Dest) 10855 .addReg(MI.getOperand(1).getReg()) 10856 .addImm(1); 10857 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), 10858 MI.getOperand(0).getReg()) 10859 .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT); 10860 } else if (MI.getOpcode() == PPC::TCHECK_RET) { 10861 DebugLoc Dl = MI.getDebugLoc(); 10862 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10863 unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); 10864 BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg); 10865 return BB; 10866 } else { 10867 llvm_unreachable("Unexpected instr type to insert"); 10868 } 10869 10870 MI.eraseFromParent(); // The pseudo instruction is gone now. 10871 return BB; 10872 } 10873 10874 //===----------------------------------------------------------------------===// 10875 // Target Optimization Hooks 10876 //===----------------------------------------------------------------------===// 10877 10878 static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) { 10879 // For the estimates, convergence is quadratic, so we essentially double the 10880 // number of digits correct after every iteration. 
For both FRE and FRSQRTE, 10881 // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(), 10882 // this is 2^-14. IEEE float has 23 digits and double has 52 digits. 10883 int RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3; 10884 if (VT.getScalarType() == MVT::f64) 10885 RefinementSteps++; 10886 return RefinementSteps; 10887 } 10888 10889 SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, 10890 int Enabled, int &RefinementSteps, 10891 bool &UseOneConstNR, 10892 bool Reciprocal) const { 10893 EVT VT = Operand.getValueType(); 10894 if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || 10895 (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || 10896 (VT == MVT::v4f32 && Subtarget.hasAltivec()) || 10897 (VT == MVT::v2f64 && Subtarget.hasVSX()) || 10898 (VT == MVT::v4f32 && Subtarget.hasQPX()) || 10899 (VT == MVT::v4f64 && Subtarget.hasQPX())) { 10900 if (RefinementSteps == ReciprocalEstimate::Unspecified) 10901 RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); 10902 10903 UseOneConstNR = true; 10904 return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand); 10905 } 10906 return SDValue(); 10907 } 10908 10909 SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG, 10910 int Enabled, 10911 int &RefinementSteps) const { 10912 EVT VT = Operand.getValueType(); 10913 if ((VT == MVT::f32 && Subtarget.hasFRES()) || 10914 (VT == MVT::f64 && Subtarget.hasFRE()) || 10915 (VT == MVT::v4f32 && Subtarget.hasAltivec()) || 10916 (VT == MVT::v2f64 && Subtarget.hasVSX()) || 10917 (VT == MVT::v4f32 && Subtarget.hasQPX()) || 10918 (VT == MVT::v4f64 && Subtarget.hasQPX())) { 10919 if (RefinementSteps == ReciprocalEstimate::Unspecified) 10920 RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); 10921 return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand); 10922 } 10923 return SDValue(); 10924 } 10925 10926 unsigned PPCTargetLowering::combineRepeatedFPDivisors() const { 10927 // Note: This 
functionality is used only when unsafe-fp-math is enabled, and 10928 // on cores with reciprocal estimates (which are used when unsafe-fp-math is 10929 // enabled for division), this functionality is redundant with the default 10930 // combiner logic (once the division -> reciprocal/multiply transformation 10931 // has taken place). As a result, this matters more for older cores than for 10932 // newer ones. 10933 10934 // Combine multiple FDIVs with the same divisor into multiple FMULs by the 10935 // reciprocal if there are two or more FDIVs (for embedded cores with only 10936 // one FP pipeline) for three or more FDIVs (for generic OOO cores). 10937 switch (Subtarget.getDarwinDirective()) { 10938 default: 10939 return 3; 10940 case PPC::DIR_440: 10941 case PPC::DIR_A2: 10942 case PPC::DIR_E500: 10943 case PPC::DIR_E500mc: 10944 case PPC::DIR_E5500: 10945 return 2; 10946 } 10947 } 10948 10949 // isConsecutiveLSLoc needs to work even if all adds have not yet been 10950 // collapsed, and so we need to look through chains of them. 10951 static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base, 10952 int64_t& Offset, SelectionDAG &DAG) { 10953 if (DAG.isBaseWithConstantOffset(Loc)) { 10954 Base = Loc.getOperand(0); 10955 Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue(); 10956 10957 // The base might itself be a base plus an offset, and if so, accumulate 10958 // that as well. 
10959 getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG); 10960 } 10961 } 10962 10963 static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base, 10964 unsigned Bytes, int Dist, 10965 SelectionDAG &DAG) { 10966 if (VT.getSizeInBits() / 8 != Bytes) 10967 return false; 10968 10969 SDValue BaseLoc = Base->getBasePtr(); 10970 if (Loc.getOpcode() == ISD::FrameIndex) { 10971 if (BaseLoc.getOpcode() != ISD::FrameIndex) 10972 return false; 10973 const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 10974 int FI = cast<FrameIndexSDNode>(Loc)->getIndex(); 10975 int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex(); 10976 int FS = MFI.getObjectSize(FI); 10977 int BFS = MFI.getObjectSize(BFI); 10978 if (FS != BFS || FS != (int)Bytes) return false; 10979 return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes); 10980 } 10981 10982 SDValue Base1 = Loc, Base2 = BaseLoc; 10983 int64_t Offset1 = 0, Offset2 = 0; 10984 getBaseWithConstantOffset(Loc, Base1, Offset1, DAG); 10985 getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG); 10986 if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes)) 10987 return true; 10988 10989 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10990 const GlobalValue *GV1 = nullptr; 10991 const GlobalValue *GV2 = nullptr; 10992 Offset1 = 0; 10993 Offset2 = 0; 10994 bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1); 10995 bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); 10996 if (isGA1 && isGA2 && GV1 == GV2) 10997 return Offset1 == (Offset2 + Dist*Bytes); 10998 return false; 10999 } 11000 11001 // Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does 11002 // not enforce equality of the chain operands. 
11003 static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, 11004 unsigned Bytes, int Dist, 11005 SelectionDAG &DAG) { 11006 if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) { 11007 EVT VT = LS->getMemoryVT(); 11008 SDValue Loc = LS->getBasePtr(); 11009 return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG); 11010 } 11011 11012 if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { 11013 EVT VT; 11014 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 11015 default: return false; 11016 case Intrinsic::ppc_qpx_qvlfd: 11017 case Intrinsic::ppc_qpx_qvlfda: 11018 VT = MVT::v4f64; 11019 break; 11020 case Intrinsic::ppc_qpx_qvlfs: 11021 case Intrinsic::ppc_qpx_qvlfsa: 11022 VT = MVT::v4f32; 11023 break; 11024 case Intrinsic::ppc_qpx_qvlfcd: 11025 case Intrinsic::ppc_qpx_qvlfcda: 11026 VT = MVT::v2f64; 11027 break; 11028 case Intrinsic::ppc_qpx_qvlfcs: 11029 case Intrinsic::ppc_qpx_qvlfcsa: 11030 VT = MVT::v2f32; 11031 break; 11032 case Intrinsic::ppc_qpx_qvlfiwa: 11033 case Intrinsic::ppc_qpx_qvlfiwz: 11034 case Intrinsic::ppc_altivec_lvx: 11035 case Intrinsic::ppc_altivec_lvxl: 11036 case Intrinsic::ppc_vsx_lxvw4x: 11037 case Intrinsic::ppc_vsx_lxvw4x_be: 11038 VT = MVT::v4i32; 11039 break; 11040 case Intrinsic::ppc_vsx_lxvd2x: 11041 case Intrinsic::ppc_vsx_lxvd2x_be: 11042 VT = MVT::v2f64; 11043 break; 11044 case Intrinsic::ppc_altivec_lvebx: 11045 VT = MVT::i8; 11046 break; 11047 case Intrinsic::ppc_altivec_lvehx: 11048 VT = MVT::i16; 11049 break; 11050 case Intrinsic::ppc_altivec_lvewx: 11051 VT = MVT::i32; 11052 break; 11053 } 11054 11055 return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG); 11056 } 11057 11058 if (N->getOpcode() == ISD::INTRINSIC_VOID) { 11059 EVT VT; 11060 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 11061 default: return false; 11062 case Intrinsic::ppc_qpx_qvstfd: 11063 case Intrinsic::ppc_qpx_qvstfda: 11064 VT = MVT::v4f64; 11065 break; 11066 case Intrinsic::ppc_qpx_qvstfs: 11067 case 
Intrinsic::ppc_qpx_qvstfsa: 11068 VT = MVT::v4f32; 11069 break; 11070 case Intrinsic::ppc_qpx_qvstfcd: 11071 case Intrinsic::ppc_qpx_qvstfcda: 11072 VT = MVT::v2f64; 11073 break; 11074 case Intrinsic::ppc_qpx_qvstfcs: 11075 case Intrinsic::ppc_qpx_qvstfcsa: 11076 VT = MVT::v2f32; 11077 break; 11078 case Intrinsic::ppc_qpx_qvstfiw: 11079 case Intrinsic::ppc_qpx_qvstfiwa: 11080 case Intrinsic::ppc_altivec_stvx: 11081 case Intrinsic::ppc_altivec_stvxl: 11082 case Intrinsic::ppc_vsx_stxvw4x: 11083 VT = MVT::v4i32; 11084 break; 11085 case Intrinsic::ppc_vsx_stxvd2x: 11086 VT = MVT::v2f64; 11087 break; 11088 case Intrinsic::ppc_vsx_stxvw4x_be: 11089 VT = MVT::v4i32; 11090 break; 11091 case Intrinsic::ppc_vsx_stxvd2x_be: 11092 VT = MVT::v2f64; 11093 break; 11094 case Intrinsic::ppc_altivec_stvebx: 11095 VT = MVT::i8; 11096 break; 11097 case Intrinsic::ppc_altivec_stvehx: 11098 VT = MVT::i16; 11099 break; 11100 case Intrinsic::ppc_altivec_stvewx: 11101 VT = MVT::i32; 11102 break; 11103 } 11104 11105 return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG); 11106 } 11107 11108 return false; 11109 } 11110 11111 // Return true is there is a nearyby consecutive load to the one provided 11112 // (regardless of alignment). We search up and down the chain, looking though 11113 // token factors and other loads (but nothing else). As a result, a true result 11114 // indicates that it is safe to create a new consecutive load adjacent to the 11115 // load provided. 11116 static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { 11117 SDValue Chain = LD->getChain(); 11118 EVT VT = LD->getMemoryVT(); 11119 11120 SmallSet<SDNode *, 16> LoadRoots; 11121 SmallVector<SDNode *, 8> Queue(1, Chain.getNode()); 11122 SmallSet<SDNode *, 16> Visited; 11123 11124 // First, search up the chain, branching to follow all token-factor operands. 
11125 // If we find a consecutive load, then we're done, otherwise, record all 11126 // nodes just above the top-level loads and token factors. 11127 while (!Queue.empty()) { 11128 SDNode *ChainNext = Queue.pop_back_val(); 11129 if (!Visited.insert(ChainNext).second) 11130 continue; 11131 11132 if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) { 11133 if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) 11134 return true; 11135 11136 if (!Visited.count(ChainLD->getChain().getNode())) 11137 Queue.push_back(ChainLD->getChain().getNode()); 11138 } else if (ChainNext->getOpcode() == ISD::TokenFactor) { 11139 for (const SDUse &O : ChainNext->ops()) 11140 if (!Visited.count(O.getNode())) 11141 Queue.push_back(O.getNode()); 11142 } else 11143 LoadRoots.insert(ChainNext); 11144 } 11145 11146 // Second, search down the chain, starting from the top-level nodes recorded 11147 // in the first phase. These top-level nodes are the nodes just above all 11148 // loads and token factors. Starting with their uses, recursively look though 11149 // all loads (just the chain uses) and token factors to find a consecutive 11150 // load. 
11151 Visited.clear(); 11152 Queue.clear(); 11153 11154 for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(), 11155 IE = LoadRoots.end(); I != IE; ++I) { 11156 Queue.push_back(*I); 11157 11158 while (!Queue.empty()) { 11159 SDNode *LoadRoot = Queue.pop_back_val(); 11160 if (!Visited.insert(LoadRoot).second) 11161 continue; 11162 11163 if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot)) 11164 if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) 11165 return true; 11166 11167 for (SDNode::use_iterator UI = LoadRoot->use_begin(), 11168 UE = LoadRoot->use_end(); UI != UE; ++UI) 11169 if (((isa<MemSDNode>(*UI) && 11170 cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) || 11171 UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI)) 11172 Queue.push_back(*UI); 11173 } 11174 } 11175 11176 return false; 11177 } 11178 11179 /// This function is called when we have proved that a SETCC node can be replaced 11180 /// by subtraction (and other supporting instructions) so that the result of 11181 /// comparison is kept in a GPR instead of CR. This function is purely for 11182 /// codegen purposes and has some flags to guide the codegen process. 11183 static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement, 11184 bool Swap, SDLoc &DL, SelectionDAG &DAG) { 11185 assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); 11186 11187 // Zero extend the operands to the largest legal integer. Originally, they 11188 // must be of a strictly smaller size. 11189 auto Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0), 11190 DAG.getConstant(Size, DL, MVT::i32)); 11191 auto Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1), 11192 DAG.getConstant(Size, DL, MVT::i32)); 11193 11194 // Swap if needed. Depends on the condition code. 11195 if (Swap) 11196 std::swap(Op0, Op1); 11197 11198 // Subtract extended integers. 
11199 auto SubNode = DAG.getNode(ISD::SUB, DL, MVT::i64, Op0, Op1); 11200 11201 // Move the sign bit to the least significant position and zero out the rest. 11202 // Now the least significant bit carries the result of original comparison. 11203 auto Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, SubNode, 11204 DAG.getConstant(Size - 1, DL, MVT::i32)); 11205 auto Final = Shifted; 11206 11207 // Complement the result if needed. Based on the condition code. 11208 if (Complement) 11209 Final = DAG.getNode(ISD::XOR, DL, MVT::i64, Shifted, 11210 DAG.getConstant(1, DL, MVT::i64)); 11211 11212 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final); 11213 } 11214 11215 SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N, 11216 DAGCombinerInfo &DCI) const { 11217 assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); 11218 11219 SelectionDAG &DAG = DCI.DAG; 11220 SDLoc DL(N); 11221 11222 // Size of integers being compared has a critical role in the following 11223 // analysis, so we prefer to do this when all types are legal. 
11224 if (!DCI.isAfterLegalizeDAG()) 11225 return SDValue(); 11226 11227 // If all users of SETCC extend its value to a legal integer type 11228 // then we replace SETCC with a subtraction 11229 for (SDNode::use_iterator UI = N->use_begin(), 11230 UE = N->use_end(); UI != UE; ++UI) { 11231 if (UI->getOpcode() != ISD::ZERO_EXTEND) 11232 return SDValue(); 11233 } 11234 11235 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 11236 auto OpSize = N->getOperand(0).getValueSizeInBits(); 11237 11238 unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits(); 11239 11240 if (OpSize < Size) { 11241 switch (CC) { 11242 default: break; 11243 case ISD::SETULT: 11244 return generateEquivalentSub(N, Size, false, false, DL, DAG); 11245 case ISD::SETULE: 11246 return generateEquivalentSub(N, Size, true, true, DL, DAG); 11247 case ISD::SETUGT: 11248 return generateEquivalentSub(N, Size, false, true, DL, DAG); 11249 case ISD::SETUGE: 11250 return generateEquivalentSub(N, Size, true, false, DL, DAG); 11251 } 11252 } 11253 11254 return SDValue(); 11255 } 11256 11257 SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, 11258 DAGCombinerInfo &DCI) const { 11259 SelectionDAG &DAG = DCI.DAG; 11260 SDLoc dl(N); 11261 11262 assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits"); 11263 // If we're tracking CR bits, we need to be careful that we don't have: 11264 // trunc(binary-ops(zext(x), zext(y))) 11265 // or 11266 // trunc(binary-ops(binary-ops(zext(x), zext(y)), ...) 11267 // such that we're unnecessarily moving things into GPRs when it would be 11268 // better to keep them in CR bits. 11269 11270 // Note that trunc here can be an actual i1 trunc, or can be the effective 11271 // truncation that comes from a setcc or select_cc. 
11272 if (N->getOpcode() == ISD::TRUNCATE && 11273 N->getValueType(0) != MVT::i1) 11274 return SDValue(); 11275 11276 if (N->getOperand(0).getValueType() != MVT::i32 && 11277 N->getOperand(0).getValueType() != MVT::i64) 11278 return SDValue(); 11279 11280 if (N->getOpcode() == ISD::SETCC || 11281 N->getOpcode() == ISD::SELECT_CC) { 11282 // If we're looking at a comparison, then we need to make sure that the 11283 // high bits (all except for the first) don't matter the result. 11284 ISD::CondCode CC = 11285 cast<CondCodeSDNode>(N->getOperand( 11286 N->getOpcode() == ISD::SETCC ? 2 : 4))->get(); 11287 unsigned OpBits = N->getOperand(0).getValueSizeInBits(); 11288 11289 if (ISD::isSignedIntSetCC(CC)) { 11290 if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits || 11291 DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits) 11292 return SDValue(); 11293 } else if (ISD::isUnsignedIntSetCC(CC)) { 11294 if (!DAG.MaskedValueIsZero(N->getOperand(0), 11295 APInt::getHighBitsSet(OpBits, OpBits-1)) || 11296 !DAG.MaskedValueIsZero(N->getOperand(1), 11297 APInt::getHighBitsSet(OpBits, OpBits-1))) 11298 return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI) 11299 : SDValue()); 11300 } else { 11301 // This is neither a signed nor an unsigned comparison, just make sure 11302 // that the high bits are equal. 11303 KnownBits Op1Known, Op2Known; 11304 DAG.computeKnownBits(N->getOperand(0), Op1Known); 11305 DAG.computeKnownBits(N->getOperand(1), Op2Known); 11306 11307 // We don't really care about what is known about the first bit (if 11308 // anything), so clear it in all masks prior to comparing them. 
11309 Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0); 11310 Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0); 11311 11312 if (Op1Known.Zero != Op2Known.Zero || Op1Known.One != Op2Known.One) 11313 return SDValue(); 11314 } 11315 } 11316 11317 // We now know that the higher-order bits are irrelevant, we just need to 11318 // make sure that all of the intermediate operations are bit operations, and 11319 // all inputs are extensions. 11320 if (N->getOperand(0).getOpcode() != ISD::AND && 11321 N->getOperand(0).getOpcode() != ISD::OR && 11322 N->getOperand(0).getOpcode() != ISD::XOR && 11323 N->getOperand(0).getOpcode() != ISD::SELECT && 11324 N->getOperand(0).getOpcode() != ISD::SELECT_CC && 11325 N->getOperand(0).getOpcode() != ISD::TRUNCATE && 11326 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND && 11327 N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && 11328 N->getOperand(0).getOpcode() != ISD::ANY_EXTEND) 11329 return SDValue(); 11330 11331 if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) && 11332 N->getOperand(1).getOpcode() != ISD::AND && 11333 N->getOperand(1).getOpcode() != ISD::OR && 11334 N->getOperand(1).getOpcode() != ISD::XOR && 11335 N->getOperand(1).getOpcode() != ISD::SELECT && 11336 N->getOperand(1).getOpcode() != ISD::SELECT_CC && 11337 N->getOperand(1).getOpcode() != ISD::TRUNCATE && 11338 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND && 11339 N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND && 11340 N->getOperand(1).getOpcode() != ISD::ANY_EXTEND) 11341 return SDValue(); 11342 11343 SmallVector<SDValue, 4> Inputs; 11344 SmallVector<SDValue, 8> BinOps, PromOps; 11345 SmallPtrSet<SDNode *, 16> Visited; 11346 11347 for (unsigned i = 0; i < 2; ++i) { 11348 if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND || 11349 N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND || 11350 N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) && 11351 N->getOperand(i).getOperand(0).getValueType() == MVT::i1) || 11352 
isa<ConstantSDNode>(N->getOperand(i))) 11353 Inputs.push_back(N->getOperand(i)); 11354 else 11355 BinOps.push_back(N->getOperand(i)); 11356 11357 if (N->getOpcode() == ISD::TRUNCATE) 11358 break; 11359 } 11360 11361 // Visit all inputs, collect all binary operations (and, or, xor and 11362 // select) that are all fed by extensions. 11363 while (!BinOps.empty()) { 11364 SDValue BinOp = BinOps.back(); 11365 BinOps.pop_back(); 11366 11367 if (!Visited.insert(BinOp.getNode()).second) 11368 continue; 11369 11370 PromOps.push_back(BinOp); 11371 11372 for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { 11373 // The condition of the select is not promoted. 11374 if (BinOp.getOpcode() == ISD::SELECT && i == 0) 11375 continue; 11376 if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) 11377 continue; 11378 11379 if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || 11380 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || 11381 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) && 11382 BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) || 11383 isa<ConstantSDNode>(BinOp.getOperand(i))) { 11384 Inputs.push_back(BinOp.getOperand(i)); 11385 } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || 11386 BinOp.getOperand(i).getOpcode() == ISD::OR || 11387 BinOp.getOperand(i).getOpcode() == ISD::XOR || 11388 BinOp.getOperand(i).getOpcode() == ISD::SELECT || 11389 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC || 11390 BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || 11391 BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || 11392 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || 11393 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) { 11394 BinOps.push_back(BinOp.getOperand(i)); 11395 } else { 11396 // We have an input that is not an extension or another binary 11397 // operation; we'll abort this transformation. 
11398 return SDValue(); 11399 } 11400 } 11401 } 11402 11403 // Make sure that this is a self-contained cluster of operations (which 11404 // is not quite the same thing as saying that everything has only one 11405 // use). 11406 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 11407 if (isa<ConstantSDNode>(Inputs[i])) 11408 continue; 11409 11410 for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), 11411 UE = Inputs[i].getNode()->use_end(); 11412 UI != UE; ++UI) { 11413 SDNode *User = *UI; 11414 if (User != N && !Visited.count(User)) 11415 return SDValue(); 11416 11417 // Make sure that we're not going to promote the non-output-value 11418 // operand(s) or SELECT or SELECT_CC. 11419 // FIXME: Although we could sometimes handle this, and it does occur in 11420 // practice that one of the condition inputs to the select is also one of 11421 // the outputs, we currently can't deal with this. 11422 if (User->getOpcode() == ISD::SELECT) { 11423 if (User->getOperand(0) == Inputs[i]) 11424 return SDValue(); 11425 } else if (User->getOpcode() == ISD::SELECT_CC) { 11426 if (User->getOperand(0) == Inputs[i] || 11427 User->getOperand(1) == Inputs[i]) 11428 return SDValue(); 11429 } 11430 } 11431 } 11432 11433 for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { 11434 for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), 11435 UE = PromOps[i].getNode()->use_end(); 11436 UI != UE; ++UI) { 11437 SDNode *User = *UI; 11438 if (User != N && !Visited.count(User)) 11439 return SDValue(); 11440 11441 // Make sure that we're not going to promote the non-output-value 11442 // operand(s) or SELECT or SELECT_CC. 11443 // FIXME: Although we could sometimes handle this, and it does occur in 11444 // practice that one of the condition inputs to the select is also one of 11445 // the outputs, we currently can't deal with this. 
11446 if (User->getOpcode() == ISD::SELECT) { 11447 if (User->getOperand(0) == PromOps[i]) 11448 return SDValue(); 11449 } else if (User->getOpcode() == ISD::SELECT_CC) { 11450 if (User->getOperand(0) == PromOps[i] || 11451 User->getOperand(1) == PromOps[i]) 11452 return SDValue(); 11453 } 11454 } 11455 } 11456 11457 // Replace all inputs with the extension operand. 11458 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 11459 // Constants may have users outside the cluster of to-be-promoted nodes, 11460 // and so we need to replace those as we do the promotions. 11461 if (isa<ConstantSDNode>(Inputs[i])) 11462 continue; 11463 else 11464 DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); 11465 } 11466 11467 std::list<HandleSDNode> PromOpHandles; 11468 for (auto &PromOp : PromOps) 11469 PromOpHandles.emplace_back(PromOp); 11470 11471 // Replace all operations (these are all the same, but have a different 11472 // (i1) return type). DAG.getNode will validate that the types of 11473 // a binary operator match, so go through the list in reverse so that 11474 // we've likely promoted both operands first. Any intermediate truncations or 11475 // extensions disappear. 11476 while (!PromOpHandles.empty()) { 11477 SDValue PromOp = PromOpHandles.back().getValue(); 11478 PromOpHandles.pop_back(); 11479 11480 if (PromOp.getOpcode() == ISD::TRUNCATE || 11481 PromOp.getOpcode() == ISD::SIGN_EXTEND || 11482 PromOp.getOpcode() == ISD::ZERO_EXTEND || 11483 PromOp.getOpcode() == ISD::ANY_EXTEND) { 11484 if (!isa<ConstantSDNode>(PromOp.getOperand(0)) && 11485 PromOp.getOperand(0).getValueType() != MVT::i1) { 11486 // The operand is not yet ready (see comment below). 
11487 PromOpHandles.emplace_front(PromOp); 11488 continue; 11489 } 11490 11491 SDValue RepValue = PromOp.getOperand(0); 11492 if (isa<ConstantSDNode>(RepValue)) 11493 RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue); 11494 11495 DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue); 11496 continue; 11497 } 11498 11499 unsigned C; 11500 switch (PromOp.getOpcode()) { 11501 default: C = 0; break; 11502 case ISD::SELECT: C = 1; break; 11503 case ISD::SELECT_CC: C = 2; break; 11504 } 11505 11506 if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) && 11507 PromOp.getOperand(C).getValueType() != MVT::i1) || 11508 (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) && 11509 PromOp.getOperand(C+1).getValueType() != MVT::i1)) { 11510 // The to-be-promoted operands of this node have not yet been 11511 // promoted (this should be rare because we're going through the 11512 // list backward, but if one of the operands has several users in 11513 // this cluster of to-be-promoted nodes, it is possible). 11514 PromOpHandles.emplace_front(PromOp); 11515 continue; 11516 } 11517 11518 SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(), 11519 PromOp.getNode()->op_end()); 11520 11521 // If there are any constant inputs, make sure they're replaced now. 11522 for (unsigned i = 0; i < 2; ++i) 11523 if (isa<ConstantSDNode>(Ops[C+i])) 11524 Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]); 11525 11526 DAG.ReplaceAllUsesOfValueWith(PromOp, 11527 DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops)); 11528 } 11529 11530 // Now we're left with the initial truncation itself. 11531 if (N->getOpcode() == ISD::TRUNCATE) 11532 return N->getOperand(0); 11533 11534 // Otherwise, this is a comparison. The operands to be compared have just 11535 // changed type (to i1), but everything else is the same. 
  return SDValue(N, 0);
}

// Promote a cluster of i1 (CR-bit) or i32 operations that feed an extension
// so that the whole computation happens in the wider GPR type, eliminating
// the trunc/ext traffic between GPRs and condition registers.
SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);

  // If we're tracking CR bits, we need to be careful that we don't have:
  //   zext(binary-ops(trunc(x), trunc(y)))
  // or
  //   zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
  // such that we're unnecessarily moving things into CR bits that can more
  // efficiently stay in GPRs. Note that if we're not certain that the high
  // bits are set as required by the final extension, we still may need to do
  // some masking to get the proper behavior.

  // This same functionality is important on PPC64 when dealing with
  // 32-to-64-bit extensions; these occur often when 32-bit values are used as
  // the return values of functions. Because it is so similar, it is handled
  // here as well.

  // Only i32/i64 extension results are interesting.
  if (N->getValueType(0) != MVT::i32 &&
      N->getValueType(0) != MVT::i64)
    return SDValue();

  // Either i1 sources with CR-bit tracking enabled, or i32 sources on PPC64.
  if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) ||
        (N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64())))
    return SDValue();

  if (N->getOperand(0).getOpcode() != ISD::AND &&
      N->getOperand(0).getOpcode() != ISD::OR &&
      N->getOperand(0).getOpcode() != ISD::XOR &&
      N->getOperand(0).getOpcode() != ISD::SELECT &&
      N->getOperand(0).getOpcode() != ISD::SELECT_CC)
    return SDValue();

  SmallVector<SDValue, 4> Inputs;
  SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps;
  SmallPtrSet<SDNode *, 16> Visited;

  // Visit all inputs, collect all binary operations (and, or, xor and
  // select) that are all fed by truncations.
  while (!BinOps.empty()) {
    SDValue BinOp = BinOps.back();
    BinOps.pop_back();

    if (!Visited.insert(BinOp.getNode()).second)
      continue;

    PromOps.push_back(BinOp);

    for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
      // The condition of the select is not promoted.
      if (BinOp.getOpcode() == ISD::SELECT && i == 0)
        continue;
      if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
        continue;

      if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
          isa<ConstantSDNode>(BinOp.getOperand(i))) {
        Inputs.push_back(BinOp.getOperand(i));
      } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
                 BinOp.getOperand(i).getOpcode() == ISD::OR ||
                 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
                 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
                 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {
        BinOps.push_back(BinOp.getOperand(i));
      } else {
        // We have an input that is not a truncation or another binary
        // operation; we'll abort this transformation.
        return SDValue();
      }
    }
  }

  // The operands of a select that must be truncated when the select is
  // promoted because the operand is actually part of the to-be-promoted set.
  DenseMap<SDNode *, EVT> SelectTruncOp[2];

  // Make sure that this is a self-contained cluster of operations (which
  // is not quite the same thing as saying that everything has only one
  // use).
  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
    if (isa<ConstantSDNode>(Inputs[i]))
      continue;

    for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
                              UE = Inputs[i].getNode()->use_end();
         UI != UE; ++UI) {
      SDNode *User = *UI;
      // Any use outside the collected cluster makes the rewrite unsafe.
      if (User != N && !Visited.count(User))
        return SDValue();

      // If we're going to promote the non-output-value operand(s) or SELECT or
      // SELECT_CC, record them for truncation.
      if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(0) == Inputs[i])
          SelectTruncOp[0].insert(std::make_pair(User,
                                    User->getOperand(0).getValueType()));
      } else if (User->getOpcode() == ISD::SELECT_CC) {
        if (User->getOperand(0) == Inputs[i])
          SelectTruncOp[0].insert(std::make_pair(User,
                                    User->getOperand(0).getValueType()));
        if (User->getOperand(1) == Inputs[i])
          SelectTruncOp[1].insert(std::make_pair(User,
                                    User->getOperand(1).getValueType()));
      }
    }
  }

  // Repeat the self-containment/recording check for the to-be-promoted
  // operations themselves.
  for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
    for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
                              UE = PromOps[i].getNode()->use_end();
         UI != UE; ++UI) {
      SDNode *User = *UI;
      if (User != N && !Visited.count(User))
        return SDValue();

      // If we're going to promote the non-output-value operand(s) or SELECT or
      // SELECT_CC, record them for truncation.
      if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(0) == PromOps[i])
          SelectTruncOp[0].insert(std::make_pair(User,
                                    User->getOperand(0).getValueType()));
      } else if (User->getOpcode() == ISD::SELECT_CC) {
        if (User->getOperand(0) == PromOps[i])
          SelectTruncOp[0].insert(std::make_pair(User,
                                    User->getOperand(0).getValueType()));
        if (User->getOperand(1) == PromOps[i])
          SelectTruncOp[1].insert(std::make_pair(User,
                                    User->getOperand(1).getValueType()));
      }
    }
  }

  unsigned PromBits = N->getOperand(0).getValueSizeInBits();
  bool ReallyNeedsExt = false;
  if (N->getOpcode() != ISD::ANY_EXTEND) {
    // If all of the inputs are not already sign/zero extended, then
    // we'll still need to do that at the end.
    for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
      if (isa<ConstantSDNode>(Inputs[i]))
        continue;

      unsigned OpBits =
        Inputs[i].getOperand(0).getValueSizeInBits();
      assert(PromBits < OpBits && "Truncation not to a smaller bit count?");

      if ((N->getOpcode() == ISD::ZERO_EXTEND &&
           !DAG.MaskedValueIsZero(Inputs[i].getOperand(0),
                                  APInt::getHighBitsSet(OpBits,
                                                        OpBits-PromBits))) ||
          (N->getOpcode() == ISD::SIGN_EXTEND &&
           DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) <
             (OpBits-(PromBits-1)))) {
        ReallyNeedsExt = true;
        break;
      }
    }
  }

  // Replace all inputs, either with the truncation operand, or a
  // truncation or extension to the final output type.
  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
    // Constant inputs need to be replaced with the to-be-promoted nodes that
    // use them because they might have users outside of the cluster of
    // promoted nodes.
    if (isa<ConstantSDNode>(Inputs[i]))
      continue;

    SDValue InSrc = Inputs[i].getOperand(0);
    if (Inputs[i].getValueType() == N->getValueType(0))
      DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc);
    else if (N->getOpcode() == ISD::SIGN_EXTEND)
      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
        DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0)));
    else if (N->getOpcode() == ISD::ZERO_EXTEND)
      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
        DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0)));
    else
      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
        DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0)));
  }

  // Hold the promoted operations in handles so that RAUW during this loop
  // cannot leave us with dangling SDValues.
  std::list<HandleSDNode> PromOpHandles;
  for (auto &PromOp : PromOps)
    PromOpHandles.emplace_back(PromOp);

  // Replace all operations (these are all the same, but have a different
  // (promoted) return type). DAG.getNode will validate that the types of
  // a binary operator match, so go through the list in reverse so that
  // we've likely promoted both operands first.
  while (!PromOpHandles.empty()) {
    SDValue PromOp = PromOpHandles.back().getValue();
    PromOpHandles.pop_back();

    // C is the index of the first to-be-promoted value operand (selects
    // carry condition operands that are not promoted).
    unsigned C;
    switch (PromOp.getOpcode()) {
    default:             C = 0; break;
    case ISD::SELECT:    C = 1; break;
    case ISD::SELECT_CC: C = 2; break;
    }

    if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
         PromOp.getOperand(C).getValueType() != N->getValueType(0)) ||
        (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
         PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) {
      // The to-be-promoted operands of this node have not yet been
      // promoted (this should be rare because we're going through the
      // list backward, but if one of the operands has several users in
      // this cluster of to-be-promoted nodes, it is possible).
      PromOpHandles.emplace_front(PromOp);
      continue;
    }

    // For SELECT and SELECT_CC nodes, we do a similar check for any
    // to-be-promoted comparison inputs.
    if (PromOp.getOpcode() == ISD::SELECT ||
        PromOp.getOpcode() == ISD::SELECT_CC) {
      if ((SelectTruncOp[0].count(PromOp.getNode()) &&
           PromOp.getOperand(0).getValueType() != N->getValueType(0)) ||
          (SelectTruncOp[1].count(PromOp.getNode()) &&
           PromOp.getOperand(1).getValueType() != N->getValueType(0))) {
        PromOpHandles.emplace_front(PromOp);
        continue;
      }
    }

    SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
                                PromOp.getNode()->op_end());

    // If this node has constant inputs, then they'll need to be promoted here.
    for (unsigned i = 0; i < 2; ++i) {
      if (!isa<ConstantSDNode>(Ops[C+i]))
        continue;
      if (Ops[C+i].getValueType() == N->getValueType(0))
        continue;

      if (N->getOpcode() == ISD::SIGN_EXTEND)
        Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
      else if (N->getOpcode() == ISD::ZERO_EXTEND)
        Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
      else
        Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
    }

    // If we've promoted the comparison inputs of a SELECT or SELECT_CC,
    // truncate them again to the original value type.
    if (PromOp.getOpcode() == ISD::SELECT ||
        PromOp.getOpcode() == ISD::SELECT_CC) {
      auto SI0 = SelectTruncOp[0].find(PromOp.getNode());
      if (SI0 != SelectTruncOp[0].end())
        Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);
      auto SI1 = SelectTruncOp[1].find(PromOp.getNode());
      if (SI1 != SelectTruncOp[1].end())
        Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);
    }

    DAG.ReplaceAllUsesOfValueWith(PromOp,
      DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
  }

  // Now we're left with the initial extension itself.
  if (!ReallyNeedsExt)
    return N->getOperand(0);

  // To zero extend, just mask off everything except for the first bit (in the
  // i1 case).
  if (N->getOpcode() == ISD::ZERO_EXTEND)
    return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0),
                       DAG.getConstant(APInt::getLowBitsSet(
                                         N->getValueSizeInBits(0), PromBits),
                                       dl, N->getValueType(0)));

  assert(N->getOpcode() == ISD::SIGN_EXTEND &&
         "Invalid extension type");
  // Sign extend via shl/sra pair.
  EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout());
  SDValue ShiftCst =
    DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);
  return DAG.getNode(
      ISD::SRA, dl, N->getValueType(0),
      DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst),
      ShiftCst);
}

// Canonicalize eq/ne comparisons against a negation, then defer to the
// bool-extension combine above.
SDValue PPCTargetLowering::combineSetCC(SDNode *N,
                                        DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::SETCC &&
         "Should be called with a SETCC node");

  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);

    // If there is a '0 - y' pattern, canonicalize the pattern to the RHS.
    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
        LHS.hasOneUse())
      std::swap(LHS, RHS);

    // x == 0-y --> x+y == 0
    // x != 0-y --> x+y != 0
    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
        RHS.hasOneUse()) {
      SDLoc DL(N);
      SelectionDAG &DAG = DCI.DAG;
      EVT VT = N->getValueType(0);
      EVT OpVT = LHS.getValueType();
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }
  }

  return DAGCombineTruncBoolExt(N, DCI);
}

// Is this an extending load from an f32 to an f64?
static bool isFPExtLoad(SDValue Op) {
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()))
    return LD->getExtensionType() == ISD::EXTLOAD &&
           Op.getValueType() == MVT::f64;
  return false;
}

/// Reduces the number of fp-to-int conversion when building a vector.
///
/// If this vector is built out of floating to integer conversions,
/// transform it to a vector built out of floating point values followed by a
/// single floating to integer conversion of the vector.
/// Namely  (build_vector (fptosi $A), (fptosi $B), ...)
/// becomes (fptosi (build_vector ($A, $B, ...)))
SDValue PPCTargetLowering::
combineElementTruncationToVectorTruncation(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
         "Should be called with a BUILD_VECTOR node");

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);

  SDValue FirstInput = N->getOperand(0);
  assert(FirstInput.getOpcode() == PPCISD::MFVSR &&
         "The input operand must be an fp-to-int conversion.");

  // This combine happens after legalization so the fp_to_[su]i nodes are
  // already converted to PPCISD nodes.
  unsigned FirstConversion = FirstInput.getOperand(0).getOpcode();
  if (FirstConversion == PPCISD::FCTIDZ ||
      FirstConversion == PPCISD::FCTIDUZ ||
      FirstConversion == PPCISD::FCTIWZ ||
      FirstConversion == PPCISD::FCTIWUZ) {
    bool IsSplat = true;
    bool Is32Bit = FirstConversion == PPCISD::FCTIWZ ||
      FirstConversion == PPCISD::FCTIWUZ;
    EVT SrcVT = FirstInput.getOperand(0).getValueType();
    SmallVector<SDValue, 4> Ops;
    EVT TargetVT = N->getValueType(0);
    // Verify that every element is an MFVSR of the same conversion kind.
    for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
      SDValue NextOp = N->getOperand(i);
      if (NextOp.getOpcode() != PPCISD::MFVSR)
        return SDValue();
      unsigned NextConversion = NextOp.getOperand(0).getOpcode();
      if (NextConversion != FirstConversion)
        return SDValue();
      // If we are converting to 32-bit integers, we need to add an FP_ROUND.
      // This is not valid if the input was originally double precision. It is
      // also not profitable to do unless this is an extending load in which
      // case doing this combine will allow us to combine consecutive loads.
      if (Is32Bit && !isFPExtLoad(NextOp.getOperand(0).getOperand(0)))
        return SDValue();
      if (N->getOperand(i) != FirstInput)
        IsSplat = false;
    }

    // If this is a splat, we leave it as-is since there will be only a single
    // fp-to-int conversion followed by a splat of the integer. This is better
    // for 32-bit and smaller ints and neutral for 64-bit ints.
    if (IsSplat)
      return SDValue();

    // Now that we know we have the right type of node, get its operands
    for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
      SDValue In = N->getOperand(i).getOperand(0);
      if (Is32Bit) {
        // For 32-bit values, we need to add an FP_ROUND node (if we made it
        // here, we know that all inputs are extending loads so this is safe).
        if (In.isUndef())
          Ops.push_back(DAG.getUNDEF(SrcVT));
        else {
          SDValue Trunc = DAG.getNode(ISD::FP_ROUND, dl,
                                      MVT::f32, In.getOperand(0),
                                      DAG.getIntPtrConstant(1, dl));
          Ops.push_back(Trunc);
        }
      } else
        Ops.push_back(In.isUndef() ? DAG.getUNDEF(SrcVT) : In.getOperand(0));
    }

    // Pick the vector conversion opcode matching the scalar conversions.
    unsigned Opcode;
    if (FirstConversion == PPCISD::FCTIDZ ||
        FirstConversion == PPCISD::FCTIWZ)
      Opcode = ISD::FP_TO_SINT;
    else
      Opcode = ISD::FP_TO_UINT;

    EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32;
    SDValue BV = DAG.getBuildVector(NewVT, dl, Ops);
    return DAG.getNode(Opcode, dl, TargetVT, BV);
  }
  return SDValue();
}

/// Reduce the number of loads when building a vector.
///
/// Building a vector out of multiple loads can be converted to a load
/// of the vector type if the loads are consecutive. If the loads are
/// consecutive but in descending order, a shuffle is added at the end
/// to reorder the vector.
static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
         "Should be called with a BUILD_VECTOR node");

  SDLoc dl(N);
  bool InputsAreConsecutiveLoads = true;
  bool InputsAreReverseConsecutive = true;
  unsigned ElemSize = N->getValueType(0).getScalarSizeInBits() / 8;
  SDValue FirstInput = N->getOperand(0);
  bool IsRoundOfExtLoad = false;

  if (FirstInput.getOpcode() == ISD::FP_ROUND &&
      FirstInput.getOperand(0).getOpcode() == ISD::LOAD) {
    LoadSDNode *LD = dyn_cast<LoadSDNode>(FirstInput.getOperand(0));
    IsRoundOfExtLoad = LD->getExtensionType() == ISD::EXTLOAD;
  }
  // Not a build vector of (possibly fp_rounded) loads.
  if ((!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) ||
      N->getNumOperands() == 1)
    return SDValue();

  // Walk adjacent element pairs and classify the whole set as consecutive,
  // reverse consecutive, or neither.
  for (int i = 1, e = N->getNumOperands(); i < e; ++i) {
    // If any inputs are fp_round(extload), they all must be.
    if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND)
      return SDValue();

    SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(i).getOperand(0) :
      N->getOperand(i);
    if (NextInput.getOpcode() != ISD::LOAD)
      return SDValue();

    SDValue PreviousInput =
      IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1);
    LoadSDNode *LD1 = dyn_cast<LoadSDNode>(PreviousInput);
    LoadSDNode *LD2 = dyn_cast<LoadSDNode>(NextInput);

    // If any inputs are fp_round(extload), they all must be.
    if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD)
      return SDValue();

    if (!isConsecutiveLS(LD2, LD1, ElemSize, 1, DAG))
      InputsAreConsecutiveLoads = false;
    if (!isConsecutiveLS(LD1, LD2, ElemSize, 1, DAG))
      InputsAreReverseConsecutive = false;

    // Exit early if the loads are neither consecutive nor reverse consecutive.
    if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive)
      return SDValue();
  }

  assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) &&
         "The loads cannot be both consecutive and reverse consecutive.");

  SDValue FirstLoadOp =
    IsRoundOfExtLoad ? FirstInput.getOperand(0) : FirstInput;
  SDValue LastLoadOp =
    IsRoundOfExtLoad ? N->getOperand(N->getNumOperands()-1).getOperand(0) :
                       N->getOperand(N->getNumOperands()-1);

  LoadSDNode *LD1 = dyn_cast<LoadSDNode>(FirstLoadOp);
  LoadSDNode *LDL = dyn_cast<LoadSDNode>(LastLoadOp);
  if (InputsAreConsecutiveLoads) {
    assert(LD1 && "Input needs to be a LoadSDNode.");
    // Ascending order: a single wide load starting at the first element.
    return DAG.getLoad(N->getValueType(0), dl, LD1->getChain(),
                       LD1->getBasePtr(), LD1->getPointerInfo(),
                       LD1->getAlignment());
  }
  if (InputsAreReverseConsecutive) {
    assert(LDL && "Input needs to be a LoadSDNode.");
    // Descending order: load from the last element's address and reverse
    // the lanes with a shuffle.
    SDValue Load = DAG.getLoad(N->getValueType(0), dl, LDL->getChain(),
                               LDL->getBasePtr(), LDL->getPointerInfo(),
                               LDL->getAlignment());
    SmallVector<int, 16> Ops;
    for (int i = N->getNumOperands() - 1; i >= 0; i--)
      Ops.push_back(i);

    return DAG.getVectorShuffle(N->getValueType(0), dl, Load,
                                DAG.getUNDEF(N->getValueType(0)), Ops);
  }
  return SDValue();
}

// This function adds the required vector_shuffle needed to get
// the elements of the vector extract in the correct position
// as specified by the CorrectElems encoding.
static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
                                      SDValue Input, uint64_t Elems,
                                      uint64_t CorrectElems) {
  SDLoc dl(N);

  unsigned NumElems = Input.getValueType().getVectorNumElements();
  SmallVector<int, 16> ShuffleMask(NumElems, -1);

  // Knowing the element indices being extracted from the original
  // vector and the order in which they're being inserted, just put
  // them at element indices required for the instruction.
  // Each byte of Elems/CorrectElems encodes one element: LE indices in the
  // low nibble, BE indices in the high nibble (see combineBVOfVecSExt).
  for (unsigned i = 0; i < N->getNumOperands(); i++) {
    if (DAG.getDataLayout().isLittleEndian())
      ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;
    else
      ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;
    CorrectElems = CorrectElems >> 8;
    Elems = Elems >> 8;
  }

  SDValue Shuffle =
    DAG.getVectorShuffle(Input.getValueType(), dl, Input,
                         DAG.getUNDEF(Input.getValueType()), ShuffleMask);

  EVT Ty = N->getValueType(0);
  SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle);
  return BV;
}

// Look for build vector patterns where input operands come from sign
// extended vector_extract elements of specific indices. If the correct indices
// aren't used, add a vector shuffle to fix up the indices and create a new
// PPCISD:SExtVElems node which selects the vector sign extend instructions
// during instruction selection.
static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
  // This array encodes the indices that the vector sign extend instructions
  // extract from when extending from one type to another for both BE and LE.
  // The right nibble of each byte corresponds to the LE indices,
  // and the left nibble of each byte corresponds to the BE indices.
  // For example: 0x3074B8FC  byte->word
  // For LE: the allowed indices are: 0x0,0x4,0x8,0xC
  // For BE: the allowed indices are: 0x3,0x7,0xB,0xF
  // For example: 0x000070F8  byte->double word
  // For LE: the allowed indices are: 0x0,0x8
  // For BE: the allowed indices are: 0x7,0xF
  uint64_t TargetElems[] = {
    0x3074B8FC, // b->w
    0x000070F8, // b->d
    0x10325476, // h->w
    0x00003074, // h->d
    0x00001032, // w->d
  };

  uint64_t Elems = 0;
  int Index;
  SDValue Input;

  // Returns true if Op is a sign-extended extract_vector_elt with a constant
  // index; records the common source vector in Input and accumulates the
  // per-element index encoding into Elems (one byte per element).
  auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
    if (!Op)
      return false;
    if (Op.getOpcode() != ISD::SIGN_EXTEND &&
        Op.getOpcode() != ISD::SIGN_EXTEND_INREG)
      return false;

    // A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value
    // of the right width.
    SDValue Extract = Op.getOperand(0);
    if (Extract.getOpcode() == ISD::ANY_EXTEND)
      Extract = Extract.getOperand(0);
    if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return false;

    ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
    if (!ExtOp)
      return false;

    Index = ExtOp->getZExtValue();
    if (Input && Input != Extract.getOperand(0))
      return false;

    if (!Input)
      Input = Extract.getOperand(0);

    Elems = Elems << 8;
    // BE indices live in the high nibble of each byte, LE in the low nibble.
    Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;
    Elems |= Index;

    return true;
  };

  // If the build vector operands aren't sign extended vector extracts,
  // of the same input vector, then return.
  for (unsigned i = 0; i < N->getNumOperands(); i++) {
    if (!isSExtOfVecExtract(N->getOperand(i))) {
      return SDValue();
    }
  }

  // If the vector extract indices are not correct, add the appropriate
  // vector_shuffle.
12140 int TgtElemArrayIdx; 12141 int InputSize = Input.getValueType().getScalarSizeInBits(); 12142 int OutputSize = N->getValueType(0).getScalarSizeInBits(); 12143 if (InputSize + OutputSize == 40) 12144 TgtElemArrayIdx = 0; 12145 else if (InputSize + OutputSize == 72) 12146 TgtElemArrayIdx = 1; 12147 else if (InputSize + OutputSize == 48) 12148 TgtElemArrayIdx = 2; 12149 else if (InputSize + OutputSize == 80) 12150 TgtElemArrayIdx = 3; 12151 else if (InputSize + OutputSize == 96) 12152 TgtElemArrayIdx = 4; 12153 else 12154 return SDValue(); 12155 12156 uint64_t CorrectElems = TargetElems[TgtElemArrayIdx]; 12157 CorrectElems = DAG.getDataLayout().isLittleEndian() 12158 ? CorrectElems & 0x0F0F0F0F0F0F0F0F 12159 : CorrectElems & 0xF0F0F0F0F0F0F0F0; 12160 if (Elems != CorrectElems) { 12161 return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems); 12162 } 12163 12164 // Regular lowering will catch cases where a shuffle is not needed. 12165 return SDValue(); 12166 } 12167 12168 SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, 12169 DAGCombinerInfo &DCI) const { 12170 assert(N->getOpcode() == ISD::BUILD_VECTOR && 12171 "Should be called with a BUILD_VECTOR node"); 12172 12173 SelectionDAG &DAG = DCI.DAG; 12174 SDLoc dl(N); 12175 12176 if (!Subtarget.hasVSX()) 12177 return SDValue(); 12178 12179 // The target independent DAG combiner will leave a build_vector of 12180 // float-to-int conversions intact. We can generate MUCH better code for 12181 // a float-to-int conversion of a vector of floats. 12182 SDValue FirstInput = N->getOperand(0); 12183 if (FirstInput.getOpcode() == PPCISD::MFVSR) { 12184 SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI); 12185 if (Reduced) 12186 return Reduced; 12187 } 12188 12189 // If we're building a vector out of consecutive loads, just load that 12190 // vector type. 
12191 SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG); 12192 if (Reduced) 12193 return Reduced; 12194 12195 // If we're building a vector out of extended elements from another vector 12196 // we have P9 vector integer extend instructions. The code assumes legal 12197 // input types (i.e. it can't handle things like v4i16) so do not run before 12198 // legalization. 12199 if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize()) { 12200 Reduced = combineBVOfVecSExt(N, DAG); 12201 if (Reduced) 12202 return Reduced; 12203 } 12204 12205 12206 if (N->getValueType(0) != MVT::v2f64) 12207 return SDValue(); 12208 12209 // Looking for: 12210 // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1)) 12211 if (FirstInput.getOpcode() != ISD::SINT_TO_FP && 12212 FirstInput.getOpcode() != ISD::UINT_TO_FP) 12213 return SDValue(); 12214 if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP && 12215 N->getOperand(1).getOpcode() != ISD::UINT_TO_FP) 12216 return SDValue(); 12217 if (FirstInput.getOpcode() != N->getOperand(1).getOpcode()) 12218 return SDValue(); 12219 12220 SDValue Ext1 = FirstInput.getOperand(0); 12221 SDValue Ext2 = N->getOperand(1).getOperand(0); 12222 if(Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 12223 Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 12224 return SDValue(); 12225 12226 ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1)); 12227 ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1)); 12228 if (!Ext1Op || !Ext2Op) 12229 return SDValue(); 12230 if (Ext1.getValueType() != MVT::i32 || 12231 Ext2.getValueType() != MVT::i32) 12232 if (Ext1.getOperand(0) != Ext2.getOperand(0)) 12233 return SDValue(); 12234 12235 int FirstElem = Ext1Op->getZExtValue(); 12236 int SecondElem = Ext2Op->getZExtValue(); 12237 int SubvecIdx; 12238 if (FirstElem == 0 && SecondElem == 1) 12239 SubvecIdx = Subtarget.isLittleEndian() ? 
1 : 0; 12240 else if (FirstElem == 2 && SecondElem == 3) 12241 SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1; 12242 else 12243 return SDValue(); 12244 12245 SDValue SrcVec = Ext1.getOperand(0); 12246 auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ? 12247 PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP; 12248 return DAG.getNode(NodeType, dl, MVT::v2f64, 12249 SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl)); 12250 } 12251 12252 SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, 12253 DAGCombinerInfo &DCI) const { 12254 assert((N->getOpcode() == ISD::SINT_TO_FP || 12255 N->getOpcode() == ISD::UINT_TO_FP) && 12256 "Need an int -> FP conversion node here"); 12257 12258 if (useSoftFloat() || !Subtarget.has64BitSupport()) 12259 return SDValue(); 12260 12261 SelectionDAG &DAG = DCI.DAG; 12262 SDLoc dl(N); 12263 SDValue Op(N, 0); 12264 12265 // Don't handle ppc_fp128 here or conversions that are out-of-range capable 12266 // from the hardware. 12267 if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) 12268 return SDValue(); 12269 if (Op.getOperand(0).getValueType().getSimpleVT() <= MVT(MVT::i1) || 12270 Op.getOperand(0).getValueType().getSimpleVT() > MVT(MVT::i64)) 12271 return SDValue(); 12272 12273 SDValue FirstOperand(Op.getOperand(0)); 12274 bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD && 12275 (FirstOperand.getValueType() == MVT::i8 || 12276 FirstOperand.getValueType() == MVT::i16); 12277 if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) { 12278 bool Signed = N->getOpcode() == ISD::SINT_TO_FP; 12279 bool DstDouble = Op.getValueType() == MVT::f64; 12280 unsigned ConvOp = Signed ? 12281 (DstDouble ? PPCISD::FCFID : PPCISD::FCFIDS) : 12282 (DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS); 12283 SDValue WidthConst = 12284 DAG.getIntPtrConstant(FirstOperand.getValueType() == MVT::i8 ? 
1 : 2, 12285 dl, false); 12286 LoadSDNode *LDN = cast<LoadSDNode>(FirstOperand.getNode()); 12287 SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst }; 12288 SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl, 12289 DAG.getVTList(MVT::f64, MVT::Other), 12290 Ops, MVT::i8, LDN->getMemOperand()); 12291 12292 // For signed conversion, we need to sign-extend the value in the VSR 12293 if (Signed) { 12294 SDValue ExtOps[] = { Ld, WidthConst }; 12295 SDValue Ext = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps); 12296 return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ext); 12297 } else 12298 return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld); 12299 } 12300 12301 12302 // For i32 intermediate values, unfortunately, the conversion functions 12303 // leave the upper 32 bits of the value are undefined. Within the set of 12304 // scalar instructions, we have no method for zero- or sign-extending the 12305 // value. Thus, we cannot handle i32 intermediate values here. 12306 if (Op.getOperand(0).getValueType() == MVT::i32) 12307 return SDValue(); 12308 12309 assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && 12310 "UINT_TO_FP is supported only with FPCVT"); 12311 12312 // If we have FCFIDS, then use it when converting to single-precision. 12313 // Otherwise, convert to double-precision and then round. 12314 unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 12315 ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS 12316 : PPCISD::FCFIDS) 12317 : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU 12318 : PPCISD::FCFID); 12319 MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 12320 ? MVT::f32 12321 : MVT::f64; 12322 12323 // If we're converting from a float, to an int, and back to a float again, 12324 // then we don't need the store/load pair at all. 
12325 if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT && 12326 Subtarget.hasFPCVT()) || 12327 (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) { 12328 SDValue Src = Op.getOperand(0).getOperand(0); 12329 if (Src.getValueType() == MVT::f32) { 12330 Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); 12331 DCI.AddToWorklist(Src.getNode()); 12332 } else if (Src.getValueType() != MVT::f64) { 12333 // Make sure that we don't pick up a ppc_fp128 source value. 12334 return SDValue(); 12335 } 12336 12337 unsigned FCTOp = 12338 Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ : 12339 PPCISD::FCTIDUZ; 12340 12341 SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src); 12342 SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp); 12343 12344 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) { 12345 FP = DAG.getNode(ISD::FP_ROUND, dl, 12346 MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); 12347 DCI.AddToWorklist(FP.getNode()); 12348 } 12349 12350 return FP; 12351 } 12352 12353 return SDValue(); 12354 } 12355 12356 // expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for 12357 // builtins) into loads with swaps. 12358 SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N, 12359 DAGCombinerInfo &DCI) const { 12360 SelectionDAG &DAG = DCI.DAG; 12361 SDLoc dl(N); 12362 SDValue Chain; 12363 SDValue Base; 12364 MachineMemOperand *MMO; 12365 12366 switch (N->getOpcode()) { 12367 default: 12368 llvm_unreachable("Unexpected opcode for little endian VSX load"); 12369 case ISD::LOAD: { 12370 LoadSDNode *LD = cast<LoadSDNode>(N); 12371 Chain = LD->getChain(); 12372 Base = LD->getBasePtr(); 12373 MMO = LD->getMemOperand(); 12374 // If the MMO suggests this isn't a load of a full vector, leave 12375 // things alone. For a built-in, we have to make the change for 12376 // correctness, so if there is a size problem that will be a bug. 
12377 if (MMO->getSize() < 16) 12378 return SDValue(); 12379 break; 12380 } 12381 case ISD::INTRINSIC_W_CHAIN: { 12382 MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N); 12383 Chain = Intrin->getChain(); 12384 // Similarly to the store case below, Intrin->getBasePtr() doesn't get 12385 // us what we want. Get operand 2 instead. 12386 Base = Intrin->getOperand(2); 12387 MMO = Intrin->getMemOperand(); 12388 break; 12389 } 12390 } 12391 12392 MVT VecTy = N->getValueType(0).getSimpleVT(); 12393 12394 // Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is 12395 // aligned and the type is a vector with elements up to 4 bytes 12396 if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16) 12397 && VecTy.getScalarSizeInBits() <= 32 ) { 12398 return SDValue(); 12399 } 12400 12401 SDValue LoadOps[] = { Chain, Base }; 12402 SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl, 12403 DAG.getVTList(MVT::v2f64, MVT::Other), 12404 LoadOps, MVT::v2f64, MMO); 12405 12406 DCI.AddToWorklist(Load.getNode()); 12407 Chain = Load.getValue(1); 12408 SDValue Swap = DAG.getNode( 12409 PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load); 12410 DCI.AddToWorklist(Swap.getNode()); 12411 12412 // Add a bitcast if the resulting load type doesn't match v2f64. 12413 if (VecTy != MVT::v2f64) { 12414 SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap); 12415 DCI.AddToWorklist(N.getNode()); 12416 // Package {bitcast value, swap's chain} to match Load's shape. 12417 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other), 12418 N, Swap.getValue(1)); 12419 } 12420 12421 return Swap; 12422 } 12423 12424 // expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for 12425 // builtins) into stores with swaps. 
SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue Chain;
  SDValue Base;
  unsigned SrcOpnd;
  MachineMemOperand *MMO;

  // Accept either a plain vector store or a store-like target intrinsic;
  // record which operand holds the stored value (SrcOpnd) for each form.
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode for little endian VSX store");
  case ISD::STORE: {
    StoreSDNode *ST = cast<StoreSDNode>(N);
    Chain = ST->getChain();
    Base = ST->getBasePtr();
    MMO = ST->getMemOperand();
    SrcOpnd = 1;
    // If the MMO suggests this isn't a store of a full vector, leave
    // things alone. For a built-in, we have to make the change for
    // correctness, so if there is a size problem that will be a bug.
    if (MMO->getSize() < 16)
      return SDValue();
    break;
  }
  case ISD::INTRINSIC_VOID: {
    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
    Chain = Intrin->getChain();
    // Intrin->getBasePtr() oddly does not get what we want.
    Base = Intrin->getOperand(3);
    MMO = Intrin->getMemOperand();
    SrcOpnd = 2;
    break;
  }
  }

  SDValue Src = N->getOperand(SrcOpnd);
  MVT VecTy = Src.getValueType().getSimpleVT();

  // Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the load is
  // aligned and the type is a vector with elements up to 4 bytes
  if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16)
      && VecTy.getScalarSizeInBits() <= 32 ) {
    return SDValue();
  }

  // All stores are done as v2f64 and possible bit cast.
  if (VecTy != MVT::v2f64) {
    Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src);
    DCI.AddToWorklist(Src.getNode());
  }

  // Swap the doublewords first, then emit the doubleword store; together
  // these preserve the original element order in memory.
  SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
                             DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src);
  DCI.AddToWorklist(Swap.getNode());
  Chain = Swap.getValue(1);
  SDValue StoreOps[] = { Chain, Swap, Base };
  SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl,
                                          DAG.getVTList(MVT::Other),
                                          StoreOps, VecTy, MMO);
  DCI.AddToWorklist(Store.getNode());
  return Store;
}

// Handle DAG combine for STORE (FP_TO_INT F).
// Folds a float->int conversion feeding a store into a single
// PPCISD::ST_VSR_SCAL_INT node (convert-and-store from a VSR), avoiding a
// round trip through a GPR. Returns SDValue() when the pattern does not
// qualify (illegal types, truncating store, or missing subtarget support).
SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
                                               DAGCombinerInfo &DCI) const {

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  unsigned Opcode = N->getOperand(1).getOpcode();

  assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT)
         && "Not a FP_TO_INT Instruction!");

  SDValue Val = N->getOperand(1).getOperand(0);
  EVT Op1VT = N->getOperand(1).getValueType();
  EVT ResVT = Val.getValueType();

  // Floating point types smaller than 32 bits are not legal on Power.
  if (ResVT.getScalarSizeInBits() < 32)
    return SDValue();

  // Only perform combine for conversion to i64/i32 or power9 i16/i8.
  bool ValidTypeForStoreFltAsInt =
        (Op1VT == MVT::i32 || Op1VT == MVT::i64 ||
         (Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8)));

  if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Altivec() ||
      cast<StoreSDNode>(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt)
    return SDValue();

  // Extend f32 values to f64
  if (ResVT.getScalarSizeInBits() == 32) {
    Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
    DCI.AddToWorklist(Val.getNode());
  }

  // Set signed or unsigned conversion opcode.
  unsigned ConvOpcode = (Opcode == ISD::FP_TO_SINT) ?
                          PPCISD::FP_TO_SINT_IN_VSR :
                          PPCISD::FP_TO_UINT_IN_VSR;

  Val = DAG.getNode(ConvOpcode,
                    dl, ResVT == MVT::f128 ? MVT::f128 : MVT::f64, Val);
  DCI.AddToWorklist(Val.getNode());

  // Set number of bytes being converted.
  unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8;
  SDValue Ops[] = { N->getOperand(0), Val, N->getOperand(2),
                    DAG.getIntPtrConstant(ByteSize, dl, false),
                    DAG.getValueType(Op1VT) };

  // Replace the original store with the convert-and-store memory intrinsic,
  // reusing the store's memory VT and memory operand.
  Val = DAG.getMemIntrinsicNode(PPCISD::ST_VSR_SCAL_INT, dl,
          DAG.getVTList(MVT::Other), Ops,
          cast<StoreSDNode>(N)->getMemoryVT(),
          cast<StoreSDNode>(N)->getMemOperand());

  DCI.AddToWorklist(Val.getNode());
  return Val;
}

// PerformDAGCombine - Target-specific DAG combines, dispatched on the
// node's opcode. Each case either returns a replacement SDValue or falls
// through to `break` to indicate no combine applied.
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  switch (N->getOpcode()) {
  default: break;
  case ISD::ADD:
    return combineADD(N, DCI);
  case ISD::SHL:
    return combineSHL(N, DCI);
  case ISD::SRA:
    return combineSRA(N, DCI);
  case ISD::SRL:
    return combineSRL(N, DCI);
  case PPCISD::SHL:
    if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
      return N->getOperand(0);
    break;
  case PPCISD::SRL:
    if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0.
      return N->getOperand(0);
    break;
  case PPCISD::SRA:
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      if (C->isNullValue() ||   //  0 >>s V -> 0.
          C->isAllOnesValue())    // -1 >>s V -> -1.
        return N->getOperand(0);
    }
    break;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    return DAGCombineExtBoolTrunc(N, DCI);
  case ISD::TRUNCATE:
    return combineTRUNCATE(N, DCI);
  case ISD::SETCC:
    if (SDValue CSCC = combineSetCC(N, DCI))
      return CSCC;
    LLVM_FALLTHROUGH;
  case ISD::SELECT_CC:
    return DAGCombineTruncBoolExt(N, DCI);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return combineFPToIntToFP(N, DCI);
  case ISD::STORE: {

    EVT Op1VT = N->getOperand(1).getValueType();
    unsigned Opcode = N->getOperand(1).getOpcode();

    // Try STORE (FP_TO_INT F) -> convert-and-store from a VSR.
    if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT) {
      SDValue Val= combineStoreFPToInt(N, DCI);
      if (Val)
        return Val;
    }

    // Turn STORE (BSWAP) -> sthbrx/stwbrx.
    if (cast<StoreSDNode>(N)->isUnindexed() && Opcode == ISD::BSWAP &&
        N->getOperand(1).getNode()->hasOneUse() &&
        (Op1VT == MVT::i32 || Op1VT == MVT::i16 ||
         (Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) {

      // STBRX can only handle simple types and it makes no sense to store less
      // two bytes in byte-reversed order.
      EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
      if (mVT.isExtended() || mVT.getSizeInBits() < 16)
        break;

      SDValue BSwapOp = N->getOperand(1).getOperand(0);
      // Do an any-extend to 32-bits if this is a half-word input.
      if (BSwapOp.getValueType() == MVT::i16)
        BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);

      // If the type of BSWAP operand is wider than stored memory width
      // it need to be shifted to the right side before STBRX.
      if (Op1VT.bitsGT(mVT)) {
        int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
        BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp,
                              DAG.getConstant(Shift, dl, MVT::i32));
        // Need to truncate if this is a bswap of i64 stored as i32/i16.
        if (Op1VT == MVT::i64)
          BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp);
      }

      SDValue Ops[] = {
        N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT)
      };
      return
        DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
                                Ops, cast<StoreSDNode>(N)->getMemoryVT(),
                                cast<StoreSDNode>(N)->getMemOperand());
    }

    // STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>
    // So it can increase the chance of CSE constant construction.
    if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
        isa<ConstantSDNode>(N->getOperand(1)) && Op1VT == MVT::i32) {
      // Need to sign-extended to 64-bits to handle negative values.
      EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT();
      uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1),
                                    MemVT.getSizeInBits());
      SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64);

      // DAG.getTruncStore() can't be used here because it doesn't accept
      // the general (base + offset) addressing mode.
      // So we use UpdateNodeOperands and setTruncatingStore instead.
      DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2),
                             N->getOperand(3));
      cast<StoreSDNode>(N)->setTruncatingStore(true);
      return SDValue(N, 0);
    }

    // For little endian, VSX stores require generating xxswapd/lxvd2x.
    // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
    if (Op1VT.isSimple()) {
      MVT StoreVT = Op1VT.getSimpleVT();
      if (Subtarget.needsSwapsForVSXMemOps() &&
          (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
           StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
        return expandVSXStoreForLE(N, DCI);
    }
    break;
  }
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(N);
    EVT VT = LD->getValueType(0);

    // For little endian, VSX loads require generating lxvd2x/xxswapd.
    // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
    if (VT.isSimple()) {
      MVT LoadVT = VT.getSimpleVT();
      if (Subtarget.needsSwapsForVSXMemOps() &&
          (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
           LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
        return expandVSXLoadForLE(N, DCI);
    }

    // We sometimes end up with a 64-bit integer load, from which we extract
    // two single-precision floating-point numbers. This happens with
    // std::complex<float>, and other similar structures, because of the way we
    // canonicalize structure copies. However, if we lack direct moves,
    // then the final bitcasts from the extracted integer values to the
    // floating-point numbers turn into store/load pairs. Even with direct moves,
    // just loading the two floating-point numbers is likely better.
    // The lambda returns true (and rewrites the DAG via DCI) when the
    // pattern is matched; false means no change was made.
    auto ReplaceTwoFloatLoad = [&]() {
      if (VT != MVT::i64)
        return false;

      if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
          LD->isVolatile())
        return false;

      // We're looking for a sequence like this:
      // t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
      //   t16: i64 = srl t13, Constant:i32<32>
      // t17: i32 = truncate t16
      // t18: f32 = bitcast t17
      // t19: i32 = truncate t13
      // t20: f32 = bitcast t19

      if (!LD->hasNUsesOfValue(2, 0))
        return false;

      // Find the two value-result users; one must be a truncate and the
      // other a right shift by 32 (identified below, order-independent).
      auto UI = LD->use_begin();
      while (UI.getUse().getResNo() != 0) ++UI;
      SDNode *Trunc = *UI++;
      while (UI.getUse().getResNo() != 0) ++UI;
      SDNode *RightShift = *UI;
      if (Trunc->getOpcode() != ISD::TRUNCATE)
        std::swap(Trunc, RightShift);

      if (Trunc->getOpcode() != ISD::TRUNCATE ||
          Trunc->getValueType(0) != MVT::i32 ||
          !Trunc->hasOneUse())
        return false;
      if (RightShift->getOpcode() != ISD::SRL ||
          !isa<ConstantSDNode>(RightShift->getOperand(1)) ||
          RightShift->getConstantOperandVal(1) != 32 ||
          !RightShift->hasOneUse())
        return false;

      SDNode *Trunc2 = *RightShift->use_begin();
      if (Trunc2->getOpcode() != ISD::TRUNCATE ||
          Trunc2->getValueType(0) != MVT::i32 ||
          !Trunc2->hasOneUse())
        return false;

      SDNode *Bitcast = *Trunc->use_begin();
      SDNode *Bitcast2 = *Trunc2->use_begin();

      if (Bitcast->getOpcode() != ISD::BITCAST ||
          Bitcast->getValueType(0) != MVT::f32)
        return false;
      if (Bitcast2->getOpcode() != ISD::BITCAST ||
          Bitcast2->getValueType(0) != MVT::f32)
        return false;

      if (Subtarget.isLittleEndian())
        std::swap(Bitcast, Bitcast2);

      // Bitcast has the second float (in memory-layout order) and Bitcast2
      // has the first one.

      SDValue BasePtr = LD->getBasePtr();
      if (LD->isIndexed()) {
        assert(LD->getAddressingMode() == ISD::PRE_INC &&
               "Non-pre-inc AM on PPC?");
        BasePtr =
          DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                      LD->getOffset());
      }

      // The replacement loads are not volatile even if the original was;
      // strip MOVolatile from the flags.
      auto MMOFlags =
          LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile;
      SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
                                      LD->getPointerInfo(), LD->getAlignment(),
                                      MMOFlags, LD->getAAInfo());
      SDValue AddPtr =
        DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
                    BasePtr, DAG.getIntPtrConstant(4, dl));
      // Chain the second load on the first so their ordering is preserved.
      SDValue FloatLoad2 = DAG.getLoad(
          MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
          LD->getPointerInfo().getWithOffset(4),
          MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo());

      if (LD->isIndexed()) {
        // Note that DAGCombine should re-form any pre-increment load(s) from
        // what is produced here if that makes sense.
        DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
      }

      DCI.CombineTo(Bitcast2, FloatLoad);
      DCI.CombineTo(Bitcast, FloatLoad2);

      DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
                                    SDValue(FloatLoad2.getNode(), 1));
      return true;
    };

    if (ReplaceTwoFloatLoad())
      return SDValue(N, 0);

    EVT MemVT = LD->getMemoryVT();
    Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
    unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
    Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext());
    unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy);
    if (LD->isUnindexed() && VT.isVector() &&
        ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
          // P8 and later hardware should just use LOAD.
          !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 ||
                                       VT == MVT::v4i32 || VT == MVT::v4f32)) ||
         (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) &&
          LD->getAlignment() >= ScalarABIAlignment)) &&
        LD->getAlignment() < ABIAlignment) {
      // This is a type-legal unaligned Altivec or QPX load.
      SDValue Chain = LD->getChain();
      SDValue Ptr = LD->getBasePtr();
      bool isLittleEndian = Subtarget.isLittleEndian();

      // This implements the loading of unaligned vectors as described in
      // the venerable Apple Velocity Engine overview. Specifically:
      // https://developer.apple.com/hardwaredrivers/ve/alignment.html
      // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
      //
      // The general idea is to expand a sequence of one or more unaligned
      // loads into an alignment-based permutation-control instruction (lvsl
      // or lvsr), a series of regular vector loads (which always truncate
      // their input address to an aligned address), and a series of
      // permutations.  The results of these permutations are the requested
      // loaded values.  The trick is that the last "extra" load is not taken
      // from the address you might suspect (sizeof(vector) bytes after the
      // last requested load), but rather sizeof(vector) - 1 bytes after the
      // last requested vector. The point of this is to avoid a page fault if
      // the base address happened to be aligned. This works because if the
      // base address is aligned, then adding less than a full vector length
      // will cause the last vector in the sequence to be (re)loaded.
      // Otherwise, the next vector will be fetched as you might suspect was
      // necessary.

      // We might be able to reuse the permutation generation from
      // a different base address offset from this one by an aligned amount.
      // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
      // optimization later.
      Intrinsic::ID Intr, IntrLD, IntrPerm;
      MVT PermCntlTy, PermTy, LDTy;
      if (Subtarget.hasAltivec()) {
        Intr = isLittleEndian ?  Intrinsic::ppc_altivec_lvsr :
                                 Intrinsic::ppc_altivec_lvsl;
        IntrLD = Intrinsic::ppc_altivec_lvx;
        IntrPerm = Intrinsic::ppc_altivec_vperm;
        PermCntlTy = MVT::v16i8;
        PermTy = MVT::v4i32;
        LDTy = MVT::v4i32;
      } else {
        Intr =   MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld :
                                       Intrinsic::ppc_qpx_qvlpcls;
        IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd :
                                       Intrinsic::ppc_qpx_qvlfs;
        IntrPerm = Intrinsic::ppc_qpx_qvfperm;
        PermCntlTy = MVT::v4f64;
        PermTy = MVT::v4f64;
        LDTy = MemVT.getSimpleVT();
      }

      SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy);

      // Create the new MMO for the new base load. It is like the original MMO,
      // but represents an area in memory almost twice the vector size centered
      // on the original address. If the address is unaligned, we might start
      // reading up to (sizeof(vector)-1) bytes below the address of the
      // original unaligned load.
      MachineFunction &MF = DAG.getMachineFunction();
      MachineMemOperand *BaseMMO =
        MF.getMachineMemOperand(LD->getMemOperand(),
                                -(long)MemVT.getStoreSize()+1,
                                2*MemVT.getStoreSize()-1);

      // Create the new base load.
      SDValue LDXIntID =
          DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout()));
      SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
      SDValue BaseLoad =
        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
                                DAG.getVTList(PermTy, MVT::Other),
                                BaseLoadOps, LDTy, BaseMMO);

      // Note that the value of IncOffset (which is provided to the next
      // load's pointer info offset value, and thus used to calculate the
      // alignment), and the value of IncValue (which is actually used to
      // increment the pointer value) are different! This is because we
      // require the next load to appear to be aligned, even though it
      // is actually offset from the base pointer by a lesser amount.
      int IncOffset = VT.getSizeInBits() / 8;
      int IncValue = IncOffset;

      // Walk (both up and down) the chain looking for another load at the real
      // (aligned) offset (the alignment of the other load does not matter in
      // this case). If found, then do not use the offset reduction trick, as
      // that will prevent the loads from being later combined (as they would
      // otherwise be duplicates).
      if (!findConsecutiveLoad(LD, DAG))
        --IncValue;

      SDValue Increment =
          DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout()));
      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);

      // The "extra" load covers the tail bytes; see the long comment above
      // for why it starts only 1 byte past the original address range.
      MachineMemOperand *ExtraMMO =
        MF.getMachineMemOperand(LD->getMemOperand(),
                                1, 2*MemVT.getStoreSize()-1);
      SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
      SDValue ExtraLoad =
        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
                                DAG.getVTList(PermTy, MVT::Other),
                                ExtraLoadOps, LDTy, ExtraMMO);

      SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
        BaseLoad.getValue(1), ExtraLoad.getValue(1));

      // Because vperm has a big-endian bias, we must reverse the order
      // of the input vectors and complement the permute control vector
      // when generating little endian code.  We have already handled the
      // latter by using lvsr instead of lvsl, so just reverse BaseLoad
      // and ExtraLoad here.
      SDValue Perm;
      if (isLittleEndian)
        Perm = BuildIntrinsicOp(IntrPerm,
                                ExtraLoad, BaseLoad, PermCntl, DAG, dl);
      else
        Perm = BuildIntrinsicOp(IntrPerm,
                                BaseLoad, ExtraLoad, PermCntl, DAG, dl);

      if (VT != PermTy)
        Perm = Subtarget.hasAltivec() ?
                 DAG.getNode(ISD::BITCAST, dl, VT, Perm) :
                 DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX
                             DAG.getTargetConstant(1, dl, MVT::i64));
                               // second argument is 1 because this rounding
                               // is always exact.

      // The output of the permutation is our loaded result, the TokenFactor is
      // our new chain.
      DCI.CombineTo(N, Perm, TF);
      return SDValue(N, 0);
    }
  }
  break;
  case ISD::INTRINSIC_WO_CHAIN: {
    // Try to reuse an equivalent LVSL/LVSR (or QPX permute-control)
    // computation already present in the DAG for an address that differs
    // from this one by an aligned amount.
    bool isLittleEndian = Subtarget.isLittleEndian();
    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
                                         : Intrinsic::ppc_altivec_lvsl);
    if ((IID == Intr ||
         IID == Intrinsic::ppc_qpx_qvlpcld ||
         IID == Intrinsic::ppc_qpx_qvlpcls) &&
        N->getOperand(1)->getOpcode() == ISD::ADD) {
      SDValue Add = N->getOperand(1);

      int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ?
                 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */;

      // Case 1: the ADD's second operand is provably a multiple of the
      // alignment, so any matching intrinsic on the base pointer computes
      // the same permute control.
      if (DAG.MaskedValueIsZero(Add->getOperand(1),
                                APInt::getAllOnesValue(Bits /* alignment */)
                                    .zext(Add.getScalarValueSizeInBits()))) {
        SDNode *BasePtr = Add->getOperand(0).getNode();
        for (SDNode::use_iterator UI = BasePtr->use_begin(),
                                  UE = BasePtr->use_end();
             UI != UE; ++UI) {
          if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
              cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) {
            // We've found another LVSL/LVSR, and this address is an aligned
            // multiple of that one. The results will be the same, so use the
            // one we've just found instead.

            return SDValue(*UI, 0);
          }
        }
      }

      // Case 2: both offsets are constants whose difference is a multiple
      // of the alignment; reuse a matching intrinsic on the sibling ADD.
      if (isa<ConstantSDNode>(Add->getOperand(1))) {
        SDNode *BasePtr = Add->getOperand(0).getNode();
        for (SDNode::use_iterator UI = BasePtr->use_begin(),
             UE = BasePtr->use_end(); UI != UE; ++UI) {
          if (UI->getOpcode() == ISD::ADD &&
              isa<ConstantSDNode>(UI->getOperand(1)) &&
              (cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() -
               cast<ConstantSDNode>(UI->getOperand(1))->getZExtValue()) %
              (1ULL << Bits) == 0) {
            SDNode *OtherAdd = *UI;
            for (SDNode::use_iterator VI = OtherAdd->use_begin(),
                 VE = OtherAdd->use_end(); VI != VE; ++VI) {
              if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
                  cast<ConstantSDNode>(VI->getOperand(0))->getZExtValue() == IID) {
                return SDValue(*VI, 0);
              }
            }
          }
        }
      }
    }
  }

  break;
  case ISD::INTRINSIC_W_CHAIN:
    // For little endian, VSX loads require generating lxvd2x/xxswapd.
    // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
    if (Subtarget.needsSwapsForVSXMemOps()) {
      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
      default:
        break;
      case Intrinsic::ppc_vsx_lxvw4x:
      case Intrinsic::ppc_vsx_lxvd2x:
        return expandVSXLoadForLE(N, DCI);
      }
    }
    break;
  case ISD::INTRINSIC_VOID:
    // For little endian, VSX stores require generating xxswapd/stxvd2x.
    // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
    if (Subtarget.needsSwapsForVSXMemOps()) {
      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
      default:
        break;
      case Intrinsic::ppc_vsx_stxvw4x:
      case Intrinsic::ppc_vsx_stxvd2x:
        return expandVSXStoreForLE(N, DCI);
      }
    }
    break;
  case ISD::BSWAP:
    // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
    if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
        N->getOperand(0).hasOneUse() &&
        (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
         (Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
          N->getValueType(0) == MVT::i64))) {
      SDValue Load = N->getOperand(0);
      LoadSDNode *LD = cast<LoadSDNode>(Load);
      // Create the byte-swapping load.
      SDValue Ops[] = {
        LD->getChain(),    // Chain
        LD->getBasePtr(),  // Ptr
        DAG.getValueType(N->getValueType(0)) // VT
      };
      SDValue BSLoad =
        DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
                                DAG.getVTList(N->getValueType(0) == MVT::i64 ?
                                              MVT::i64 : MVT::i32, MVT::Other),
                                Ops, LD->getMemoryVT(), LD->getMemOperand());

      // If this is an i16 load, insert the truncate.
      SDValue ResVal = BSLoad;
      if (N->getValueType(0) == MVT::i16)
        ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);

      // First, combine the bswap away.  This makes the value produced by the
      // load dead.
      DCI.CombineTo(N, ResVal);

      // Next, combine the load away, we give it a bogus result value but a real
      // chain result.  The result value is dead because the bswap is dead.
      DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));

      // Return N so it doesn't get rechecked!
      return SDValue(N, 0);
    }
    break;
  case PPCISD::VCMP:
    // If a VCMPo node already exists with exactly the same operands as this
    // node, use its result instead of this node (VCMPo computes both a CR6 and
    // a normal output).
    //
    if (!N->getOperand(0).hasOneUse() &&
        !N->getOperand(1).hasOneUse() &&
        !N->getOperand(2).hasOneUse()) {

      // Scan all of the users of the LHS, looking for VCMPo's that match.
      SDNode *VCMPoNode = nullptr;

      SDNode *LHSN = N->getOperand(0).getNode();
      for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
           UI != E; ++UI)
        if (UI->getOpcode() == PPCISD::VCMPo &&
            UI->getOperand(1) == N->getOperand(1) &&
            UI->getOperand(2) == N->getOperand(2) &&
            UI->getOperand(0) == N->getOperand(0)) {
          VCMPoNode = *UI;
          break;
        }

      // If there is no VCMPo node, or if the flag value has a single use, don't
      // transform this.
      if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1))
        break;

      // Look at the (necessarily single) use of the flag value.  If it has a
      // chain, this transformation is more complex.  Note that multiple things
      // could use the value result, which we should ignore.
      SDNode *FlagUser = nullptr;
      for (SDNode::use_iterator UI = VCMPoNode->use_begin();
           FlagUser == nullptr; ++UI) {
        assert(UI != VCMPoNode->use_end() && "Didn't find user!");
        SDNode *User = *UI;
        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
          if (User->getOperand(i) == SDValue(VCMPoNode, 1)) {
            FlagUser = User;
            break;
          }
        }
      }

      // If the user is a MFOCRF instruction, we know this is safe.
      // Otherwise we give up for right now.
      if (FlagUser->getOpcode() == PPCISD::MFOCRF)
        return SDValue(VCMPoNode, 0);
    }
    break;
  case ISD::BRCOND: {
    // Turn a branch on the CTR-decrement intrinsic into PPCISD::BDNZ.
    SDValue Cond = N->getOperand(1);
    SDValue Target = N->getOperand(2);

    if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
        cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
          Intrinsic::ppc_is_decremented_ctr_nonzero) {

      // We now need to make the intrinsic dead (it cannot be instruction
      // selected).
      DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0));
      assert(Cond.getNode()->hasOneUse() &&
             "Counter decrement has more than one use");

      return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other,
                         N->getOperand(0), Target);
    }
  }
  break;
  case ISD::BR_CC: {
    // If this is a branch on an altivec predicate comparison, lower this so
    // that we don't have to do a MFOCRF: instead, branch directly on CR6.  This
    // lowering is done pre-legalize, because the legalizer lowers the predicate
    // compare down to code that is difficult to reassemble.
    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
    SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);

    // Sometimes the promoted value of the intrinsic is ANDed by some non-zero
    // value. If so, pass-through the AND to get to the intrinsic.
    if (LHS.getOpcode() == ISD::AND &&
        LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
        cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
          Intrinsic::ppc_is_decremented_ctr_nonzero &&
        isa<ConstantSDNode>(LHS.getOperand(1)) &&
        !isNullConstant(LHS.getOperand(1)))
      LHS = LHS.getOperand(0);

    if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
        cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
          Intrinsic::ppc_is_decremented_ctr_nonzero &&
        isa<ConstantSDNode>(RHS)) {
      assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
             "Counter decrement comparison is not EQ or NE");

      unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
      bool isBDNZ = (CC == ISD::SETEQ && Val) ||
                    (CC == ISD::SETNE && !Val);

      // We now need to make the intrinsic dead (it cannot be instruction
      // selected).
      DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0));
      assert(LHS.getNode()->hasOneUse() &&
             "Counter decrement has more than one use");

      return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other,
                         N->getOperand(0), N->getOperand(4));
    }

    int CompareOpc;
    bool isDot;

    if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
        isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) &&
        getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) {
      assert(isDot && "Can't compare against a vector result!");

      // If this is a comparison against something other than 0/1, then we know
      // that the condition is never/always true.
      unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
      if (Val != 0 && Val != 1) {
        if (CC == ISD::SETEQ)      // Cond never true, remove branch.
          return N->getOperand(0);
        // Always !=, turn it into an unconditional branch.
        return DAG.getNode(ISD::BR, dl, MVT::Other,
                           N->getOperand(0), N->getOperand(4));
      }

      bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);

      // Create the PPCISD altivec 'dot' comparison node.
      SDValue Ops[] = {
        LHS.getOperand(2),  // LHS of compare
        LHS.getOperand(3),  // RHS of compare
        DAG.getConstant(CompareOpc, dl, MVT::i32)
      };
      EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
      SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);

      // Unpack the result based on how the target uses it.
      PPC::Predicate CompOpc;
      switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) {
      default:  // Can't happen, don't crash on invalid number though.
      case 0:   // Branch on the value of the EQ bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
        break;
      case 1:   // Branch on the inverted value of the EQ bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
        break;
      case 2:   // Branch on the value of the LT bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
        break;
      case 3:   // Branch on the inverted value of the LT bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
        break;
      }

      return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),
                         DAG.getConstant(CompOpc, dl, MVT::i32),
                         DAG.getRegister(PPC::CR6, MVT::i32),
                         N->getOperand(4), CompNode.getValue(1));
    }
    break;
  }
  case ISD::BUILD_VECTOR:
    return DAGCombineBuildVector(N, DCI);
  }

  return SDValue();
}

// BuildSDIVPow2 - Custom lowering for sdiv by a (possibly negative)
// power of two: emit PPCISD::SRA_ADDZE (arithmetic shift plus carry
// adjustment), negating the result for a negative divisor. New nodes are
// recorded in Created for the caller.
SDValue
PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                 SelectionDAG &DAG,
                                 SmallVectorImpl<SDNode *> &Created) const {
  // fold (sdiv X, pow2)
  EVT VT = N->getValueType(0);
  if (VT == MVT::i64 && !Subtarget.isPPC64())
    return SDValue();
  if ((VT != MVT::i32 && VT != MVT::i64) ||
      !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
    return SDValue();

  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);

  bool IsNegPow2 = (-Divisor).isPowerOf2();
  unsigned Lg2 = (IsNegPow2 ?
                  -Divisor : Divisor).countTrailingZeros();
  SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT);

  SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt);
  Created.push_back(Op.getNode());

  // For a negative power-of-two divisor, negate the shifted result.
  if (IsNegPow2) {
    Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
    Created.push_back(Op.getNode());
  }

  return Op;
}

//===----------------------------------------------------------------------===//
//  Inline Assembly Support
//===----------------------------------------------------------------------===//

// Report target-specific known-zero/known-one bits for PPC DAG nodes so the
// generic combiner can simplify users of these nodes.
void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  Known.resetAll();
  switch (Op.getOpcode()) {
  default: break;
  case PPCISD::LBRX: {
    // lhbrx is known to have the top bits cleared out.
    if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16)
      Known.Zero = 0xFFFF0000;
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    // The AltiVec predicate-form compares produce a 0/1 result.
    switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
    default: break;
    case Intrinsic::ppc_altivec_vcmpbfp_p:
    case Intrinsic::ppc_altivec_vcmpeqfp_p:
    case Intrinsic::ppc_altivec_vcmpequb_p:
    case Intrinsic::ppc_altivec_vcmpequh_p:
    case Intrinsic::ppc_altivec_vcmpequw_p:
    case Intrinsic::ppc_altivec_vcmpequd_p:
    case Intrinsic::ppc_altivec_vcmpgefp_p:
    case Intrinsic::ppc_altivec_vcmpgtfp_p:
    case Intrinsic::ppc_altivec_vcmpgtsb_p:
    case Intrinsic::ppc_altivec_vcmpgtsh_p:
    case Intrinsic::ppc_altivec_vcmpgtsw_p:
    case Intrinsic::ppc_altivec_vcmpgtsd_p:
    case Intrinsic::ppc_altivec_vcmpgtub_p:
    case Intrinsic::ppc_altivec_vcmpgtuh_p:
    case Intrinsic::ppc_altivec_vcmpgtuw_p:
    case Intrinsic::ppc_altivec_vcmpgtud_p:
      Known.Zero = ~1U;  // All bits but the low one are known to be zero.
      break;
    }
  }
  }
}

// Return the preferred loop alignment (log2, in bytes) for the given loop on
// the current subtarget, deferring to the generic implementation otherwise.
unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  switch (Subtarget.getDarwinDirective()) {
  default: break;
  case PPC::DIR_970:
  case PPC::DIR_PWR4:
  case PPC::DIR_PWR5:
  case PPC::DIR_PWR5X:
  case PPC::DIR_PWR6:
  case PPC::DIR_PWR6X:
  case PPC::DIR_PWR7:
  case PPC::DIR_PWR8:
  case PPC::DIR_PWR9: {
    if (!ML)
      break;

    const PPCInstrInfo *TII = Subtarget.getInstrInfo();

    // For small loops (between 5 and 8 instructions), align to a 32-byte
    // boundary so that the entire loop fits in one instruction-cache line.
    uint64_t LoopSize = 0;
    for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
      for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) {
        LoopSize += TII->getInstSizeInBytes(*J);
        // No need to keep counting once we exceed the 32-byte threshold.
        if (LoopSize > 32)
          break;
      }

    // Return log2 alignment: 5 => 2^5 = 32-byte alignment.
    if (LoopSize > 16 && LoopSize <= 32)
      return 5;

    break;
  }
  }

  return TargetLowering::getPrefLoopAlignment(ML);
}

/// getConstraintType - Given a constraint, return the type of
/// constraint it is for this target.
PPCTargetLowering::ConstraintType
PPCTargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 'b':
    case 'r':
    case 'f':
    case 'd':
    case 'v':
    case 'y':
      return C_RegisterClass;
    case 'Z':
      // FIXME: While Z does indicate a memory constraint, it specifically
      // indicates an r+r address (used in conjunction with the 'y' modifier
      // in the replacement string). Currently, we're forcing the base
      // register to be r0 in the asm printer (which is interpreted as zero)
      // and forming the complete address in the second register. This is
      // suboptimal.
      return C_Memory;
    }
  } else if (Constraint == "wc") { // individual CR bits.
    return C_RegisterClass;
  } else if (Constraint == "wa" || Constraint == "wd" ||
             Constraint == "wf" || Constraint == "ws" ||
             Constraint == "wi") {
    return C_RegisterClass; // VSX registers.
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
PPCTargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();

  // Look at the constraint type.
  // Multi-character 'w*' constraints are checked first, before the
  // single-character switch below.
  if (StringRef(constraint) == "wc" && type->isIntegerTy(1))
    return CW_Register; // an individual CR bit.
  else if ((StringRef(constraint) == "wa" ||
            StringRef(constraint) == "wd" ||
            StringRef(constraint) == "wf") &&
           type->isVectorTy())
    return CW_Register;
  else if (StringRef(constraint) == "ws" && type->isDoubleTy())
    return CW_Register;
  else if (StringRef(constraint) == "wi" && type->isIntegerTy(64))
    return CW_Register; // just hold 64-bit integers data.
  // Single-character constraint letters.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'b':
    if (type->isIntegerTy())
      weight = CW_Register;
    break;
  case 'f':
    if (type->isFloatTy())
      weight = CW_Register;
    break;
  case 'd':
    if (type->isDoubleTy())
      weight = CW_Register;
    break;
  case 'v':
    if (type->isVectorTy())
      weight = CW_Register;
    break;
  case 'y':
    weight = CW_Register;
    break;
  case 'Z':
    weight = CW_Memory;
    break;
  }
  return weight;
}

// Map an inline-asm register constraint (and requested value type) to a
// concrete register / register class for this subtarget.
std::pair<unsigned, const TargetRegisterClass *>
PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  if (Constraint.size() == 1) {
    // GCC RS6000 Constraint Letters
    switch (Constraint[0]) {
    case 'b':   // R1-R31
      if (VT == MVT::i64 && Subtarget.isPPC64())
        return std::make_pair(0U, &PPC::G8RC_NOX0RegClass);
      return std::make_pair(0U, &PPC::GPRC_NOR0RegClass);
    case 'r':   // R0-R31
      if (VT == MVT::i64 && Subtarget.isPPC64())
        return std::make_pair(0U, &PPC::G8RCRegClass);
      return std::make_pair(0U, &PPC::GPRCRegClass);
    // 'd' and 'f' constraints are both defined to be "the floating point
    // registers", where one is for 32-bit and the other for 64-bit. We don't
    // really care overly much here so just give them all the same reg classes.
    case 'd':
    case 'f':
      if (Subtarget.hasSPE()) {
        if (VT == MVT::f32 || VT == MVT::i32)
          return std::make_pair(0U, &PPC::SPE4RCRegClass);
        if (VT == MVT::f64 || VT == MVT::i64)
          return std::make_pair(0U, &PPC::SPERCRegClass);
      } else {
        if (VT == MVT::f32 || VT == MVT::i32)
          return std::make_pair(0U, &PPC::F4RCRegClass);
        if (VT == MVT::f64 || VT == MVT::i64)
          return std::make_pair(0U, &PPC::F8RCRegClass);
        if (VT == MVT::v4f64 && Subtarget.hasQPX())
          return std::make_pair(0U, &PPC::QFRCRegClass);
        if (VT == MVT::v4f32 && Subtarget.hasQPX())
          return std::make_pair(0U, &PPC::QSRCRegClass);
      }
      break;
    case 'v':
      // QPX vector registers take precedence when available; otherwise fall
      // back to the AltiVec vector registers.
      if (VT == MVT::v4f64 && Subtarget.hasQPX())
        return std::make_pair(0U, &PPC::QFRCRegClass);
      if (VT == MVT::v4f32 && Subtarget.hasQPX())
        return std::make_pair(0U, &PPC::QSRCRegClass);
      if (Subtarget.hasAltivec())
        return std::make_pair(0U, &PPC::VRRCRegClass);
      break;
    case 'y':   // crrc
      return std::make_pair(0U, &PPC::CRRCRegClass);
    }
  } else if (Constraint == "wc" && Subtarget.useCRBits()) {
    // An individual CR bit.
    return std::make_pair(0U, &PPC::CRBITRCRegClass);
  } else if ((Constraint == "wa" || Constraint == "wd" ||
              Constraint == "wf" || Constraint == "wi") &&
             Subtarget.hasVSX()) {
    return std::make_pair(0U, &PPC::VSRCRegClass);
  } else if (Constraint == "ws" && Subtarget.hasVSX()) {
    if (VT == MVT::f32 && Subtarget.hasP8Vector())
      return std::make_pair(0U, &PPC::VSSRCRegClass);
    else
      return std::make_pair(0U, &PPC::VSFRCRegClass);
  }

  std::pair<unsigned, const TargetRegisterClass *> R =
      TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
  // (which we call X[0-9]+). If a 64-bit value has been requested, and a
  // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
  // register.
  // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
  // the AsmName field from *RegisterInfo.td, then this would not be necessary.
  if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
      PPC::GPRCRegClass.contains(R.first))
    return std::make_pair(TRI->getMatchingSuperReg(R.first,
                          PPC::sub_32, &PPC::G8RCRegClass),
                          &PPC::G8RCRegClass);

  // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
  if (!R.second && StringRef("{cc}").equals_lower(Constraint)) {
    R.first = PPC::CR0;
    R.second = &PPC::CRRCRegClass;
  }

  return R;
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints.
  if (Constraint.length() > 1) return;

  char Letter = Constraint[0];
  switch (Letter) {
  default: break;
  case 'I':
  case 'J':
  case 'K':
  case 'L':
  case 'M':
  case 'N':
  case 'O':
  case 'P': {
    ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
    if (!CST) return; // Must be an immediate to match.
    SDLoc dl(Op);
    int64_t Value = CST->getSExtValue();
    EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative
                         // numbers are printed as such.
    switch (Letter) {
    default: llvm_unreachable("Unknown constraint letter!");
    case 'I': // "I" is a signed 16-bit constant.
      if (isInt<16>(Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'J': // "J" is a constant with only the high-order 16 bits nonzero.
      if (isShiftedUInt<16, 16>(Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'L': // "L" is a signed 16-bit constant shifted left 16 bits.
      if (isShiftedInt<16, 16>(Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'K': // "K" is a constant with only the low-order 16 bits nonzero.
      if (isUInt<16>(Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'M': // "M" is a constant that is greater than 31.
      if (Value > 31)
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'N': // "N" is a positive constant that is an exact power of two.
      if (Value > 0 && isPowerOf2_64(Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'O': // "O" is the constant zero.
      if (Value == 0)
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'P': // "P" is a constant whose negation is a signed 16-bit constant.
      if (isInt<16>(-Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    }
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }

  // Handle standard constraint letters.
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS, Instruction *I) const {
  // PPC does not allow r+i addressing modes for vectors!
  if (Ty->isVectorTy() && AM.BaseOffs != 0)
    return false;

  // PPC allows a sign-extended 16-bit immediate field.
  if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
    return false;

  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  // PPC only support r+r,
  switch (AM.Scale) {
  case 0: // "r+i" or just "i", depending on HasBaseReg.
    break;
  case 1:
    if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
      return false;
    // Otherwise we have r+r or r+i.
    break;
  case 2:
    if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
      return false;
    // Allow 2*r as r+r.
    break;
  default:
    // No other scales are supported.
    return false;
  }

  return true;
}

// Lower ISD::RETURNADDR: produce the return address for the requested call
// frame depth, loading it from the link-register save slot on the stack.
SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  // Make sure the function does not optimize away the store of the RA to
  // the stack.
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setLRStoreRequired();
  bool isPPC64 = Subtarget.isPPC64();
  auto PtrVT = getPointerTy(MF.getDataLayout());

  if (Depth > 0) {
    // Walk up Depth frames, then load the saved LR at the ABI return-save
    // offset from that frame pointer.
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset =
        DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl,
                        isPPC64 ?
                          MVT::i64 : MVT::i32);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Just load the return address off the stack.
  SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                     MachinePointerInfo());
}

// Lower ISD::FRAMEADDR: return the frame address for the requested call
// frame depth by chasing saved frame pointers up the stack.
SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setFrameAddressIsTaken(true);

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  bool isPPC64 = PtrVT == MVT::i64;

  // Naked functions never have a frame pointer, and so we use r1. For all
  // other functions, this decision must be delayed until during PEI.
  unsigned FrameReg;
  if (MF.getFunction().hasFnAttribute(Attribute::Naked))
    FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
  else
    FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;

  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
                                         PtrVT);
  // Each saved frame address is loaded from the start of the previous frame.
  while (Depth--)
    FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
                            FrameAddr, MachinePointerInfo());
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
// Resolve a named register (for global register variables) to a physical
// register, or report a fatal error if the name/type is not supported.
unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                              SelectionDAG &DAG) const {
  bool isPPC64 = Subtarget.isPPC64();
  bool isDarwinABI = Subtarget.isDarwinABI();

  if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) ||
      (!isPPC64 && VT != MVT::i32))
    report_fatal_error("Invalid register global variable type");

  bool is64Bit = isPPC64 && VT == MVT::i64;
  // r2/r13 are reserved (TOC/thread pointer) on some ABIs, hence the 0
  // (unsupported) results below.
  unsigned Reg = StringSwitch<unsigned>(RegName)
                   .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
                   .Case("r2", (isDarwinABI || isPPC64) ? 0 : PPC::R2)
                   .Case("r13", (!isPPC64 && isDarwinABI) ? 0 :
                                  (is64Bit ? PPC::X13 : PPC::R13))
                   .Default(0);

  if (Reg)
    return Reg;
  report_fatal_error("Invalid register name global variable");
}

// Return true if the given address (global, jump table, block address...)
// is accessed through a load from the TOC/GOT rather than directly.
bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
  // 32-bit SVR4 ABI access everything as got-indirect.
  if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
    return true;

  CodeModel::Model CModel = getTargetMachine().getCodeModel();
  // If it is small or large code model, module locals are accessed
  // indirectly by loading their address from .toc/.got. The difference
  // is that for large code model we have ADDISTocHa + LDtocL and for
  // small code model we simply have LDtoc.
  if (CModel == CodeModel::Small || CModel == CodeModel::Large)
    return true;

  // JumpTable and BlockAddress are accessed as got-indirect.
  if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA))
    return true;

  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
    const GlobalValue *GV = G->getGlobal();
    unsigned char GVFlags = Subtarget.classifyGlobalReference(GV);
    // The NLP flag indicates that a global access has to use an
    // extra indirection.
    if (GVFlags & PPCII::MO_NLP_FLAG)
      return true;
  }

  return false;
}

bool
PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The PowerPC target isn't yet aware of offsets.
  return false;
}

// Describe the memory access performed by PPC load/store intrinsics so the
// optimizer can reason about them (aliasing, DSE, etc.). Returns true and
// fills Info when Intrinsic touches memory.
bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::ppc_qpx_qvlfd:
  case Intrinsic::ppc_qpx_qvlfs:
  case Intrinsic::ppc_qpx_qvlfcd:
  case Intrinsic::ppc_qpx_qvlfcs:
  case Intrinsic::ppc_qpx_qvlfiwa:
  case Intrinsic::ppc_qpx_qvlfiwz:
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
  case Intrinsic::ppc_altivec_lvebx:
  case Intrinsic::ppc_altivec_lvehx:
  case Intrinsic::ppc_altivec_lvewx:
  case Intrinsic::ppc_vsx_lxvd2x:
  case Intrinsic::ppc_vsx_lxvw4x: {
    EVT VT;
    switch (Intrinsic) {
    case Intrinsic::ppc_altivec_lvebx:
      VT = MVT::i8;
      break;
    case Intrinsic::ppc_altivec_lvehx:
      VT = MVT::i16;
      break;
    case Intrinsic::ppc_altivec_lvewx:
      VT = MVT::i32;
      break;
    case Intrinsic::ppc_vsx_lxvd2x:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvlfd:
      VT = MVT::v4f64;
      break;
    case Intrinsic::ppc_qpx_qvlfs:
      VT = MVT::v4f32;
      break;
    case Intrinsic::ppc_qpx_qvlfcd:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvlfcs:
      VT = MVT::v2f32;
      break;
    default:
      VT = MVT::v4i32;
      break;
    }

    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = VT;
    Info.ptrVal = I.getArgOperand(0);
    // These loads implicitly truncate the address to its natural alignment,
    // so report a conservative window around the pointer:
    // [ptr - size + 1, ptr + size - 1].
    Info.offset = -VT.getStoreSize()+1;
    Info.size = 2*VT.getStoreSize()-1;
    Info.align = 1;
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::ppc_qpx_qvlfda:
  case Intrinsic::ppc_qpx_qvlfsa:
  case Intrinsic::ppc_qpx_qvlfcda:
  case Intrinsic::ppc_qpx_qvlfcsa:
  case Intrinsic::ppc_qpx_qvlfiwaa:
  case Intrinsic::ppc_qpx_qvlfiwza: {
    EVT VT;
    switch (Intrinsic) {
    case Intrinsic::ppc_qpx_qvlfda:
      VT = MVT::v4f64;
      break;
    case Intrinsic::ppc_qpx_qvlfsa:
      VT = MVT::v4f32;
      break;
    case Intrinsic::ppc_qpx_qvlfcda:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvlfcsa:
      VT = MVT::v2f32;
      break;
    default:
      VT = MVT::v4i32;
      break;
    }

    // The 'a' (aligned) QPX load variants access exactly [ptr, ptr + size).
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = VT;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.size = VT.getStoreSize();
    Info.align = 1;
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::ppc_qpx_qvstfd:
  case Intrinsic::ppc_qpx_qvstfs:
  case Intrinsic::ppc_qpx_qvstfcd:
  case Intrinsic::ppc_qpx_qvstfcs:
  case Intrinsic::ppc_qpx_qvstfiw:
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
  case Intrinsic::ppc_altivec_stvebx:
  case Intrinsic::ppc_altivec_stvehx:
  case Intrinsic::ppc_altivec_stvewx:
  case Intrinsic::ppc_vsx_stxvd2x:
  case Intrinsic::ppc_vsx_stxvw4x: {
    EVT VT;
    switch (Intrinsic) {
    case Intrinsic::ppc_altivec_stvebx:
      VT = MVT::i8;
      break;
    case Intrinsic::ppc_altivec_stvehx:
      VT = MVT::i16;
      break;
    case Intrinsic::ppc_altivec_stvewx:
      VT = MVT::i32;
      break;
    case Intrinsic::ppc_vsx_stxvd2x:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvstfd:
      VT = MVT::v4f64;
      break;
    case Intrinsic::ppc_qpx_qvstfs:
      VT = MVT::v4f32;
      break;
    case Intrinsic::ppc_qpx_qvstfcd:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvstfcs:
      VT = MVT::v2f32;
      break;
    default:
      VT = MVT::v4i32;
      break;
    }

    // Stores: the value is operand 0, the pointer is operand 1. Same
    // conservative address window as the unaligned loads above.
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = VT;
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = -VT.getStoreSize()+1;
    Info.size = 2*VT.getStoreSize()-1;
    Info.align = 1;
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::ppc_qpx_qvstfda:
  case Intrinsic::ppc_qpx_qvstfsa:
  case Intrinsic::ppc_qpx_qvstfcda:
  case Intrinsic::ppc_qpx_qvstfcsa:
  case Intrinsic::ppc_qpx_qvstfiwa: {
    EVT VT;
    switch (Intrinsic) {
    case Intrinsic::ppc_qpx_qvstfda:
      VT = MVT::v4f64;
      break;
    case Intrinsic::ppc_qpx_qvstfsa:
      VT = MVT::v4f32;
      break;
    case Intrinsic::ppc_qpx_qvstfcda:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvstfcsa:
      VT = MVT::v2f32;
      break;
    default:
      VT = MVT::v4i32;
      break;
    }

    // The 'a' (aligned) QPX store variants access exactly [ptr, ptr + size).
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = VT;
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.size = VT.getStoreSize();
    Info.align = 1;
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  default:
    break;
  }

  return false;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly if SrcAlign is zero it
/// means there isn't a need to check it against alignment requirement,
/// probably because the source does not need to be loaded. If 'IsMemset' is
/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
                                           unsigned DstAlign, unsigned SrcAlign,
                                           bool IsMemset, bool ZeroMemset,
                                           bool MemcpyStrSrc,
                                           MachineFunction &MF) const {
  if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
    const Function &F = MF.getFunction();
    // When expanding a memset, require at least two QPX instructions to cover
    // the cost of loading the value to be stored from the constant pool.
    if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
        (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
        !F.hasFnAttribute(Attribute::NoImplicitFloat)) {
      return MVT::v4f64;
    }

    // We should use Altivec/VSX loads and stores when available. For unaligned
    // addresses, unaligned VSX loads are only fast starting with the P8.
    if (Subtarget.hasAltivec() && Size >= 16 &&
        (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) ||
         ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
      return MVT::v4i32;
  }

  if (Subtarget.isPPC64()) {
    return MVT::i64;
  }

  return MVT::i32;
}

/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  // Any integer constant up to 64 bits can be materialized in registers.
  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  return !(BitSize == 0 || BitSize > 64);
}

// Truncation from i64 to i32 is free (it is just a subregister access).
bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 == 64 && NumBits2 == 32;
}

// EVT flavor of the above: only i64 -> i32 truncation is free.
bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 == 64 && NumBits2 == 32;
}

bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  // Generally speaking, zexts are not free, but they are free when they can be
  // folded with other operations.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {
    EVT MemVT = LD->getMemoryVT();
    if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||
         (Subtarget.isPPC64() && MemVT == MVT::i32)) &&
        (LD->getExtensionType() == ISD::NON_EXTLOAD ||
         LD->getExtensionType() == ISD::ZEXTLOAD))
      return true;
  }

  // FIXME: Add other cases...
  //  - 32-bit shifts with a zext to i64
  //  - zext after ctlz, bswap, etc.
  //  - zext after and by a constant mask

  return TargetLowering::isZExtFree(Val, VT2);
}

bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {
  assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
         "invalid fpext types");
  // Extending to float128 is not free.
  if (DestVT == MVT::f128)
    return false;
  return true;
}

// Integer compares can use a 16-bit signed (cmpi) or unsigned (cmpli)
// immediate.
bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  return isInt<16>(Imm) || isUInt<16>(Imm);
}

// Add immediates follow the same 16-bit signed/unsigned immediate forms.
bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  return isInt<16>(Imm) || isUInt<16>(Imm);
}

bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                       unsigned,
                                                       unsigned,
                                                       bool *Fast) const {
  if (DisablePPCUnaligned)
    return false;

  // PowerPC supports unaligned memory access for simple non-vector types.
  // Although accessing unaligned addresses is not as efficient as accessing
  // aligned addresses, it is generally more efficient than manual expansion,
  // and generally only traps for software emulation when crossing page
  // boundaries.

  if (!VT.isSimple())
    return false;

  if (VT.getSimpleVT().isVector()) {
    // Only these VSX vector types support unaligned accesses.
    if (Subtarget.hasVSX()) {
      if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
          VT != MVT::v4f32 && VT != MVT::v4i32)
        return false;
    } else {
      return false;
    }
  }

  if (VT == MVT::ppcf128)
    return false;

  if (Fast)
    *Fast = true;

  return true;
}

bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
    return true;
  case MVT::f128:
    // f128 FMA requires quad-precision support (Power9 vector ops).
    return (EnableQuadPrecision && Subtarget.hasP9Vector());
  default:
    break;
  }

  return false;
}

const MCPhysReg *
PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
  // LR is a callee-save register, but we must treat it as clobbered by any call
  // site. Hence we include LR in the scratch registers, which are in turn added
  // as implicit-defs for stackmaps and patchpoints. The same reasoning applies
  // to CTR, which is used by any indirect call.
  static const MCPhysReg ScratchRegs[] = {
    PPC::X12, PPC::LR8, PPC::CTR8, 0
  };

  return ScratchRegs;
}

unsigned PPCTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  return Subtarget.isPPC64() ? PPC::X3 : PPC::R3;
}

unsigned PPCTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  return Subtarget.isPPC64() ? PPC::X4 : PPC::R4;
}

bool
PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
                     EVT VT , unsigned DefinedValues) const {
  if (VT == MVT::v2i64)
    return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves

  if (Subtarget.hasVSX() || Subtarget.hasQPX())
    return true;

  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
}

Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
  // The ILP preference only applies to the SelectionDAG scheduler; defer to
  // the default when the MachineScheduler is enabled or ILP is disabled.
  if (DisableILPPref || Subtarget.enableMachineScheduler())
    return TargetLowering::getSchedulingPreference(N);

  return Sched::ILP;
}

// Create a fast isel object.
FastISel *
PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
                                  const TargetLibraryInfo *LibInfo) const {
  return PPC::createFastISel(FuncInfo, LibInfo);
}

// Mark this function as using split callee-saved-register handling
// (supported only for the 64-bit non-Darwin configurations).
void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (Subtarget.isDarwinABI()) return;
  if (!Subtarget.isPPC64()) return;

  // Update IsSplitCSR in PPCFunctionInfo
  PPCFunctionInfo *PFI = Entry->getParent()->getInfo<PPCFunctionInfo>();
  PFI->setIsSplitCSR(true);
}

// Insert the copies that save/restore the split callee-saved registers:
// each CSR is copied into a fresh virtual register on entry and copied back
// before the terminator of every exit block.
void PPCTargetLowering::insertCopiesSplitCSR(
  MachineBasicBlock *Entry,
  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (PPC::G8RCRegClass.contains(*I))
      RC = &PPC::G8RCRegClass;
    else if (PPC::F8RCRegClass.contains(*I))
      RC = &PPC::F8RCRegClass;
    else if (PPC::CRRCRegClass.contains(*I))
      RC = &PPC::CRRCRegClass;
    else if (PPC::VRRCRegClass.contains(*I))
      RC = &PPC::VRRCRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
             Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
      .addReg(*I);

    // Insert the copy-back instructions right before the terminator
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
        .addReg(NewVR);
  }
}

// Override to enable LOAD_STACK_GUARD lowering on Linux.
bool PPCTargetLowering::useLoadStackGuardNode() const {
  if (!Subtarget.isTargetLinux())
    return TargetLowering::useLoadStackGuardNode();
  return true;
}

// Override to disable global variable loading on Linux.
void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
  if (!Subtarget.isTargetLinux())
    return TargetLowering::insertSSPDeclarations(M);
}

// Return true if the FP immediate can be materialized cheaply (here: only
// positive zero, with VSX available, for the listed types).
bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  if (!VT.isSimple() || !Subtarget.hasVSX())
    return false;

  switch(VT.getSimpleVT().SimpleTy) {
  default:
    // For FP types that are currently not supported by PPC backend, return
    // false. Examples: f16, f80.
    return false;
  case MVT::f32:
  case MVT::f64:
  case MVT::ppcf128:
    return Imm.isPosZero();
  }
}

// For vector shift operation op, fold
// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y)
static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
                                  SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned OpSizeInBits = VT.getScalarSizeInBits();
  unsigned Opcode = N->getOpcode();
  unsigned TargetOpcode;

  switch (Opcode) {
  default:
    llvm_unreachable("Unexpected shift operation");
  case ISD::SHL:
    TargetOpcode = PPCISD::SHL;
    break;
  case ISD::SRL:
    TargetOpcode = PPCISD::SRL;
    break;
  case ISD::SRA:
    TargetOpcode = PPCISD::SRA;
    break;
  }

  // The PPC target shift nodes take the shift amount modulo the element
  // width, so an explicit (and amt, width-1) mask is redundant.
  if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) &&
      N1->getOpcode() == ISD::AND)
    if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1)))
      if (Mask->getZExtValue() == OpSizeInBits - 1)
        return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0));

  return SDValue();
}

SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
    return Value;

  SDValue N0 = N->getOperand(0);
  ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  // Only applies on ISA 3.0 for (shl (sext i32), const) producing i64.
  if (!Subtarget.isISA3_0() ||
      N0.getOpcode() != ISD::SIGN_EXTEND ||
      N0.getOperand(0).getValueType() != MVT::i32 ||
      CN1 == nullptr || N->getValueType(0) != MVT::i64)
    return SDValue();

  // We can't save an operation here if the value is already extended, and
  // the existing shift is easier to combine.
14270 SDValue ExtsSrc = N0.getOperand(0); 14271 if (ExtsSrc.getOpcode() == ISD::TRUNCATE && 14272 ExtsSrc.getOperand(0).getOpcode() == ISD::AssertSext) 14273 return SDValue(); 14274 14275 SDLoc DL(N0); 14276 SDValue ShiftBy = SDValue(CN1, 0); 14277 // We want the shift amount to be i32 on the extswli, but the shift could 14278 // have an i64. 14279 if (ShiftBy.getValueType() == MVT::i64) 14280 ShiftBy = DCI.DAG.getConstant(CN1->getZExtValue(), DL, MVT::i32); 14281 14282 return DCI.DAG.getNode(PPCISD::EXTSWSLI, DL, MVT::i64, N0->getOperand(0), 14283 ShiftBy); 14284 } 14285 14286 SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const { 14287 if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) 14288 return Value; 14289 14290 return SDValue(); 14291 } 14292 14293 SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const { 14294 if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) 14295 return Value; 14296 14297 return SDValue(); 14298 } 14299 14300 // Transform (add X, (zext(setne Z, C))) -> (addze X, (addic (addi Z, -C), -1)) 14301 // Transform (add X, (zext(sete Z, C))) -> (addze X, (subfic (addi Z, -C), 0)) 14302 // When C is zero, the equation (addi Z, -C) can be simplified to Z 14303 // Requirement: -C in [-32768, 32767], X and Z are MVT::i64 types 14304 static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, 14305 const PPCSubtarget &Subtarget) { 14306 if (!Subtarget.isPPC64()) 14307 return SDValue(); 14308 14309 SDValue LHS = N->getOperand(0); 14310 SDValue RHS = N->getOperand(1); 14311 14312 auto isZextOfCompareWithConstant = [](SDValue Op) { 14313 if (Op.getOpcode() != ISD::ZERO_EXTEND || !Op.hasOneUse() || 14314 Op.getValueType() != MVT::i64) 14315 return false; 14316 14317 SDValue Cmp = Op.getOperand(0); 14318 if (Cmp.getOpcode() != ISD::SETCC || !Cmp.hasOneUse() || 14319 Cmp.getOperand(0).getValueType() != MVT::i64) 14320 return false; 14321 14322 if (auto *Constant = 
dyn_cast<ConstantSDNode>(Cmp.getOperand(1))) { 14323 int64_t NegConstant = 0 - Constant->getSExtValue(); 14324 // Due to the limitations of the addi instruction, 14325 // -C is required to be [-32768, 32767]. 14326 return isInt<16>(NegConstant); 14327 } 14328 14329 return false; 14330 }; 14331 14332 bool LHSHasPattern = isZextOfCompareWithConstant(LHS); 14333 bool RHSHasPattern = isZextOfCompareWithConstant(RHS); 14334 14335 // If there is a pattern, canonicalize a zext operand to the RHS. 14336 if (LHSHasPattern && !RHSHasPattern) 14337 std::swap(LHS, RHS); 14338 else if (!LHSHasPattern && !RHSHasPattern) 14339 return SDValue(); 14340 14341 SDLoc DL(N); 14342 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i64); 14343 SDValue Cmp = RHS.getOperand(0); 14344 SDValue Z = Cmp.getOperand(0); 14345 auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1)); 14346 14347 assert(Constant && "Constant Should not be a null pointer."); 14348 int64_t NegConstant = 0 - Constant->getSExtValue(); 14349 14350 switch(cast<CondCodeSDNode>(Cmp.getOperand(2))->get()) { 14351 default: break; 14352 case ISD::SETNE: { 14353 // when C == 0 14354 // --> addze X, (addic Z, -1).carry 14355 // / 14356 // add X, (zext(setne Z, C))-- 14357 // \ when -32768 <= -C <= 32767 && C != 0 14358 // --> addze X, (addic (addi Z, -C), -1).carry 14359 SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z, 14360 DAG.getConstant(NegConstant, DL, MVT::i64)); 14361 SDValue AddOrZ = NegConstant != 0 ? 
Add : Z; 14362 SDValue Addc = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i64, MVT::Glue), 14363 AddOrZ, DAG.getConstant(-1ULL, DL, MVT::i64)); 14364 return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64), 14365 SDValue(Addc.getNode(), 1)); 14366 } 14367 case ISD::SETEQ: { 14368 // when C == 0 14369 // --> addze X, (subfic Z, 0).carry 14370 // / 14371 // add X, (zext(sete Z, C))-- 14372 // \ when -32768 <= -C <= 32767 && C != 0 14373 // --> addze X, (subfic (addi Z, -C), 0).carry 14374 SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z, 14375 DAG.getConstant(NegConstant, DL, MVT::i64)); 14376 SDValue AddOrZ = NegConstant != 0 ? Add : Z; 14377 SDValue Subc = DAG.getNode(ISD::SUBC, DL, DAG.getVTList(MVT::i64, MVT::Glue), 14378 DAG.getConstant(0, DL, MVT::i64), AddOrZ); 14379 return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64), 14380 SDValue(Subc.getNode(), 1)); 14381 } 14382 } 14383 14384 return SDValue(); 14385 } 14386 14387 SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const { 14388 if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget)) 14389 return Value; 14390 14391 return SDValue(); 14392 } 14393 14394 // Detect TRUNCATE operations on bitcasts of float128 values. 14395 // What we are looking for here is the situtation where we extract a subset 14396 // of bits from a 128 bit float. 14397 // This can be of two forms: 14398 // 1) BITCAST of f128 feeding TRUNCATE 14399 // 2) BITCAST of f128 feeding SRL (a shift) feeding TRUNCATE 14400 // The reason this is required is because we do not have a legal i128 type 14401 // and so we want to prevent having to store the f128 and then reload part 14402 // of it. 14403 SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N, 14404 DAGCombinerInfo &DCI) const { 14405 // If we are using CRBits then try that first. 14406 if (Subtarget.useCRBits()) { 14407 // Check if CRBits did anything and return that if it did. 
14408 if (SDValue CRTruncValue = DAGCombineTruncBoolExt(N, DCI)) 14409 return CRTruncValue; 14410 } 14411 14412 SDLoc dl(N); 14413 SDValue Op0 = N->getOperand(0); 14414 14415 // Looking for a truncate of i128 to i64. 14416 if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64) 14417 return SDValue(); 14418 14419 int EltToExtract = DCI.DAG.getDataLayout().isBigEndian() ? 1 : 0; 14420 14421 // SRL feeding TRUNCATE. 14422 if (Op0.getOpcode() == ISD::SRL) { 14423 ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(Op0.getOperand(1)); 14424 // The right shift has to be by 64 bits. 14425 if (!ConstNode || ConstNode->getZExtValue() != 64) 14426 return SDValue(); 14427 14428 // Switch the element number to extract. 14429 EltToExtract = EltToExtract ? 0 : 1; 14430 // Update Op0 past the SRL. 14431 Op0 = Op0.getOperand(0); 14432 } 14433 14434 // BITCAST feeding a TRUNCATE possibly via SRL. 14435 if (Op0.getOpcode() == ISD::BITCAST && 14436 Op0.getValueType() == MVT::i128 && 14437 Op0.getOperand(0).getValueType() == MVT::f128) { 14438 SDValue Bitcast = DCI.DAG.getBitcast(MVT::v2i64, Op0.getOperand(0)); 14439 return DCI.DAG.getNode( 14440 ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Bitcast, 14441 DCI.DAG.getTargetConstant(EltToExtract, dl, MVT::i32)); 14442 } 14443 return SDValue(); 14444 } 14445 14446 bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 14447 // Only duplicate to increase tail-calls for the 64bit SysV ABIs. 14448 if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64()) 14449 return false; 14450 14451 // If not a tail call then no need to proceed. 14452 if (!CI->isTailCall()) 14453 return false; 14454 14455 // If tail calls are disabled for the caller then we are done. 
14456 const Function *Caller = CI->getParent()->getParent(); 14457 auto Attr = Caller->getFnAttribute("disable-tail-calls"); 14458 if (Attr.getValueAsString() == "true") 14459 return false; 14460 14461 // If sibling calls have been disabled and tail-calls aren't guaranteed 14462 // there is no reason to duplicate. 14463 auto &TM = getTargetMachine(); 14464 if (!TM.Options.GuaranteedTailCallOpt && DisableSCO) 14465 return false; 14466 14467 // Can't tail call a function called indirectly, or if it has variadic args. 14468 const Function *Callee = CI->getCalledFunction(); 14469 if (!Callee || Callee->isVarArg()) 14470 return false; 14471 14472 // Make sure the callee and caller calling conventions are eligible for tco. 14473 if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(), 14474 CI->getCallingConv())) 14475 return false; 14476 14477 // If the function is local then we have a good chance at tail-calling it 14478 return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee); 14479 } 14480 14481 bool PPCTargetLowering::hasBitPreservingFPLogic(EVT VT) const { 14482 if (!Subtarget.hasVSX()) 14483 return false; 14484 if (Subtarget.hasP9Vector() && VT == MVT::f128) 14485 return true; 14486 return VT == MVT::f32 || VT == MVT::f64 || 14487 VT == MVT::v4f32 || VT == MVT::v2f64; 14488 } 14489 14490 bool PPCTargetLowering:: 14491 isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const { 14492 const Value *Mask = AndI.getOperand(1); 14493 // If the mask is suitable for andi. or andis. we should sink the and. 14494 if (const ConstantInt *CI = dyn_cast<ConstantInt>(Mask)) { 14495 // Can't handle constants wider than 64-bits. 14496 if (CI->getBitWidth() > 64) 14497 return false; 14498 int64_t ConstVal = CI->getZExtValue(); 14499 return isUInt<16>(ConstVal) || 14500 (isUInt<16>(ConstVal >> 16) && !(ConstVal & 0xFFFF)); 14501 } 14502 14503 // For non-constant masks, we can always use the record-form and. 14504 return true; 14505 } 14506