//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the PPCISelLowering class.
//
//===----------------------------------------------------------------------===//

#include "PPCISelLowering.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
#include "PPCCCState.h"
#include "PPCCallingConv.h"
#include "PPCFrameLowering.h"
#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
#include "PPCRegisterInfo.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <list>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "ppc-lowering"

static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);

static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);

static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);

static cl::opt<bool> DisableSCO("disable-ppc-sco",
cl::desc("disable sibling call optimization on ppc"), cl::Hidden);

static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision",
cl::desc("enable quad precision float support on ppc"), cl::Hidden);

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");

static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);

// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;

PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
                                     const PPCSubtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  // Use _setjmp/_longjmp instead of setjmp/longjmp.
  setUseUnderscoreSetJmp(true);
  setUseUnderscoreLongJmp(true);

  // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
  // arguments are at least 4/8 bytes aligned.
  bool isPPC64 = Subtarget.isPPC64();
  setMinStackArgumentAlignment(isPPC64 ? 8 : 4);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
  if (!useSoftFloat()) {
    if (hasSPE()) {
      addRegisterClass(MVT::f32, &PPC::SPE4RCRegClass);
      addRegisterClass(MVT::f64, &PPC::SPERCRegClass);
    } else {
      addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
      addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
    }
  }

  // Match BITREVERSE to customized fast code sequence in the td file.
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // Sub-word ATOMIC_CMP_SWAP needs to ensure that the input is zero-extended.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);

  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
  }

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PowerPC has pre-inc loads and stores.
  setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
  if (!Subtarget.hasSPE()) {
    setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
    setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);
  }

  // PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry.
  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::ADDC, VT, Legal);
    setOperationAction(ISD::ADDE, VT, Legal);
    setOperationAction(ISD::SUBC, VT, Legal);
    setOperationAction(ISD::SUBE, VT, Legal);
  }

  if (Subtarget.useCRBits()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

    if (isPPC64 || Subtarget.hasFPCVT()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType(ISD::SINT_TO_FP, MVT::i1,
                        isPPC64 ? MVT::i64 : MVT::i32);
      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType(ISD::UINT_TO_FP, MVT::i1,
                        isPPC64 ? MVT::i64 : MVT::i32);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
    }

    // PowerPC does not support direct load/store of condition registers.
    setOperationAction(ISD::LOAD, MVT::i1, Custom);
    setOperationAction(ISD::STORE, MVT::i1, Custom);

    // FIXME: Remove this once the ANDI glue bug is fixed:
    if (ANDIGlueBug)
      setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);

    for (MVT VT : MVT::integer_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
      setTruncStoreAction(VT, MVT::i1, Expand);
    }

    addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
  }

  // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
  // PPC (the libcall is not available).
  setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom);

  // We do not currently implement these libm ops for PowerPC.
  setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
  setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand);
  setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
  setOperationAction(ISD::FRINT, MVT::ppcf128, Expand);
  setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
  setOperationAction(ISD::FREM, MVT::ppcf128, Expand);

  // PowerPC has no SREM/UREM instructions unless we are on P9.
  // On P9 we may use a hardware instruction to compute the remainder.
  // The instructions are not legalized directly because in the cases where the
  // result of both the remainder and the division is required it is more
  // efficient to compute the remainder from the result of the division rather
  // than use the remainder instruction.
  if (Subtarget.isISA3_0()) {
    setOperationAction(ISD::SREM, MVT::i32, Custom);
    setOperationAction(ISD::UREM, MVT::i32, Custom);
    setOperationAction(ISD::SREM, MVT::i64, Custom);
    setOperationAction(ISD::UREM, MVT::i64, Custom);
  } else {
    setOperationAction(ISD::SREM, MVT::i32, Expand);
    setOperationAction(ISD::UREM, MVT::i32, Expand);
    setOperationAction(ISD::SREM, MVT::i64, Expand);
    setOperationAction(ISD::UREM, MVT::i64, Expand);
  }

  // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);

  // We don't support sin/cos/sqrt/fmod/pow
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  if (Subtarget.hasSPE()) {
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f32, Expand);
  } else {
    setOperationAction(ISD::FMA, MVT::f64, Legal);
    setOperationAction(ISD::FMA, MVT::f32, Legal);
  }

  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // If we're enabling GP optimizations, use hardware square root
  if (!Subtarget.hasFSQRT() &&
      !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&
        Subtarget.hasFRE()))
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);

  if (!Subtarget.hasFSQRT() &&
      !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&
        Subtarget.hasFRES()))
    setOperationAction(ISD::FSQRT, MVT::f32, Expand);

  if (Subtarget.hasFCPSGN()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
  } else {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  }

  if (Subtarget.hasFPRND()) {
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FROUND, MVT::f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FROUND, MVT::f32, Legal);
  }

  // PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd
  // to speed up scalar BSWAP64.
  // CTPOP or CTTZ were introduced in P8/P9 respectively
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  if (Subtarget.hasP9Vector())
    setOperationAction(ISD::BSWAP, MVT::i64, Custom);
  else
    setOperationAction(ISD::BSWAP, MVT::i64, Expand);
  if (Subtarget.isISA3_0()) {
    setOperationAction(ISD::CTTZ, MVT::i32, Legal);
    setOperationAction(ISD::CTTZ, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i32, Expand);
    setOperationAction(ISD::CTTZ, MVT::i64, Expand);
  }

  if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  // PowerPC does not have ROTR
  setOperationAction(ISD::ROTR, MVT::i32, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  if (!Subtarget.useCRBits()) {
    // PowerPC does not have Select
    setOperationAction(ISD::SELECT, MVT::i32, Expand);
    setOperationAction(ISD::SELECT, MVT::i64, Expand);
    setOperationAction(ISD::SELECT, MVT::f32, Expand);
    setOperationAction(ISD::SELECT, MVT::f64, Expand);
  }

  // PowerPC wants to turn select_cc of FP into fsel when possible.
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);

  // PowerPC wants to optimize integer setcc a bit
  if (!Subtarget.useCRBits())
    setOperationAction(ISD::SETCC, MVT::i32, Custom);

  // PowerPC does not have BRCOND which requires SetCC
  if (!Subtarget.useCRBits())
    setOperationAction(ISD::BRCOND, MVT::Other, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);

  if (Subtarget.hasSPE()) {
    // SPE has built-in conversions
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
  } else {
    // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);

    // PowerPC does not have [U|S]INT_TO_FP
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
  }

  if (Subtarget.hasDirectMove() && isPPC64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Legal);
    setOperationAction(ISD::BITCAST, MVT::i32, Legal);
    setOperationAction(ISD::BITCAST, MVT::i64, Legal);
    setOperationAction(ISD::BITCAST, MVT::f64, Legal);
  } else {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    setOperationAction(ISD::BITCAST, MVT::f64, Expand);
  }

  // We cannot sextinreg(i1). Expand to shifts.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuation, user-level threading, and etc.. As a result, no
  // other SjLj exception interfaces are implemented and please don't build
  // your own exception handling based on them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

  // We want to legalize GlobalAddress and ConstantPool nodes into the
  // appropriate instructions to materialize the address.
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  // TRAP is legal.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // TRAMPOLINE is custom lowered.
  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);

  if (Subtarget.isSVR4ABI()) {
    if (isPPC64) {
      // VAARG always uses double-word chunks, so promote anything smaller.
      setOperationAction(ISD::VAARG, MVT::i1, Promote);
      AddPromotedToType(ISD::VAARG, MVT::i1, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::i8, Promote);
      AddPromotedToType(ISD::VAARG, MVT::i8, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::i16, Promote);
      AddPromotedToType(ISD::VAARG, MVT::i16, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::i32, Promote);
      AddPromotedToType(ISD::VAARG, MVT::i32, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::Other, Expand);
    } else {
      // VAARG is custom lowered with the 32-bit SVR4 ABI.
      setOperationAction(ISD::VAARG, MVT::Other, Custom);
      setOperationAction(ISD::VAARG, MVT::i64, Custom);
    }
  } else
    setOperationAction(ISD::VAARG, MVT::Other, Expand);

  if (Subtarget.isSVR4ABI() && !isPPC64)
    // VACOPY is custom lowered with the 32-bit SVR4 ABI.
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  else
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);

  // Use the default implementation.
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);
  setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
  setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // To handle counter-based loop conditions.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);

  // Comparisons that require checking two conditions.
  if (Subtarget.hasSPE()) {
    setCondCodeAction(ISD::SETO, MVT::f32, Expand);
    setCondCodeAction(ISD::SETO, MVT::f64, Expand);
    setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
    setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
  }
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);

  if (Subtarget.has64BitSupport()) {
    // They also have instructions for converting between i64 and fp.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
    // This is just the low 32 bits of a (signed) fp->i64 conversion.
    // We cannot do this with Promote because i64 is not a legal type.
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    if (Subtarget.hasLFIWAX() || Subtarget.isPPC64())
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  } else {
    // PowerPC does not have FP_TO_UINT on 32-bit implementations.
    if (Subtarget.hasSPE())
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
    else
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
  }

  // With the instructions enabled under FPCVT, we can do everything.
  if (Subtarget.hasFPCVT()) {
    if (Subtarget.has64BitSupport()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  if (Subtarget.use64BitRegs()) {
    // 64-bit PowerPC implementations can support i64 types directly
    addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
    // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
    setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
    // 64-bit PowerPC wants to expand i128 shifts itself.
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  } else {
    // 32-bit PowerPC wants to expand i64 shifts itself.
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  }

  if (Subtarget.hasAltivec()) {
    // First set operation action for all vector types to expand. Then we
    // will selectively turn on ones that can be effectively codegen'd.
    for (MVT VT : MVT::vector_valuetypes()) {
      // add/sub are legal for all supported vector VT's.
      setOperationAction(ISD::ADD, VT, Legal);
      setOperationAction(ISD::SUB, VT, Legal);
      setOperationAction(ISD::ABS, VT, Custom);

      // Vector instructions introduced in P8
      if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
        setOperationAction(ISD::CTPOP, VT, Legal);
        setOperationAction(ISD::CTLZ, VT, Legal);
      } else {
        setOperationAction(ISD::CTPOP, VT, Expand);
        setOperationAction(ISD::CTLZ, VT, Expand);
      }

      // Vector instructions introduced in P9
      if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))
        setOperationAction(ISD::CTTZ, VT, Legal);
      else
        setOperationAction(ISD::CTTZ, VT, Expand);

      // We promote all shuffles to v16i8.
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
      AddPromotedToType(ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);

      // We promote all non-typed operations to v4i32.
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v4i32);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v4i32);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v4i32);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v4i32);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v4i32);
      setOperationAction(ISD::VSELECT, VT, Legal);
      setOperationAction(ISD::SELECT_CC, VT, Promote);
      AddPromotedToType(ISD::SELECT_CC, VT, MVT::v4i32);
      setOperationAction(ISD::STORE, VT, Promote);
      AddPromotedToType(ISD::STORE, VT, MVT::v4i32);

      // No other operations are legal.
      setOperationAction(ISD::MUL, VT, Expand);
      setOperationAction(ISD::SDIV, VT, Expand);
      setOperationAction(ISD::SREM, VT, Expand);
      setOperationAction(ISD::UDIV, VT, Expand);
      setOperationAction(ISD::UREM, VT, Expand);
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FNEG, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FABS, VT, Expand);
      setOperationAction(ISD::FFLOOR, VT, Expand);
      setOperationAction(ISD::FCEIL, VT, Expand);
      setOperationAction(ISD::FTRUNC, VT, Expand);
      setOperationAction(ISD::FRINT, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
      setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::UDIVREM, VT, Expand);
      setOperationAction(ISD::SDIVREM, VT, Expand);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::BSWAP, VT, Expand);
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
      setOperationAction(ISD::ROTL, VT, Expand);
      setOperationAction(ISD::ROTR, VT, Expand);

      for (MVT InnerVT : MVT::vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
    // with merges, splats, etc.
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);

    setOperationAction(ISD::AND, MVT::v4i32, Legal);
    setOperationAction(ISD::OR, MVT::v4i32, Legal);
    setOperationAction(ISD::XOR, MVT::v4i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i32, Legal);
    setOperationAction(ISD::SELECT, MVT::v4i32,
                       Subtarget.useCRBits() ? Legal : Expand);
    setOperationAction(ISD::STORE, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);

    // Without hasP8Altivec set, v2i64 SMAX isn't available.
    // But ABS custom lowering requires SMAX support.
    if (!Subtarget.hasP8Altivec())
      setOperationAction(ISD::ABS, MVT::v2i64, Expand);

    addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);

    setOperationAction(ISD::MUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FMA, MVT::v4f32, Legal);

    if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) {
      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    }

    if (Subtarget.hasP8Altivec())
      setOperationAction(ISD::MUL, MVT::v4i32, Legal);
    else
      setOperationAction(ISD::MUL, MVT::v4i32, Custom);

    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v16i8, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);

    // Altivec does not contain unordered floating-point compare instructions
    setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);

    if (Subtarget.hasVSX()) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
      if (Subtarget.hasP8Vector()) {
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
      }
      if (Subtarget.hasDirectMove() && isPPC64) {
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
      }
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);

      setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
      setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
      setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
      setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
      setOperationAction(ISD::FROUND, MVT::v2f64, Legal);

      setOperationAction(ISD::FROUND, MVT::v4f32, Legal);

      setOperationAction(ISD::MUL, MVT::v2f64, Legal);
      setOperationAction(ISD::FMA, MVT::v2f64, Legal);

      setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
      setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);

      // Share the Altivec comparison restrictions.
      setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
      setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
      setCondCodeAction(ISD::SETO, MVT::v2f64, Expand);
      setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);

      setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
      setOperationAction(ISD::STORE, MVT::v2f64, Legal);

      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal);

      if (Subtarget.hasP8Vector())
        addRegisterClass(MVT::f32, &PPC::VSSRCRegClass);

      addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);

      addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass);
      addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
      addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);

      if (Subtarget.hasP8Altivec()) {
        setOperationAction(ISD::SHL, MVT::v2i64, Legal);
        setOperationAction(ISD::SRA, MVT::v2i64, Legal);
        setOperationAction(ISD::SRL, MVT::v2i64, Legal);

        // 128 bit shifts can be accomplished via 3 instructions for SHL and
        // SRL, but not for SRA because of the instructions available:
        // VS{RL} and VS{RL}O. However due to direct move costs, it's not worth
        // doing
        setOperationAction(ISD::SHL, MVT::v1i128, Expand);
        setOperationAction(ISD::SRL, MVT::v1i128, Expand);
        setOperationAction(ISD::SRA, MVT::v1i128, Expand);

        setOperationAction(ISD::SETCC, MVT::v2i64, Legal);
      } else {
        setOperationAction(ISD::SHL, MVT::v2i64, Expand);
        setOperationAction(ISD::SRA, MVT::v2i64, Expand);
        setOperationAction(ISD::SRL, MVT::v2i64, Expand);

        setOperationAction(ISD::SETCC, MVT::v2i64, Custom);

        // VSX v2i64 only supports non-arithmetic operations.
        setOperationAction(ISD::ADD, MVT::v2i64, Expand);
        setOperationAction(ISD::SUB, MVT::v2i64, Expand);
      }

      setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
      AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v2f64);
      setOperationAction(ISD::STORE, MVT::v2i64, Promote);
      AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v2f64);

      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);

      setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);

      // Custom handling for partial vectors of integers converted to
      // floating point. We already have optimal handling for v2i32 through
      // the DAG combine, so those aren't necessary.
      setOperationAction(ISD::UINT_TO_FP, MVT::v2i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v2i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);

      setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
      setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
      setOperationAction(ISD::FABS, MVT::v4f32, Legal);
      setOperationAction(ISD::FABS, MVT::v2f64, Legal);

      if (Subtarget.hasDirectMove())
        setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);

      addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
    }

    if (Subtarget.hasP8Altivec()) {
      addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);
      addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass);
    }

    if (Subtarget.hasP9Vector()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

      // 128 bit shifts can be accomplished via 3 instructions for SHL and
      // SRL, but not for SRA because of the instructions available:
      // VS{RL} and VS{RL}O.
      setOperationAction(ISD::SHL, MVT::v1i128, Legal);
      setOperationAction(ISD::SRL, MVT::v1i128, Legal);
      setOperationAction(ISD::SRA, MVT::v1i128, Expand);

      if (EnableQuadPrecision) {
        addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
        setOperationAction(ISD::FADD, MVT::f128, Legal);
        setOperationAction(ISD::FSUB, MVT::f128, Legal);
        setOperationAction(ISD::FDIV, MVT::f128, Legal);
        setOperationAction(ISD::FMUL, MVT::f128, Legal);
        setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
        // No extending loads to f128 on PPC.
        for (MVT FPT : MVT::fp_valuetypes())
          setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
        setOperationAction(ISD::FMA, MVT::f128, Legal);
        setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
        setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
        setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);
        setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);
        setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);
        setCondCodeAction(ISD::SETONE, MVT::f128, Expand);

        setOperationAction(ISD::FTRUNC, MVT::f128, Legal);
        setOperationAction(ISD::FRINT, MVT::f128, Legal);
        setOperationAction(ISD::FFLOOR, MVT::f128, Legal);
        setOperationAction(ISD::FCEIL, MVT::f128, Legal);
        setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
        setOperationAction(ISD::FROUND, MVT::f128, Legal);

        setOperationAction(ISD::SELECT, MVT::f128, Expand);
        setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
        setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
        setTruncStoreAction(MVT::f128, MVT::f64, Expand);
        setTruncStoreAction(MVT::f128, MVT::f32, Expand);
        setOperationAction(ISD::BITCAST, MVT::i128, Custom);
        // No implementation for these ops for PowerPC.
        setOperationAction(ISD::FSIN, MVT::f128, Expand);
        setOperationAction(ISD::FCOS, MVT::f128, Expand);
        setOperationAction(ISD::FPOW, MVT::f128, Expand);
        setOperationAction(ISD::FPOWI, MVT::f128, Expand);
        setOperationAction(ISD::FREM, MVT::f128, Expand);
      }

    }

    if (Subtarget.hasP9Altivec()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    }
  }

  if (Subtarget.hasQPX()) {
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FREM, MVT::v4f64, Expand);

    setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal);
    setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand);

    setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
    setOperationAction(ISD::STORE, MVT::v4f64, Custom);

    setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
    setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom);

    if (!Subtarget.useCRBits())
      setOperationAction(ISD::SELECT, MVT::v4f64, Expand);
    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Legal);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Expand);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4f64, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4f64, Expand);

    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
    setOperationAction(ISD::FP_ROUND_INREG, MVT::v4f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);

    setOperationAction(ISD::FNEG, MVT::v4f64, Legal);
    setOperationAction(ISD::FABS, MVT::v4f64, Legal);
    setOperationAction(ISD::FSIN, MVT::v4f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f64, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f64, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f64, Expand);

    setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal);

    setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal);

    addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FREM, MVT::v4f32, Expand);

    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
    setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand);

    setOperationAction(ISD::LOAD, MVT::v4f32, Custom);
    setOperationAction(ISD::STORE, MVT::v4f32, Custom);

    if (!Subtarget.useCRBits())
      setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Expand);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4f32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4f32, Expand);

    setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
    setOperationAction(ISD::FABS, MVT::v4f32, Legal);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);

    setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);

    setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal);

    addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass);

    setOperationAction(ISD::AND, MVT::v4i1, Legal);
    setOperationAction(ISD::OR, MVT::v4i1, Legal);
    setOperationAction(ISD::XOR, MVT::v4i1, Legal);

    if (!Subtarget.useCRBits())
      setOperationAction(ISD::SELECT, MVT::v4i1, Expand);
    setOperationAction(ISD::VSELECT, MVT::v4i1, Legal);

    setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
    setOperationAction(ISD::STORE, MVT::v4i1, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i1, Expand);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i1, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);

    addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass);

    setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
    setOperationAction(ISD::FROUND, MVT::v4f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);

    // These need to set FE_INEXACT, and so cannot be vectorized here.
    setOperationAction(ISD::FRINT, MVT::v4f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);

    if (TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
      setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);

      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    } else {
      setOperationAction(ISD::FDIV, MVT::v4f64, Expand);
      setOperationAction(ISD::FSQRT, MVT::v4f64, Expand);

      setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
      setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    }
  }

  if (Subtarget.has64BitSupport())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);

  if (!isPPC64) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
    setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
  }

  setBooleanContents(ZeroOrOneBooleanContent);

  if (Subtarget.hasAltivec()) {
    // Altivec instructions set fields to all zeros or all ones.
    setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  }

  if (!isPPC64) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, nullptr);
    setLibcallName(RTLIB::SRL_I128, nullptr);
    setLibcallName(RTLIB::SRA_I128, nullptr);
  }

  setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  if (Subtarget.hasFPCVT())
    setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::BR_CC);
  if (Subtarget.useCRBits())
    setTargetDAGCombine(ISD::BRCOND);
  setTargetDAGCombine(ISD::BSWAP);
  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
  setTargetDAGCombine(ISD::INTRINSIC_VOID);

  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);

  setTargetDAGCombine(ISD::TRUNCATE);

  if (Subtarget.useCRBits()) {
    setTargetDAGCombine(ISD::TRUNCATE);
    setTargetDAGCombine(ISD::SETCC);
    setTargetDAGCombine(ISD::SELECT_CC);
  }

  // Use reciprocal estimates.
  if (TM.Options.UnsafeFPMath) {
    setTargetDAGCombine(ISD::FDIV);
    setTargetDAGCombine(ISD::FSQRT);
  }

  if (Subtarget.hasP9Altivec()) {
    setTargetDAGCombine(ISD::ABS);
    setTargetDAGCombine(ISD::VSELECT);
  }

  // Darwin long double math library functions have $LDBL128 appended.
  if (Subtarget.isDarwin()) {
    setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
    setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
    setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
    setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
    setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
    setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
    setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
    setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
    setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
    setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
  }

  if (EnableQuadPrecision) {
    setLibcallName(RTLIB::LOG_F128, "logf128");
    setLibcallName(RTLIB::LOG2_F128, "log2f128");
    setLibcallName(RTLIB::LOG10_F128, "log10f128");
    setLibcallName(RTLIB::EXP_F128, "expf128");
    setLibcallName(RTLIB::EXP2_F128, "exp2f128");
    setLibcallName(RTLIB::SIN_F128, "sinf128");
    setLibcallName(RTLIB::COS_F128, "cosf128");
    setLibcallName(RTLIB::POW_F128, "powf128");
    setLibcallName(RTLIB::FMIN_F128, "fminf128");
    setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
    setLibcallName(RTLIB::POWI_F128, "__powikf2");
    setLibcallName(RTLIB::REM_F128, "fmodf128");
  }

  // With 32 condition bits, we don't need to sink (and duplicate) compares
  // aggressively in CodeGenPrep.
  if (Subtarget.useCRBits()) {
    setHasMultipleConditionRegisters();
    setJumpIsExpensive();
  }

  setMinFunctionAlignment(2);
  if (Subtarget.isDarwin())
    setPrefFunctionAlignment(4);

  switch (Subtarget.getDarwinDirective()) {
  default: break;
  case PPC::DIR_970:
  case PPC::DIR_A2:
  case PPC::DIR_E500:
  case PPC::DIR_E500mc:
  case PPC::DIR_E5500:
  case PPC::DIR_PWR4:
  case PPC::DIR_PWR5:
  case PPC::DIR_PWR5X:
  case PPC::DIR_PWR6:
  case PPC::DIR_PWR6X:
  case PPC::DIR_PWR7:
  case PPC::DIR_PWR8:
  case PPC::DIR_PWR9:
    setPrefFunctionAlignment(4);
    setPrefLoopAlignment(4);
    break;
  }

  if (Subtarget.enableMachineScheduler())
    setSchedulingPreference(Sched::Source);
  else
    setSchedulingPreference(Sched::Hybrid);

  computeRegisterProperties(STI.getRegisterInfo());

  // The Freescale cores do better with aggressive inlining of memcpy and
  // friends. GCC uses the same threshold of 128 bytes (= 32 word stores).
  if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc ||
      Subtarget.getDarwinDirective() == PPC::DIR_E5500) {
    MaxStoresPerMemset = 32;
    MaxStoresPerMemsetOptSize = 16;
    MaxStoresPerMemcpy = 32;
    MaxStoresPerMemcpyOptSize = 8;
    MaxStoresPerMemmove = 32;
    MaxStoresPerMemmoveOptSize = 8;
  } else if (Subtarget.getDarwinDirective() == PPC::DIR_A2) {
    // The A2 also benefits from (very) aggressive inlining of memcpy and
    // friends. The overhead of the function call, even when warm, can be
    // over one hundred cycles.
    MaxStoresPerMemset = 128;
    MaxStoresPerMemcpy = 128;
    MaxStoresPerMemmove = 128;
    MaxLoadsPerMemcmp = 128;
  } else {
    MaxLoadsPerMemcmp = 8;
    MaxLoadsPerMemcmpOptSize = 4;
  }
}

/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
                             unsigned MaxMaxAlign) {
  if (MaxAlign == MaxMaxAlign)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
      MaxAlign = 32;
    else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      unsigned EltAlign = 0;
      getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == MaxMaxAlign)
        break;
    }
  }
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.
unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  // Darwin passes everything on 4 byte boundary.
  if (Subtarget.isDarwin())
    return 4;

  // 16-byte and wider vectors are passed on a 16-byte boundary.
  // The rest is 8 on PPC64 and 4 on PPC32 boundary.
  unsigned Align = Subtarget.isPPC64() ? 8 : 4;
  if (Subtarget.hasAltivec() || Subtarget.hasQPX())
    getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16);
  return Align;
}

unsigned PPCTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                          CallingConv::ID CC,
                                                          EVT VT) const {
  if (Subtarget.hasSPE() && VT == MVT::f64)
    return 2;
  return PPCTargetLowering::getNumRegisters(Context, VT);
}

MVT PPCTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                     CallingConv::ID CC,
                                                     EVT VT) const {
  if (Subtarget.hasSPE() && VT == MVT::f64)
    return MVT::i32;
  return PPCTargetLowering::getRegisterType(Context, VT);
}

bool PPCTargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}

bool PPCTargetLowering::hasSPE() const {
  return Subtarget.hasSPE();
}

const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((PPCISD::NodeType)Opcode) {
  case PPCISD::FIRST_NUMBER:    break;
  case PPCISD::FSEL:            return "PPCISD::FSEL";
  case PPCISD::FCFID:           return "PPCISD::FCFID";
  case PPCISD::FCFIDU:          return "PPCISD::FCFIDU";
  case PPCISD::FCFIDS:          return "PPCISD::FCFIDS";
  case PPCISD::FCFIDUS:         return "PPCISD::FCFIDUS";
  case PPCISD::FCTIDZ:          return "PPCISD::FCTIDZ";
  case PPCISD::FCTIWZ:          return "PPCISD::FCTIWZ";
  case PPCISD::FCTIDUZ:         return "PPCISD::FCTIDUZ";
  case PPCISD::FCTIWUZ:         return "PPCISD::FCTIWUZ";
  case PPCISD::FP_TO_UINT_IN_VSR:
                                return "PPCISD::FP_TO_UINT_IN_VSR";
  case PPCISD::FP_TO_SINT_IN_VSR:
                                return "PPCISD::FP_TO_SINT_IN_VSR";
  case PPCISD::FRE:             return "PPCISD::FRE";
  case PPCISD::FRSQRTE:         return "PPCISD::FRSQRTE";
  case PPCISD::STFIWX:          return "PPCISD::STFIWX";
  case PPCISD::VMADDFP:         return "PPCISD::VMADDFP";
  case PPCISD::VNMSUBFP:        return "PPCISD::VNMSUBFP";
  case PPCISD::VPERM:           return "PPCISD::VPERM";
  case PPCISD::XXSPLT:          return "PPCISD::XXSPLT";
  case PPCISD::VECINSERT:       return "PPCISD::VECINSERT";
  case PPCISD::XXREVERSE:       return "PPCISD::XXREVERSE";
"PPCISD::XXPERMDI"; 1284 case PPCISD::VECSHL: return "PPCISD::VECSHL"; 1285 case PPCISD::CMPB: return "PPCISD::CMPB"; 1286 case PPCISD::Hi: return "PPCISD::Hi"; 1287 case PPCISD::Lo: return "PPCISD::Lo"; 1288 case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; 1289 case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8"; 1290 case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16"; 1291 case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; 1292 case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; 1293 case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; 1294 case PPCISD::SRL: return "PPCISD::SRL"; 1295 case PPCISD::SRA: return "PPCISD::SRA"; 1296 case PPCISD::SHL: return "PPCISD::SHL"; 1297 case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE"; 1298 case PPCISD::CALL: return "PPCISD::CALL"; 1299 case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP"; 1300 case PPCISD::MTCTR: return "PPCISD::MTCTR"; 1301 case PPCISD::BCTRL: return "PPCISD::BCTRL"; 1302 case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC"; 1303 case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG"; 1304 case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE"; 1305 case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP"; 1306 case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP"; 1307 case PPCISD::MFOCRF: return "PPCISD::MFOCRF"; 1308 case PPCISD::MFVSR: return "PPCISD::MFVSR"; 1309 case PPCISD::MTVSRA: return "PPCISD::MTVSRA"; 1310 case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ"; 1311 case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP"; 1312 case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP"; 1313 case PPCISD::ANDIo_1_EQ_BIT: return "PPCISD::ANDIo_1_EQ_BIT"; 1314 case PPCISD::ANDIo_1_GT_BIT: return "PPCISD::ANDIo_1_GT_BIT"; 1315 case PPCISD::VCMP: return "PPCISD::VCMP"; 1316 case PPCISD::VCMPo: return "PPCISD::VCMPo"; 1317 case PPCISD::LBRX: return "PPCISD::LBRX"; 1318 case PPCISD::STBRX: return "PPCISD::STBRX"; 1319 case PPCISD::LFIWAX: return "PPCISD::LFIWAX"; 1320 case PPCISD::LFIWZX: return "PPCISD::LFIWZX"; 1321 case PPCISD::LXSIZX: return "PPCISD::LXSIZX"; 1322 case PPCISD::STXSIX: return "PPCISD::STXSIX"; 1323 case PPCISD::VEXTS: return "PPCISD::VEXTS"; 1324 case PPCISD::SExtVElems: return "PPCISD::SExtVElems"; 1325 case PPCISD::LXVD2X: return "PPCISD::LXVD2X"; 1326 case PPCISD::STXVD2X: return "PPCISD::STXVD2X"; 1327 case PPCISD::ST_VSR_SCAL_INT: 1328 return "PPCISD::ST_VSR_SCAL_INT"; 1329 case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; 1330 case PPCISD::BDNZ: return "PPCISD::BDNZ"; 1331 case PPCISD::BDZ: return "PPCISD::BDZ"; 1332 case PPCISD::MFFS: return "PPCISD::MFFS"; 1333 case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ"; 1334 case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN"; 1335 case PPCISD::CR6SET: return "PPCISD::CR6SET"; 1336 case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET"; 1337 case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT"; 1338 case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT"; 1339 case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA"; 1340 case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L"; 1341 case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS"; 1342 case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA"; 1343 case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L"; 1344 case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR"; 1345 case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR"; 1346 case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA"; 1347 case 
PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L"; 1348 case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR"; 1349 case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR"; 1350 case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA"; 1351 case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; 1352 case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT"; 1353 case PPCISD::SC: return "PPCISD::SC"; 1354 case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB"; 1355 case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE"; 1356 case PPCISD::RFEBB: return "PPCISD::RFEBB"; 1357 case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; 1358 case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; 1359 case PPCISD::VABSD: return "PPCISD::VABSD"; 1360 case PPCISD::QVFPERM: return "PPCISD::QVFPERM"; 1361 case PPCISD::QVGPCI: return "PPCISD::QVGPCI"; 1362 case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI"; 1363 case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI"; 1364 case PPCISD::QBFLT: return "PPCISD::QBFLT"; 1365 case PPCISD::QVLFSb: return "PPCISD::QVLFSb"; 1366 case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; 1367 case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI"; 1368 } 1369 return nullptr; 1370 } 1371 1372 EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, 1373 EVT VT) const { 1374 if (!VT.isVector()) 1375 return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; 1376 1377 if (Subtarget.hasQPX()) 1378 return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements()); 1379 1380 return VT.changeVectorElementTypeToInteger(); 1381 } 1382 1383 bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const { 1384 assert(VT.isFloatingPoint() && "Non-floating-point FMA?"); 1385 return true; 1386 } 1387 1388 //===----------------------------------------------------------------------===// 1389 // Node matching predicates, for use by the tblgen matching code. 1390 //===----------------------------------------------------------------------===// 1391 1392 /// isFloatingPointZero - Return true if this is 0.0 or -0.0. 1393 static bool isFloatingPointZero(SDValue Op) { 1394 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) 1395 return CFP->getValueAPF().isZero(); 1396 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { 1397 // Maybe this has already been legalized into the constant pool? 1398 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1))) 1399 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) 1400 return CFP->getValueAPF().isZero(); 1401 } 1402 return false; 1403 } 1404 1405 /// isConstantOrUndef - Op is either an undef node or a ConstantSDNode. Return 1406 /// true if Op is undef or if it matches the specified value. 1407 static bool isConstantOrUndef(int Op, int Val) { 1408 return Op < 0 || Op == Val; 1409 } 1410 1411 /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a 1412 /// VPKUHUM instruction. 1413 /// The ShuffleKind distinguishes between big-endian operations with 1414 /// two different inputs (0), either-endian operations with two identical 1415 /// inputs (1), and little-endian operations with two different inputs (2). 1416 /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). 
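// For illustration only, the masks accepted by the checks below (spelled out
// here from the per-element tests rather than any external reference) are:
//   ShuffleKind 0 (BE, two inputs):  <1,3,5,...,31>, i.e. the low-order byte
//                                    of every halfword of both inputs.
//   ShuffleKind 2 (LE, swapped):     <0,2,4,...,30>.
//   ShuffleKind 1 (unary):           the first eight and last eight mask
//                                    entries are identical and select bytes
//                                    from a single input.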
1417 bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, 1418 SelectionDAG &DAG) { 1419 bool IsLE = DAG.getDataLayout().isLittleEndian(); 1420 if (ShuffleKind == 0) { 1421 if (IsLE) 1422 return false; 1423 for (unsigned i = 0; i != 16; ++i) 1424 if (!isConstantOrUndef(N->getMaskElt(i), i*2+1)) 1425 return false; 1426 } else if (ShuffleKind == 2) { 1427 if (!IsLE) 1428 return false; 1429 for (unsigned i = 0; i != 16; ++i) 1430 if (!isConstantOrUndef(N->getMaskElt(i), i*2)) 1431 return false; 1432 } else if (ShuffleKind == 1) { 1433 unsigned j = IsLE ? 0 : 1; 1434 for (unsigned i = 0; i != 8; ++i) 1435 if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) || 1436 !isConstantOrUndef(N->getMaskElt(i+8), i*2+j)) 1437 return false; 1438 } 1439 return true; 1440 } 1441 1442 /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a 1443 /// VPKUWUM instruction. 1444 /// The ShuffleKind distinguishes between big-endian operations with 1445 /// two different inputs (0), either-endian operations with two identical 1446 /// inputs (1), and little-endian operations with two different inputs (2). 1447 /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). 1448 bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, 1449 SelectionDAG &DAG) { 1450 bool IsLE = DAG.getDataLayout().isLittleEndian(); 1451 if (ShuffleKind == 0) { 1452 if (IsLE) 1453 return false; 1454 for (unsigned i = 0; i != 16; i += 2) 1455 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) || 1456 !isConstantOrUndef(N->getMaskElt(i+1), i*2+3)) 1457 return false; 1458 } else if (ShuffleKind == 2) { 1459 if (!IsLE) 1460 return false; 1461 for (unsigned i = 0; i != 16; i += 2) 1462 if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || 1463 !isConstantOrUndef(N->getMaskElt(i+1), i*2+1)) 1464 return false; 1465 } else if (ShuffleKind == 1) { 1466 unsigned j = IsLE ? 0 : 2; 1467 for (unsigned i = 0; i != 8; i += 2) 1468 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || 1469 !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || 1470 !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || 1471 !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1)) 1472 return false; 1473 } 1474 return true; 1475 } 1476 1477 /// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a 1478 /// VPKUDUM instruction, AND the VPKUDUM instruction exists for the 1479 /// current subtarget. 1480 /// 1481 /// The ShuffleKind distinguishes between big-endian operations with 1482 /// two different inputs (0), either-endian operations with two identical 1483 /// inputs (1), and little-endian operations with two different inputs (2). 1484 /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). 
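// A concrete illustration of the checks below: ShuffleKind 0 (BE, two inputs)
// accepts <4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31>, the low-order
// word of each doubleword of both inputs, while ShuffleKind 2 (LE, swapped)
// accepts <0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27>.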
1485 bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, 1486 SelectionDAG &DAG) { 1487 const PPCSubtarget& Subtarget = 1488 static_cast<const PPCSubtarget&>(DAG.getSubtarget()); 1489 if (!Subtarget.hasP8Vector()) 1490 return false; 1491 1492 bool IsLE = DAG.getDataLayout().isLittleEndian(); 1493 if (ShuffleKind == 0) { 1494 if (IsLE) 1495 return false; 1496 for (unsigned i = 0; i != 16; i += 4) 1497 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+4) || 1498 !isConstantOrUndef(N->getMaskElt(i+1), i*2+5) || 1499 !isConstantOrUndef(N->getMaskElt(i+2), i*2+6) || 1500 !isConstantOrUndef(N->getMaskElt(i+3), i*2+7)) 1501 return false; 1502 } else if (ShuffleKind == 2) { 1503 if (!IsLE) 1504 return false; 1505 for (unsigned i = 0; i != 16; i += 4) 1506 if (!isConstantOrUndef(N->getMaskElt(i ), i*2) || 1507 !isConstantOrUndef(N->getMaskElt(i+1), i*2+1) || 1508 !isConstantOrUndef(N->getMaskElt(i+2), i*2+2) || 1509 !isConstantOrUndef(N->getMaskElt(i+3), i*2+3)) 1510 return false; 1511 } else if (ShuffleKind == 1) { 1512 unsigned j = IsLE ? 0 : 4; 1513 for (unsigned i = 0; i != 8; i += 4) 1514 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) || 1515 !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) || 1516 !isConstantOrUndef(N->getMaskElt(i+2), i*2+j+2) || 1517 !isConstantOrUndef(N->getMaskElt(i+3), i*2+j+3) || 1518 !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) || 1519 !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1) || 1520 !isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) || 1521 !isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3)) 1522 return false; 1523 } 1524 return true; 1525 } 1526 1527 /// isVMerge - Common function, used to match vmrg* shuffles. 1528 /// 1529 static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, 1530 unsigned LHSStart, unsigned RHSStart) { 1531 if (N->getValueType(0) != MVT::v16i8) 1532 return false; 1533 assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) && 1534 "Unsupported merge size!"); 1535 1536 for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units 1537 for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit 1538 if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j), 1539 LHSStart+j+i*UnitSize) || 1540 !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j), 1541 RHSStart+j+i*UnitSize)) 1542 return false; 1543 } 1544 return true; 1545 } 1546 1547 /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for 1548 /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes). 1549 /// The ShuffleKind distinguishes between big-endian merges with two 1550 /// different inputs (0), either-endian merges with two identical inputs (1), 1551 /// and little-endian merges with two different inputs (2). For the latter, 1552 /// the input operands are swapped (see PPCInstrAltivec.td). 
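// Worked example, derived from the isVMerge helper above: a big-endian vmrglw
// with two different inputs (ShuffleKind 0, UnitSize 4) corresponds to
// isVMerge(N, 4, 8, 24), i.e. the mask
//   <8,9,10,11, 24,25,26,27, 12,13,14,15, 28,29,30,31>,
// which interleaves word elements 2 and 3 of the two inputs.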
1553 bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, 1554 unsigned ShuffleKind, SelectionDAG &DAG) { 1555 if (DAG.getDataLayout().isLittleEndian()) { 1556 if (ShuffleKind == 1) // unary 1557 return isVMerge(N, UnitSize, 0, 0); 1558 else if (ShuffleKind == 2) // swapped 1559 return isVMerge(N, UnitSize, 0, 16); 1560 else 1561 return false; 1562 } else { 1563 if (ShuffleKind == 1) // unary 1564 return isVMerge(N, UnitSize, 8, 8); 1565 else if (ShuffleKind == 0) // normal 1566 return isVMerge(N, UnitSize, 8, 24); 1567 else 1568 return false; 1569 } 1570 } 1571 1572 /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for 1573 /// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes). 1574 /// The ShuffleKind distinguishes between big-endian merges with two 1575 /// different inputs (0), either-endian merges with two identical inputs (1), 1576 /// and little-endian merges with two different inputs (2). For the latter, 1577 /// the input operands are swapped (see PPCInstrAltivec.td). 1578 bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, 1579 unsigned ShuffleKind, SelectionDAG &DAG) { 1580 if (DAG.getDataLayout().isLittleEndian()) { 1581 if (ShuffleKind == 1) // unary 1582 return isVMerge(N, UnitSize, 8, 8); 1583 else if (ShuffleKind == 2) // swapped 1584 return isVMerge(N, UnitSize, 8, 24); 1585 else 1586 return false; 1587 } else { 1588 if (ShuffleKind == 1) // unary 1589 return isVMerge(N, UnitSize, 0, 0); 1590 else if (ShuffleKind == 0) // normal 1591 return isVMerge(N, UnitSize, 0, 16); 1592 else 1593 return false; 1594 } 1595 } 1596 1597 /** 1598 * Common function used to match vmrgew and vmrgow shuffles 1599 * 1600 * The indexOffset determines whether to look for even or odd words in 1601 * the shuffle mask. This is based on the of the endianness of the target 1602 * machine. 1603 * - Little Endian: 1604 * - Use offset of 0 to check for odd elements 1605 * - Use offset of 4 to check for even elements 1606 * - Big Endian: 1607 * - Use offset of 0 to check for even elements 1608 * - Use offset of 4 to check for odd elements 1609 * A detailed description of the vector element ordering for little endian and 1610 * big endian can be found at 1611 * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html 1612 * Targeting your applications - what little endian and big endian IBM XL C/C++ 1613 * compiler differences mean to you 1614 * 1615 * The mask to the shuffle vector instruction specifies the indices of the 1616 * elements from the two input vectors to place in the result. The elements are 1617 * numbered in array-access order, starting with the first vector. These vectors 1618 * are always of type v16i8, thus each vector will contain 16 elements of size 1619 * 8. More info on the shuffle vector can be found in the 1620 * http://llvm.org/docs/LangRef.html#shufflevector-instruction 1621 * Language Reference. 1622 * 1623 * The RHSStartValue indicates whether the same input vectors are used (unary) 1624 * or two different input vectors are used, based on the following: 1625 * - If the instruction uses the same vector for both inputs, the range of the 1626 * indices will be 0 to 15. In this case, the RHSStart value passed should 1627 * be 0. 1628 * - If the instruction has two different vectors then the range of the 1629 * indices will be 0 to 31. 
In this case, the RHSStart value passed should
1630  * be 16 (indices 0-15 specify elements in the first vector while indices 16
1631  * to 31 specify elements in the second vector).
1632  *
1633  * \param[in] N The shuffle vector SD Node to analyze
1634  * \param[in] IndexOffset Specifies whether to look for even or odd elements
1635  * \param[in] RHSStartValue Specifies the starting index for the right-hand input
1636  * vector to the shuffle_vector instruction
1637  * \return true iff this shuffle vector represents an even or odd word merge
1638  */
1639 static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
1640                      unsigned RHSStartValue) {
1641   if (N->getValueType(0) != MVT::v16i8)
1642     return false;
1643
1644   for (unsigned i = 0; i < 2; ++i)
1645     for (unsigned j = 0; j < 4; ++j)
1646       if (!isConstantOrUndef(N->getMaskElt(i*4+j),
1647                              i*RHSStartValue+j+IndexOffset) ||
1648           !isConstantOrUndef(N->getMaskElt(i*4+j+8),
1649                              i*RHSStartValue+j+IndexOffset+8))
1650         return false;
1651   return true;
1652 }
1653
1654 /**
1655  * Determine if the specified shuffle mask is suitable for the vmrgew or
1656  * vmrgow instructions.
1657  *
1658  * \param[in] N The shuffle vector SD Node to analyze
1659  * \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
1660  * \param[in] ShuffleKind Identify the type of merge:
1661  *   - 0 = big-endian merge with two different inputs;
1662  *   - 1 = either-endian merge with two identical inputs;
1663  *   - 2 = little-endian merge with two different inputs (inputs are swapped for
1664  *     little-endian merges).
1665  * \param[in] DAG The current SelectionDAG
1666  * \return true iff this shuffle mask is suitable for a vmrgew or vmrgow merge of the requested kind
1667  */
1668 bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
1669                               unsigned ShuffleKind, SelectionDAG &DAG) {
1670   if (DAG.getDataLayout().isLittleEndian()) {
1671     unsigned indexOffset = CheckEven ? 4 : 0;
1672     if (ShuffleKind == 1) // Unary
1673       return isVMerge(N, indexOffset, 0);
1674     else if (ShuffleKind == 2) // swapped
1675       return isVMerge(N, indexOffset, 16);
1676     else
1677       return false;
1678   }
1679   else {
1680     unsigned indexOffset = CheckEven ? 0 : 4;
1681     if (ShuffleKind == 1) // Unary
1682       return isVMerge(N, indexOffset, 0);
1683     else if (ShuffleKind == 0) // Normal
1684       return isVMerge(N, indexOffset, 16);
1685     else
1686       return false;
1687   }
1688   return false;
1689 }
1690
1691 /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
1692 /// amount, otherwise return -1.
1693 /// The ShuffleKind distinguishes between big-endian operations with two
1694 /// different inputs (0), either-endian operations with two identical inputs
1695 /// (1), and little-endian operations with two different inputs (2). For the
1696 /// latter, the input operands are swapped (see PPCInstrAltivec.td).
1697 int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
1698                              SelectionDAG &DAG) {
1699   if (N->getValueType(0) != MVT::v16i8)
1700     return -1;
1701
1702   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
1703
1704   // Find the first non-undef value in the shuffle mask.
1705   unsigned i;
1706   for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i)
1707     /*search*/;
1708
1709   if (i == 16) return -1; // all undef.
1710
1711   // Otherwise, check to see if the rest of the elements are consecutively
1712   // numbered from this value.
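  // For example (big-endian, ShuffleKind 0): a shift-left-double by 5 bytes
  // has the mask <5,6,...,20>, and the first non-undef element minus its
  // position yields ShiftAmt = 5. For the little-endian kinds the amount is
  // converted to 16 - ShiftAmt at the end, to account for the swapped
  // operands.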
1713 unsigned ShiftAmt = SVOp->getMaskElt(i); 1714 if (ShiftAmt < i) return -1; 1715 1716 ShiftAmt -= i; 1717 bool isLE = DAG.getDataLayout().isLittleEndian(); 1718 1719 if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) { 1720 // Check the rest of the elements to see if they are consecutive. 1721 for (++i; i != 16; ++i) 1722 if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) 1723 return -1; 1724 } else if (ShuffleKind == 1) { 1725 // Check the rest of the elements to see if they are consecutive. 1726 for (++i; i != 16; ++i) 1727 if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) 1728 return -1; 1729 } else 1730 return -1; 1731 1732 if (isLE) 1733 ShiftAmt = 16 - ShiftAmt; 1734 1735 return ShiftAmt; 1736 } 1737 1738 /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand 1739 /// specifies a splat of a single element that is suitable for input to 1740 /// VSPLTB/VSPLTH/VSPLTW. 1741 bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { 1742 assert(N->getValueType(0) == MVT::v16i8 && 1743 (EltSize == 1 || EltSize == 2 || EltSize == 4)); 1744 1745 // The consecutive indices need to specify an element, not part of two 1746 // different elements. So abandon ship early if this isn't the case. 1747 if (N->getMaskElt(0) % EltSize != 0) 1748 return false; 1749 1750 // This is a splat operation if each element of the permute is the same, and 1751 // if the value doesn't reference the second vector. 1752 unsigned ElementBase = N->getMaskElt(0); 1753 1754 // FIXME: Handle UNDEF elements too! 1755 if (ElementBase >= 16) 1756 return false; 1757 1758 // Check that the indices are consecutive, in the case of a multi-byte element 1759 // splatted with a v16i8 mask. 1760 for (unsigned i = 1; i != EltSize; ++i) 1761 if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase)) 1762 return false; 1763 1764 for (unsigned i = EltSize, e = 16; i != e; i += EltSize) { 1765 if (N->getMaskElt(i) < 0) continue; 1766 for (unsigned j = 0; j != EltSize; ++j) 1767 if (N->getMaskElt(i+j) != N->getMaskElt(j)) 1768 return false; 1769 } 1770 return true; 1771 } 1772 1773 /// Check that the mask is shuffling N byte elements. Within each N byte 1774 /// element of the mask, the indices could be either in increasing or 1775 /// decreasing order as long as they are consecutive. 1776 /// \param[in] N the shuffle vector SD Node to analyze 1777 /// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/ 1778 /// Word/DoubleWord/QuadWord). 1779 /// \param[in] StepLen the delta indices number among the N byte element, if 1780 /// the mask is in increasing/decreasing order then it is 1/-1. 1781 /// \return true iff the mask is shuffling N byte elements. 
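// For illustration: Width == 4 with StepLen == 1 accepts any word-granular
// rearrangement such as <4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11>, while
// Width == 4 with StepLen == -1 accepts byte-reversed words such as
// <3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12> (the shape the XXBR* checks
// below look for).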
1782 static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width, 1783 int StepLen) { 1784 assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) && 1785 "Unexpected element width."); 1786 assert((StepLen == 1 || StepLen == -1) && "Unexpected element width."); 1787 1788 unsigned NumOfElem = 16 / Width; 1789 unsigned MaskVal[16]; // Width is never greater than 16 1790 for (unsigned i = 0; i < NumOfElem; ++i) { 1791 MaskVal[0] = N->getMaskElt(i * Width); 1792 if ((StepLen == 1) && (MaskVal[0] % Width)) { 1793 return false; 1794 } else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) { 1795 return false; 1796 } 1797 1798 for (unsigned int j = 1; j < Width; ++j) { 1799 MaskVal[j] = N->getMaskElt(i * Width + j); 1800 if (MaskVal[j] != MaskVal[j-1] + StepLen) { 1801 return false; 1802 } 1803 } 1804 } 1805 1806 return true; 1807 } 1808 1809 bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, 1810 unsigned &InsertAtByte, bool &Swap, bool IsLE) { 1811 if (!isNByteElemShuffleMask(N, 4, 1)) 1812 return false; 1813 1814 // Now we look at mask elements 0,4,8,12 1815 unsigned M0 = N->getMaskElt(0) / 4; 1816 unsigned M1 = N->getMaskElt(4) / 4; 1817 unsigned M2 = N->getMaskElt(8) / 4; 1818 unsigned M3 = N->getMaskElt(12) / 4; 1819 unsigned LittleEndianShifts[] = { 2, 1, 0, 3 }; 1820 unsigned BigEndianShifts[] = { 3, 0, 1, 2 }; 1821 1822 // Below, let H and L be arbitrary elements of the shuffle mask 1823 // where H is in the range [4,7] and L is in the range [0,3]. 1824 // H, 1, 2, 3 or L, 5, 6, 7 1825 if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) || 1826 (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) { 1827 ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3]; 1828 InsertAtByte = IsLE ? 12 : 0; 1829 Swap = M0 < 4; 1830 return true; 1831 } 1832 // 0, H, 2, 3 or 4, L, 6, 7 1833 if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) || 1834 (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) { 1835 ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3]; 1836 InsertAtByte = IsLE ? 8 : 4; 1837 Swap = M1 < 4; 1838 return true; 1839 } 1840 // 0, 1, H, 3 or 4, 5, L, 7 1841 if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) || 1842 (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) { 1843 ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3]; 1844 InsertAtByte = IsLE ? 4 : 8; 1845 Swap = M2 < 4; 1846 return true; 1847 } 1848 // 0, 1, 2, H or 4, 5, 6, L 1849 if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) || 1850 (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) { 1851 ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3]; 1852 InsertAtByte = IsLE ? 0 : 12; 1853 Swap = M3 < 4; 1854 return true; 1855 } 1856 1857 // If both vector operands for the shuffle are the same vector, the mask will 1858 // contain only elements from the first one and the second one will be undef. 1859 if (N->getOperand(1).isUndef()) { 1860 ShiftElts = 0; 1861 Swap = true; 1862 unsigned XXINSERTWSrcElem = IsLE ? 2 : 1; 1863 if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) { 1864 InsertAtByte = IsLE ? 12 : 0; 1865 return true; 1866 } 1867 if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) { 1868 InsertAtByte = IsLE ? 8 : 4; 1869 return true; 1870 } 1871 if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) { 1872 InsertAtByte = IsLE ? 4 : 8; 1873 return true; 1874 } 1875 if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) { 1876 InsertAtByte = IsLE ? 
0 : 12; 1877 return true; 1878 } 1879 } 1880 1881 return false; 1882 } 1883 1884 bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, 1885 bool &Swap, bool IsLE) { 1886 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); 1887 // Ensure each byte index of the word is consecutive. 1888 if (!isNByteElemShuffleMask(N, 4, 1)) 1889 return false; 1890 1891 // Now we look at mask elements 0,4,8,12, which are the beginning of words. 1892 unsigned M0 = N->getMaskElt(0) / 4; 1893 unsigned M1 = N->getMaskElt(4) / 4; 1894 unsigned M2 = N->getMaskElt(8) / 4; 1895 unsigned M3 = N->getMaskElt(12) / 4; 1896 1897 // If both vector operands for the shuffle are the same vector, the mask will 1898 // contain only elements from the first one and the second one will be undef. 1899 if (N->getOperand(1).isUndef()) { 1900 assert(M0 < 4 && "Indexing into an undef vector?"); 1901 if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4) 1902 return false; 1903 1904 ShiftElts = IsLE ? (4 - M0) % 4 : M0; 1905 Swap = false; 1906 return true; 1907 } 1908 1909 // Ensure each word index of the ShuffleVector Mask is consecutive. 1910 if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8) 1911 return false; 1912 1913 if (IsLE) { 1914 if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) { 1915 // Input vectors don't need to be swapped if the leading element 1916 // of the result is one of the 3 left elements of the second vector 1917 // (or if there is no shift to be done at all). 1918 Swap = false; 1919 ShiftElts = (8 - M0) % 8; 1920 } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) { 1921 // Input vectors need to be swapped if the leading element 1922 // of the result is one of the 3 left elements of the first vector 1923 // (or if we're shifting by 4 - thereby simply swapping the vectors). 1924 Swap = true; 1925 ShiftElts = (4 - M0) % 4; 1926 } 1927 1928 return true; 1929 } else { // BE 1930 if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) { 1931 // Input vectors don't need to be swapped if the leading element 1932 // of the result is one of the 4 elements of the first vector. 1933 Swap = false; 1934 ShiftElts = M0; 1935 } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) { 1936 // Input vectors need to be swapped if the leading element 1937 // of the result is one of the 4 elements of the right vector. 1938 Swap = true; 1939 ShiftElts = M0 - 4; 1940 } 1941 1942 return true; 1943 } 1944 } 1945 1946 bool static isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) { 1947 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); 1948 1949 if (!isNByteElemShuffleMask(N, Width, -1)) 1950 return false; 1951 1952 for (int i = 0; i < 16; i += Width) 1953 if (N->getMaskElt(i) != i + Width - 1) 1954 return false; 1955 1956 return true; 1957 } 1958 1959 bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) { 1960 return isXXBRShuffleMaskHelper(N, 2); 1961 } 1962 1963 bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) { 1964 return isXXBRShuffleMaskHelper(N, 4); 1965 } 1966 1967 bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) { 1968 return isXXBRShuffleMaskHelper(N, 8); 1969 } 1970 1971 bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) { 1972 return isXXBRShuffleMaskHelper(N, 16); 1973 } 1974 1975 /// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap 1976 /// if the inputs to the instruction should be swapped and set \p DM to the 1977 /// value for the immediate. 
1978 /// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI 1979 /// AND element 0 of the result comes from the first input (LE) or second input 1980 /// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered. 1981 /// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle 1982 /// mask. 1983 bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM, 1984 bool &Swap, bool IsLE) { 1985 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); 1986 1987 // Ensure each byte index of the double word is consecutive. 1988 if (!isNByteElemShuffleMask(N, 8, 1)) 1989 return false; 1990 1991 unsigned M0 = N->getMaskElt(0) / 8; 1992 unsigned M1 = N->getMaskElt(8) / 8; 1993 assert(((M0 | M1) < 4) && "A mask element out of bounds?"); 1994 1995 // If both vector operands for the shuffle are the same vector, the mask will 1996 // contain only elements from the first one and the second one will be undef. 1997 if (N->getOperand(1).isUndef()) { 1998 if ((M0 | M1) < 2) { 1999 DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1); 2000 Swap = false; 2001 return true; 2002 } else 2003 return false; 2004 } 2005 2006 if (IsLE) { 2007 if (M0 > 1 && M1 < 2) { 2008 Swap = false; 2009 } else if (M0 < 2 && M1 > 1) { 2010 M0 = (M0 + 2) % 4; 2011 M1 = (M1 + 2) % 4; 2012 Swap = true; 2013 } else 2014 return false; 2015 2016 // Note: if control flow comes here that means Swap is already set above 2017 DM = (((~M1) & 1) << 1) + ((~M0) & 1); 2018 return true; 2019 } else { // BE 2020 if (M0 < 2 && M1 > 1) { 2021 Swap = false; 2022 } else if (M0 > 1 && M1 < 2) { 2023 M0 = (M0 + 2) % 4; 2024 M1 = (M1 + 2) % 4; 2025 Swap = true; 2026 } else 2027 return false; 2028 2029 // Note: if control flow comes here that means Swap is already set above 2030 DM = (M0 << 1) + (M1 & 1); 2031 return true; 2032 } 2033 } 2034 2035 2036 /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the 2037 /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. 2038 unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, 2039 SelectionDAG &DAG) { 2040 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2041 assert(isSplatShuffleMask(SVOp, EltSize)); 2042 if (DAG.getDataLayout().isLittleEndian()) 2043 return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); 2044 else 2045 return SVOp->getMaskElt(0) / EltSize; 2046 } 2047 2048 /// get_VSPLTI_elt - If this is a build_vector of constants which can be formed 2049 /// by using a vspltis[bhw] instruction of the specified element size, return 2050 /// the constant being splatted. The ByteSize field indicates the number of 2051 /// bytes of each element [124] -> [bhw]. 2052 SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { 2053 SDValue OpVal(nullptr, 0); 2054 2055 // If ByteSize of the splat is bigger than the element size of the 2056 // build_vector, then we have a case where we are checking for a splat where 2057 // multiple elements of the buildvector are folded together into a single 2058 // logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8). 2059 unsigned EltSize = 16/N->getNumOperands(); 2060 if (EltSize < ByteSize) { 2061 unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval. 2062 SDValue UniquedVals[4]; 2063 assert(Multiple > 1 && Multiple <= 4 && "How can this happen?"); 2064 2065 // See if all of the elements in the buildvector agree across. 
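    // For example, with a v16i8 build_vector and ByteSize == 2 (vspltish),
    // the operands are examined in pairs; a vector like <0,1, 0,1, ..., 0,1>
    // collapses to a leading byte of 0 and a trailing byte of 1, which the
    // code below accepts and returns as the splat immediate 1.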
2066 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 2067 if (N->getOperand(i).isUndef()) continue; 2068 // If the element isn't a constant, bail fully out. 2069 if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue(); 2070 2071 if (!UniquedVals[i&(Multiple-1)].getNode()) 2072 UniquedVals[i&(Multiple-1)] = N->getOperand(i); 2073 else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i)) 2074 return SDValue(); // no match. 2075 } 2076 2077 // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains 2078 // either constant or undef values that are identical for each chunk. See 2079 // if these chunks can form into a larger vspltis*. 2080 2081 // Check to see if all of the leading entries are either 0 or -1. If 2082 // neither, then this won't fit into the immediate field. 2083 bool LeadingZero = true; 2084 bool LeadingOnes = true; 2085 for (unsigned i = 0; i != Multiple-1; ++i) { 2086 if (!UniquedVals[i].getNode()) continue; // Must have been undefs. 2087 2088 LeadingZero &= isNullConstant(UniquedVals[i]); 2089 LeadingOnes &= isAllOnesConstant(UniquedVals[i]); 2090 } 2091 // Finally, check the least significant entry. 2092 if (LeadingZero) { 2093 if (!UniquedVals[Multiple-1].getNode()) 2094 return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef 2095 int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue(); 2096 if (Val < 16) // 0,0,0,4 -> vspltisw(4) 2097 return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32); 2098 } 2099 if (LeadingOnes) { 2100 if (!UniquedVals[Multiple-1].getNode()) 2101 return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef 2102 int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue(); 2103 if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2) 2104 return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32); 2105 } 2106 2107 return SDValue(); 2108 } 2109 2110 // Check to see if this buildvec has a single non-undef value in its elements. 2111 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 2112 if (N->getOperand(i).isUndef()) continue; 2113 if (!OpVal.getNode()) 2114 OpVal = N->getOperand(i); 2115 else if (OpVal != N->getOperand(i)) 2116 return SDValue(); 2117 } 2118 2119 if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def. 2120 2121 unsigned ValSizeInBytes = EltSize; 2122 uint64_t Value = 0; 2123 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) { 2124 Value = CN->getZExtValue(); 2125 } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) { 2126 assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!"); 2127 Value = FloatToBits(CN->getValueAPF().convertToFloat()); 2128 } 2129 2130 // If the splat value is larger than the element value, then we can never do 2131 // this splat. The only case that we could fit the replicated bits into our 2132 // immediate field for would be zero, and we prefer to use vxor for it. 2133 if (ValSizeInBytes < ByteSize) return SDValue(); 2134 2135 // If the element value is larger than the splat value, check if it consists 2136 // of a repeated bit pattern of size ByteSize. 2137 if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8)) 2138 return SDValue(); 2139 2140 // Properly sign extend the value. 2141 int MaskVal = SignExtend32(Value, ByteSize * 8); 2142 2143 // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros. 
2144 if (MaskVal == 0) return SDValue(); 2145 2146 // Finally, if this value fits in a 5 bit sext field, return it 2147 if (SignExtend32<5>(MaskVal) == MaskVal) 2148 return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32); 2149 return SDValue(); 2150 } 2151 2152 /// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift 2153 /// amount, otherwise return -1. 2154 int PPC::isQVALIGNIShuffleMask(SDNode *N) { 2155 EVT VT = N->getValueType(0); 2156 if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1) 2157 return -1; 2158 2159 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2160 2161 // Find the first non-undef value in the shuffle mask. 2162 unsigned i; 2163 for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i) 2164 /*search*/; 2165 2166 if (i == 4) return -1; // all undef. 2167 2168 // Otherwise, check to see if the rest of the elements are consecutively 2169 // numbered from this value. 2170 unsigned ShiftAmt = SVOp->getMaskElt(i); 2171 if (ShiftAmt < i) return -1; 2172 ShiftAmt -= i; 2173 2174 // Check the rest of the elements to see if they are consecutive. 2175 for (++i; i != 4; ++i) 2176 if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) 2177 return -1; 2178 2179 return ShiftAmt; 2180 } 2181 2182 //===----------------------------------------------------------------------===// 2183 // Addressing Mode Selection 2184 //===----------------------------------------------------------------------===// 2185 2186 /// isIntS16Immediate - This method tests to see if the node is either a 32-bit 2187 /// or 64-bit immediate, and if the value can be accurately represented as a 2188 /// sign extension from a 16-bit value. If so, this returns true and the 2189 /// immediate. 2190 bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) { 2191 if (!isa<ConstantSDNode>(N)) 2192 return false; 2193 2194 Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue(); 2195 if (N->getValueType(0) == MVT::i32) 2196 return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue(); 2197 else 2198 return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); 2199 } 2200 bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) { 2201 return isIntS16Immediate(Op.getNode(), Imm); 2202 } 2203 2204 /// SelectAddressRegReg - Given the specified addressed, check to see if it 2205 /// can be represented as an indexed [r+r] operation. Returns false if it 2206 /// can be more efficiently represented with [r+imm]. 2207 bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, 2208 SDValue &Index, 2209 SelectionDAG &DAG) const { 2210 int16_t imm = 0; 2211 if (N.getOpcode() == ISD::ADD) { 2212 if (isIntS16Immediate(N.getOperand(1), imm)) 2213 return false; // r+i 2214 if (N.getOperand(1).getOpcode() == PPCISD::Lo) 2215 return false; // r+i 2216 2217 Base = N.getOperand(0); 2218 Index = N.getOperand(1); 2219 return true; 2220 } else if (N.getOpcode() == ISD::OR) { 2221 if (isIntS16Immediate(N.getOperand(1), imm)) 2222 return false; // r+i can fold it if we can. 2223 2224 // If this is an or of disjoint bitfields, we can codegen this as an add 2225 // (for better address arithmetic) if the LHS and RHS of the OR are provably 2226 // disjoint. 2227 KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0)); 2228 2229 if (LHSKnown.Zero.getBoolValue()) { 2230 KnownBits RHSKnown = DAG.computeKnownBits(N.getOperand(1)); 2231 // If all of the bits are known zero on the LHS or RHS, the add won't 2232 // carry. 
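      // For instance, if the LHS is known to have its low 12 bits clear (say,
      // a 4096-byte aligned base) and the RHS is a constant below 4096, every
      // bit is known zero in at least one operand, so the OR behaves exactly
      // like an ADD and can be matched as [r+r] directly.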
2233 if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) { 2234 Base = N.getOperand(0); 2235 Index = N.getOperand(1); 2236 return true; 2237 } 2238 } 2239 } 2240 2241 return false; 2242 } 2243 2244 // If we happen to be doing an i64 load or store into a stack slot that has 2245 // less than a 4-byte alignment, then the frame-index elimination may need to 2246 // use an indexed load or store instruction (because the offset may not be a 2247 // multiple of 4). The extra register needed to hold the offset comes from the 2248 // register scavenger, and it is possible that the scavenger will need to use 2249 // an emergency spill slot. As a result, we need to make sure that a spill slot 2250 // is allocated when doing an i64 load/store into a less-than-4-byte-aligned 2251 // stack slot. 2252 static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { 2253 // FIXME: This does not handle the LWA case. 2254 if (VT != MVT::i64) 2255 return; 2256 2257 // NOTE: We'll exclude negative FIs here, which come from argument 2258 // lowering, because there are no known test cases triggering this problem 2259 // using packed structures (or similar). We can remove this exclusion if 2260 // we find such a test case. The reason why this is so test-case driven is 2261 // because this entire 'fixup' is only to prevent crashes (from the 2262 // register scavenger) on not-really-valid inputs. For example, if we have: 2263 // %a = alloca i1 2264 // %b = bitcast i1* %a to i64* 2265 // store i64* a, i64 b 2266 // then the store should really be marked as 'align 1', but is not. If it 2267 // were marked as 'align 1' then the indexed form would have been 2268 // instruction-selected initially, and the problem this 'fixup' is preventing 2269 // won't happen regardless. 2270 if (FrameIdx < 0) 2271 return; 2272 2273 MachineFunction &MF = DAG.getMachineFunction(); 2274 MachineFrameInfo &MFI = MF.getFrameInfo(); 2275 2276 unsigned Align = MFI.getObjectAlignment(FrameIdx); 2277 if (Align >= 4) 2278 return; 2279 2280 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 2281 FuncInfo->setHasNonRISpills(); 2282 } 2283 2284 /// Returns true if the address N can be represented by a base register plus 2285 /// a signed 16-bit displacement [r+imm], and if it is not better 2286 /// represented as reg+reg. If \p Alignment is non-zero, only accept 2287 /// displacements that are multiples of that value. 2288 bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, 2289 SDValue &Base, 2290 SelectionDAG &DAG, 2291 unsigned Alignment) const { 2292 // FIXME dl should come from parent load or store, not from address 2293 SDLoc dl(N); 2294 // If this can be more profitably realized as r+r, fail. 2295 if (SelectAddressRegReg(N, Disp, Base, DAG)) 2296 return false; 2297 2298 if (N.getOpcode() == ISD::ADD) { 2299 int16_t imm = 0; 2300 if (isIntS16Immediate(N.getOperand(1), imm) && 2301 (!Alignment || (imm % Alignment) == 0)) { 2302 Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); 2303 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { 2304 Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); 2305 fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); 2306 } else { 2307 Base = N.getOperand(0); 2308 } 2309 return true; // [r+i] 2310 } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) { 2311 // Match LOAD (ADD (X, Lo(G))). 
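      // In other words, an address of the form (add X, (PPCISD::Lo @g, 0)) is
      // matched here so the low part can be folded into the memory access,
      // e.g. roughly "lwz rD, g@l(rX)" once X already holds the @ha half (a
      // sketch of the usual sequence, not a description of the exact output).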
2312 assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue() 2313 && "Cannot handle constant offsets yet!"); 2314 Disp = N.getOperand(1).getOperand(0); // The global address. 2315 assert(Disp.getOpcode() == ISD::TargetGlobalAddress || 2316 Disp.getOpcode() == ISD::TargetGlobalTLSAddress || 2317 Disp.getOpcode() == ISD::TargetConstantPool || 2318 Disp.getOpcode() == ISD::TargetJumpTable); 2319 Base = N.getOperand(0); 2320 return true; // [&g+r] 2321 } 2322 } else if (N.getOpcode() == ISD::OR) { 2323 int16_t imm = 0; 2324 if (isIntS16Immediate(N.getOperand(1), imm) && 2325 (!Alignment || (imm % Alignment) == 0)) { 2326 // If this is an or of disjoint bitfields, we can codegen this as an add 2327 // (for better address arithmetic) if the LHS and RHS of the OR are 2328 // provably disjoint. 2329 KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0)); 2330 2331 if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { 2332 // If all of the bits are known zero on the LHS or RHS, the add won't 2333 // carry. 2334 if (FrameIndexSDNode *FI = 2335 dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { 2336 Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); 2337 fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); 2338 } else { 2339 Base = N.getOperand(0); 2340 } 2341 Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); 2342 return true; 2343 } 2344 } 2345 } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { 2346 // Loading from a constant address. 2347 2348 // If this address fits entirely in a 16-bit sext immediate field, codegen 2349 // this as "d, 0" 2350 int16_t Imm; 2351 if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) { 2352 Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); 2353 Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, 2354 CN->getValueType(0)); 2355 return true; 2356 } 2357 2358 // Handle 32-bit sext immediates with LIS + addr mode. 2359 if ((CN->getValueType(0) == MVT::i32 || 2360 (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && 2361 (!Alignment || (CN->getZExtValue() % Alignment) == 0)) { 2362 int Addr = (int)CN->getZExtValue(); 2363 2364 // Otherwise, break this down into an LIS + disp. 2365 Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32); 2366 2367 Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl, 2368 MVT::i32); 2369 unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8; 2370 Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0); 2371 return true; 2372 } 2373 } 2374 2375 Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout())); 2376 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) { 2377 Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); 2378 fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); 2379 } else 2380 Base = N; 2381 return true; // [r+0] 2382 } 2383 2384 /// SelectAddressRegRegOnly - Given the specified addressed, force it to be 2385 /// represented as an indexed [r+r] operation. 2386 bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, 2387 SDValue &Index, 2388 SelectionDAG &DAG) const { 2389 // Check to see if we can easily represent this as an [r+r] address. This 2390 // will fail if it thinks that the address is more profitably represented as 2391 // reg+imm, e.g. where imm = 0. 
2392 if (SelectAddressRegReg(N, Base, Index, DAG)) 2393 return true; 2394 2395 // If the address is the result of an add, we will utilize the fact that the 2396 // address calculation includes an implicit add. However, we can reduce 2397 // register pressure if we do not materialize a constant just for use as the 2398 // index register. We only get rid of the add if it is not an add of a 2399 // value and a 16-bit signed constant and both have a single use. 2400 int16_t imm = 0; 2401 if (N.getOpcode() == ISD::ADD && 2402 (!isIntS16Immediate(N.getOperand(1), imm) || 2403 !N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) { 2404 Base = N.getOperand(0); 2405 Index = N.getOperand(1); 2406 return true; 2407 } 2408 2409 // Otherwise, do it the hard way, using R0 as the base register. 2410 Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, 2411 N.getValueType()); 2412 Index = N; 2413 return true; 2414 } 2415 2416 /// Returns true if we should use a direct load into vector instruction 2417 /// (such as lxsd or lfd), instead of a load into gpr + direct move sequence. 2418 static bool usePartialVectorLoads(SDNode *N) { 2419 if (!N->hasOneUse()) 2420 return false; 2421 2422 // If there are any other uses other than scalar to vector, then we should 2423 // keep it as a scalar load -> direct move pattern to prevent multiple 2424 // loads. Currently, only check for i64 since we have lxsd/lfd to do this 2425 // efficiently, but no update equivalent. 2426 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 2427 EVT MemVT = LD->getMemoryVT(); 2428 if (MemVT.isSimple() && MemVT.getSimpleVT().SimpleTy == MVT::i64) { 2429 SDNode *User = *(LD->use_begin()); 2430 if (User->getOpcode() == ISD::SCALAR_TO_VECTOR) 2431 return true; 2432 } 2433 } 2434 2435 return false; 2436 } 2437 2438 /// getPreIndexedAddressParts - returns true by value, base pointer and 2439 /// offset pointer and addressing mode by reference if the node's address 2440 /// can be legally represented as pre-indexed load / store address. 2441 bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 2442 SDValue &Offset, 2443 ISD::MemIndexedMode &AM, 2444 SelectionDAG &DAG) const { 2445 if (DisablePPCPreinc) return false; 2446 2447 bool isLoad = true; 2448 SDValue Ptr; 2449 EVT VT; 2450 unsigned Alignment; 2451 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 2452 Ptr = LD->getBasePtr(); 2453 VT = LD->getMemoryVT(); 2454 Alignment = LD->getAlignment(); 2455 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 2456 Ptr = ST->getBasePtr(); 2457 VT = ST->getMemoryVT(); 2458 Alignment = ST->getAlignment(); 2459 isLoad = false; 2460 } else 2461 return false; 2462 2463 // Do not generate pre-inc forms for specific loads that feed scalar_to_vector 2464 // instructions because we can fold these into a more efficient instruction 2465 // instead, (such as LXSD). 2466 if (isLoad && usePartialVectorLoads(N)) { 2467 return false; 2468 } 2469 2470 // PowerPC doesn't have preinc load/store instructions for vectors (except 2471 // for QPX, which does have preinc r+r forms). 
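  // As a reminder of what is being formed: a pre-increment load such as
  // "lwzu rD, d(rA)" loads from rA+d and also writes that effective address
  // back into rA; the r+r flavours (e.g. lwzux) do the same with a register
  // offset, and those r+r forms are the only ones QPX vectors have.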
2472 if (VT.isVector()) { 2473 if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) { 2474 return false; 2475 } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) { 2476 AM = ISD::PRE_INC; 2477 return true; 2478 } 2479 } 2480 2481 if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) { 2482 // Common code will reject creating a pre-inc form if the base pointer 2483 // is a frame index, or if N is a store and the base pointer is either 2484 // the same as or a predecessor of the value being stored. Check for 2485 // those situations here, and try with swapped Base/Offset instead. 2486 bool Swap = false; 2487 2488 if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base)) 2489 Swap = true; 2490 else if (!isLoad) { 2491 SDValue Val = cast<StoreSDNode>(N)->getValue(); 2492 if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode())) 2493 Swap = true; 2494 } 2495 2496 if (Swap) 2497 std::swap(Base, Offset); 2498 2499 AM = ISD::PRE_INC; 2500 return true; 2501 } 2502 2503 // LDU/STU can only handle immediates that are a multiple of 4. 2504 if (VT != MVT::i64) { 2505 if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0)) 2506 return false; 2507 } else { 2508 // LDU/STU need an address with at least 4-byte alignment. 2509 if (Alignment < 4) 2510 return false; 2511 2512 if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4)) 2513 return false; 2514 } 2515 2516 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 2517 // PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of 2518 // sext i32 to i64 when addr mode is r+i. 2519 if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 && 2520 LD->getExtensionType() == ISD::SEXTLOAD && 2521 isa<ConstantSDNode>(Offset)) 2522 return false; 2523 } 2524 2525 AM = ISD::PRE_INC; 2526 return true; 2527 } 2528 2529 //===----------------------------------------------------------------------===// 2530 // LowerOperation implementation 2531 //===----------------------------------------------------------------------===// 2532 2533 /// Return true if we should reference labels using a PICBase, set the HiOpFlags 2534 /// and LoOpFlags to the target MO flags. 2535 static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget, 2536 unsigned &HiOpFlags, unsigned &LoOpFlags, 2537 const GlobalValue *GV = nullptr) { 2538 HiOpFlags = PPCII::MO_HA; 2539 LoOpFlags = PPCII::MO_LO; 2540 2541 // Don't use the pic base if not in PIC relocation model. 2542 if (IsPIC) { 2543 HiOpFlags |= PPCII::MO_PIC_FLAG; 2544 LoOpFlags |= PPCII::MO_PIC_FLAG; 2545 } 2546 2547 // If this is a reference to a global value that requires a non-lazy-ptr, make 2548 // sure that instruction lowering adds it. 2549 if (GV && Subtarget.hasLazyResolverStub(GV)) { 2550 HiOpFlags |= PPCII::MO_NLP_FLAG; 2551 LoOpFlags |= PPCII::MO_NLP_FLAG; 2552 2553 if (GV->hasHiddenVisibility()) { 2554 HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG; 2555 LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG; 2556 } 2557 } 2558 } 2559 2560 static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC, 2561 SelectionDAG &DAG) { 2562 SDLoc DL(HiPart); 2563 EVT PtrVT = HiPart.getValueType(); 2564 SDValue Zero = DAG.getConstant(0, DL, PtrVT); 2565 2566 SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero); 2567 SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero); 2568 2569 // With PIC, the first instruction is actually "GR+hi(&G)". 
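  // A rough sketch of what this becomes (not a guarantee of the exact asm):
  // without PIC the Hi/Lo pair is typically materialized as an lis/addis of
  // the @ha half followed by an addi (or a D-form memory access) that folds
  // in the @l half; with PIC the @ha addition is made relative to the PIC
  // base register produced by GlobalBaseReg.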
2570 if (isPIC) 2571 Hi = DAG.getNode(ISD::ADD, DL, PtrVT, 2572 DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi); 2573 2574 // Generate non-pic code that has direct accesses to the constant pool. 2575 // The address of the global is just (hi(&g)+lo(&g)). 2576 return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo); 2577 } 2578 2579 static void setUsesTOCBasePtr(MachineFunction &MF) { 2580 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 2581 FuncInfo->setUsesTOCBasePtr(); 2582 } 2583 2584 static void setUsesTOCBasePtr(SelectionDAG &DAG) { 2585 setUsesTOCBasePtr(DAG.getMachineFunction()); 2586 } 2587 2588 static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit, 2589 SDValue GA) { 2590 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 2591 SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) : 2592 DAG.getNode(PPCISD::GlobalBaseReg, dl, VT); 2593 2594 SDValue Ops[] = { GA, Reg }; 2595 return DAG.getMemIntrinsicNode( 2596 PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, 2597 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, 2598 MachineMemOperand::MOLoad); 2599 } 2600 2601 SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, 2602 SelectionDAG &DAG) const { 2603 EVT PtrVT = Op.getValueType(); 2604 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 2605 const Constant *C = CP->getConstVal(); 2606 2607 // 64-bit SVR4 ABI code is always position-independent. 2608 // The actual address of the GlobalValue is stored in the TOC. 2609 if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { 2610 setUsesTOCBasePtr(DAG); 2611 SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); 2612 return getTOCEntry(DAG, SDLoc(CP), true, GA); 2613 } 2614 2615 unsigned MOHiFlag, MOLoFlag; 2616 bool IsPIC = isPositionIndependent(); 2617 getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); 2618 2619 if (IsPIC && Subtarget.isSVR4ABI()) { 2620 SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 2621 PPCII::MO_PIC_FLAG); 2622 return getTOCEntry(DAG, SDLoc(CP), false, GA); 2623 } 2624 2625 SDValue CPIHi = 2626 DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag); 2627 SDValue CPILo = 2628 DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag); 2629 return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG); 2630 } 2631 2632 // For 64-bit PowerPC, prefer the more compact relative encodings. 2633 // This trades 32 bits per jump table entry for one or two instructions 2634 // on the jump site. 
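// Concretely, EK_LabelDifference32 emits each jump-table entry as a 32-bit
// "target label minus table base" value instead of a full 64-bit pointer;
// the dispatch sequence then adds the loaded entry back to the table address
// before branching, which is the extra work mentioned above.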
2635 unsigned PPCTargetLowering::getJumpTableEncoding() const { 2636 if (isJumpTableRelative()) 2637 return MachineJumpTableInfo::EK_LabelDifference32; 2638 2639 return TargetLowering::getJumpTableEncoding(); 2640 } 2641 2642 bool PPCTargetLowering::isJumpTableRelative() const { 2643 if (Subtarget.isPPC64()) 2644 return true; 2645 return TargetLowering::isJumpTableRelative(); 2646 } 2647 2648 SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table, 2649 SelectionDAG &DAG) const { 2650 if (!Subtarget.isPPC64()) 2651 return TargetLowering::getPICJumpTableRelocBase(Table, DAG); 2652 2653 switch (getTargetMachine().getCodeModel()) { 2654 case CodeModel::Small: 2655 case CodeModel::Medium: 2656 return TargetLowering::getPICJumpTableRelocBase(Table, DAG); 2657 default: 2658 return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(), 2659 getPointerTy(DAG.getDataLayout())); 2660 } 2661 } 2662 2663 const MCExpr * 2664 PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF, 2665 unsigned JTI, 2666 MCContext &Ctx) const { 2667 if (!Subtarget.isPPC64()) 2668 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 2669 2670 switch (getTargetMachine().getCodeModel()) { 2671 case CodeModel::Small: 2672 case CodeModel::Medium: 2673 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 2674 default: 2675 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); 2676 } 2677 } 2678 2679 SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 2680 EVT PtrVT = Op.getValueType(); 2681 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 2682 2683 // 64-bit SVR4 ABI code is always position-independent. 2684 // The actual address of the GlobalValue is stored in the TOC. 2685 if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { 2686 setUsesTOCBasePtr(DAG); 2687 SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); 2688 return getTOCEntry(DAG, SDLoc(JT), true, GA); 2689 } 2690 2691 unsigned MOHiFlag, MOLoFlag; 2692 bool IsPIC = isPositionIndependent(); 2693 getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); 2694 2695 if (IsPIC && Subtarget.isSVR4ABI()) { 2696 SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 2697 PPCII::MO_PIC_FLAG); 2698 return getTOCEntry(DAG, SDLoc(GA), false, GA); 2699 } 2700 2701 SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag); 2702 SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag); 2703 return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG); 2704 } 2705 2706 SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, 2707 SelectionDAG &DAG) const { 2708 EVT PtrVT = Op.getValueType(); 2709 BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op); 2710 const BlockAddress *BA = BASDN->getBlockAddress(); 2711 2712 // 64-bit SVR4 ABI code is always position-independent. 2713 // The actual BlockAddress is stored in the TOC. 
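  // On 64-bit ELF the TOC_ENTRY node built below typically becomes a
  // TOC-relative access, roughly "addis rT, r2, sym@toc@ha" followed by
  // "ld rT, sym@toc@l(rT)" under the medium code model (a sketch of the
  // usual sequence rather than a statement of the exact output).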
2714 if (Subtarget.isSVR4ABI() && 2715 (Subtarget.isPPC64() || isPositionIndependent())) { 2716 if (Subtarget.isPPC64()) 2717 setUsesTOCBasePtr(DAG); 2718 SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()); 2719 return getTOCEntry(DAG, SDLoc(BASDN), Subtarget.isPPC64(), GA); 2720 } 2721 2722 unsigned MOHiFlag, MOLoFlag; 2723 bool IsPIC = isPositionIndependent(); 2724 getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag); 2725 SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag); 2726 SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag); 2727 return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG); 2728 } 2729 2730 SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, 2731 SelectionDAG &DAG) const { 2732 // FIXME: TLS addresses currently use medium model code sequences, 2733 // which is the most useful form. Eventually support for small and 2734 // large models could be added if users need it, at the cost of 2735 // additional complexity. 2736 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 2737 if (DAG.getTarget().useEmulatedTLS()) 2738 return LowerToTLSEmulatedModel(GA, DAG); 2739 2740 SDLoc dl(GA); 2741 const GlobalValue *GV = GA->getGlobal(); 2742 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2743 bool is64bit = Subtarget.isPPC64(); 2744 const Module *M = DAG.getMachineFunction().getFunction().getParent(); 2745 PICLevel::Level picLevel = M->getPICLevel(); 2746 2747 TLSModel::Model Model = getTargetMachine().getTLSModel(GV); 2748 2749 if (Model == TLSModel::LocalExec) { 2750 SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 2751 PPCII::MO_TPREL_HA); 2752 SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 2753 PPCII::MO_TPREL_LO); 2754 SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64) 2755 : DAG.getRegister(PPC::R2, MVT::i32); 2756 2757 SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); 2758 return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); 2759 } 2760 2761 if (Model == TLSModel::InitialExec) { 2762 SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); 2763 SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 2764 PPCII::MO_TLS); 2765 SDValue GOTPtr; 2766 if (is64bit) { 2767 setUsesTOCBasePtr(DAG); 2768 SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); 2769 GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, 2770 PtrVT, GOTReg, TGA); 2771 } else 2772 GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT); 2773 SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, 2774 PtrVT, TGA, GOTPtr); 2775 return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS); 2776 } 2777 2778 if (Model == TLSModel::GeneralDynamic) { 2779 SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); 2780 SDValue GOTPtr; 2781 if (is64bit) { 2782 setUsesTOCBasePtr(DAG); 2783 SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); 2784 GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, 2785 GOTReg, TGA); 2786 } else { 2787 if (picLevel == PICLevel::SmallPIC) 2788 GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); 2789 else 2790 GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); 2791 } 2792 return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT, 2793 GOTPtr, TGA, TGA); 2794 } 2795 2796 if (Model == TLSModel::LocalDynamic) { 2797 SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); 2798 SDValue GOTPtr; 2799 if (is64bit) { 2800 setUsesTOCBasePtr(DAG); 2801 SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); 2802 GOTPtr = 
DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, 2803 GOTReg, TGA); 2804 } else { 2805 if (picLevel == PICLevel::SmallPIC) 2806 GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); 2807 else 2808 GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); 2809 } 2810 SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl, 2811 PtrVT, GOTPtr, TGA, TGA); 2812 SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, 2813 PtrVT, TLSAddr, TGA); 2814 return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA); 2815 } 2816 2817 llvm_unreachable("Unknown TLS model!"); 2818 } 2819 2820 SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, 2821 SelectionDAG &DAG) const { 2822 EVT PtrVT = Op.getValueType(); 2823 GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op); 2824 SDLoc DL(GSDN); 2825 const GlobalValue *GV = GSDN->getGlobal(); 2826 2827 // 64-bit SVR4 ABI code is always position-independent. 2828 // The actual address of the GlobalValue is stored in the TOC. 2829 if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { 2830 setUsesTOCBasePtr(DAG); 2831 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset()); 2832 return getTOCEntry(DAG, DL, true, GA); 2833 } 2834 2835 unsigned MOHiFlag, MOLoFlag; 2836 bool IsPIC = isPositionIndependent(); 2837 getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV); 2838 2839 if (IsPIC && Subtarget.isSVR4ABI()) { 2840 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 2841 GSDN->getOffset(), 2842 PPCII::MO_PIC_FLAG); 2843 return getTOCEntry(DAG, DL, false, GA); 2844 } 2845 2846 SDValue GAHi = 2847 DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag); 2848 SDValue GALo = 2849 DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag); 2850 2851 SDValue Ptr = LowerLabelRef(GAHi, GALo, IsPIC, DAG); 2852 2853 // If the global reference is actually to a non-lazy-pointer, we have to do an 2854 // extra load to get the address of the global. 2855 if (MOHiFlag & PPCII::MO_NLP_FLAG) 2856 Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo()); 2857 return Ptr; 2858 } 2859 2860 SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 2861 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 2862 SDLoc dl(Op); 2863 2864 if (Op.getValueType() == MVT::v2i64) { 2865 // When the operands themselves are v2i64 values, we need to do something 2866 // special because VSX has no underlying comparison operations for these. 2867 if (Op.getOperand(0).getValueType() == MVT::v2i64) { 2868 // Equality can be handled by casting to the legal type for Altivec 2869 // comparisons, everything else needs to be expanded. 2870 if (CC == ISD::SETEQ || CC == ISD::SETNE) { 2871 return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 2872 DAG.getSetCC(dl, MVT::v4i32, 2873 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)), 2874 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)), 2875 CC)); 2876 } 2877 2878 return SDValue(); 2879 } 2880 2881 // We handle most of these in the usual way. 2882 return Op; 2883 } 2884 2885 // If we're comparing for equality to zero, expose the fact that this is 2886 // implemented as a ctlz/srl pair on ppc, so that the dag combiner can 2887 // fold the new nodes. 2888 if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG)) 2889 return V; 2890 2891 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 2892 // Leave comparisons against 0 and -1 alone for now, since they're usually 2893 // optimized. 
FIXME: revisit this when we can custom lower all setcc 2894 // optimizations. 2895 if (C->isAllOnesValue() || C->isNullValue()) 2896 return SDValue(); 2897 } 2898 2899 // If we have an integer seteq/setne, turn it into a compare against zero 2900 // by xor'ing the rhs with the lhs, which is faster than setting a 2901 // condition register, reading it back out, and masking the correct bit. The 2902 // normal approach here uses sub to do this instead of xor. Using xor exposes 2903 // the result to other bit-twiddling opportunities. 2904 EVT LHSVT = Op.getOperand(0).getValueType(); 2905 if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { 2906 EVT VT = Op.getValueType(); 2907 SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0), 2908 Op.getOperand(1)); 2909 return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC); 2910 } 2911 return SDValue(); 2912 } 2913 2914 SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 2915 SDNode *Node = Op.getNode(); 2916 EVT VT = Node->getValueType(0); 2917 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2918 SDValue InChain = Node->getOperand(0); 2919 SDValue VAListPtr = Node->getOperand(1); 2920 const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); 2921 SDLoc dl(Node); 2922 2923 assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only"); 2924 2925 // gpr_index 2926 SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, 2927 VAListPtr, MachinePointerInfo(SV), MVT::i8); 2928 InChain = GprIndex.getValue(1); 2929 2930 if (VT == MVT::i64) { 2931 // Check if GprIndex is even 2932 SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex, 2933 DAG.getConstant(1, dl, MVT::i32)); 2934 SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd, 2935 DAG.getConstant(0, dl, MVT::i32), ISD::SETNE); 2936 SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex, 2937 DAG.getConstant(1, dl, MVT::i32)); 2938 // Align GprIndex to be even if it isn't 2939 GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne, 2940 GprIndex); 2941 } 2942 2943 // fpr index is 1 byte after gpr 2944 SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, 2945 DAG.getConstant(1, dl, MVT::i32)); 2946 2947 // fpr 2948 SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain, 2949 FprPtr, MachinePointerInfo(SV), MVT::i8); 2950 InChain = FprIndex.getValue(1); 2951 2952 SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, 2953 DAG.getConstant(8, dl, MVT::i32)); 2954 2955 SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, 2956 DAG.getConstant(4, dl, MVT::i32)); 2957 2958 // areas 2959 SDValue OverflowArea = 2960 DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo()); 2961 InChain = OverflowArea.getValue(1); 2962 2963 SDValue RegSaveArea = 2964 DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo()); 2965 InChain = RegSaveArea.getValue(1); 2966 2967 // select overflow_area if index > 8 2968 SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex, 2969 DAG.getConstant(8, dl, MVT::i32), ISD::SETLT); 2970 2971 // adjustment constant gpr_index * 4/8 2972 SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32, 2973 VT.isInteger() ? GprIndex : FprIndex, 2974 DAG.getConstant(VT.isInteger() ? 
4 : 8, dl, 2975 MVT::i32)); 2976 2977 // OurReg = RegSaveArea + RegConstant 2978 SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea, 2979 RegConstant); 2980 2981 // Floating types are 32 bytes into RegSaveArea 2982 if (VT.isFloatingPoint()) 2983 OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg, 2984 DAG.getConstant(32, dl, MVT::i32)); 2985 2986 // increase {f,g}pr_index by 1 (or 2 if VT is i64) 2987 SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32, 2988 VT.isInteger() ? GprIndex : FprIndex, 2989 DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl, 2990 MVT::i32)); 2991 2992 InChain = DAG.getTruncStore(InChain, dl, IndexPlus1, 2993 VT.isInteger() ? VAListPtr : FprPtr, 2994 MachinePointerInfo(SV), MVT::i8); 2995 2996 // determine if we should load from reg_save_area or overflow_area 2997 SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea); 2998 2999 // increase overflow_area by 4/8 if gpr/fpr > 8 3000 SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea, 3001 DAG.getConstant(VT.isInteger() ? 4 : 8, 3002 dl, MVT::i32)); 3003 3004 OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea, 3005 OverflowAreaPlusN); 3006 3007 InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr, 3008 MachinePointerInfo(), MVT::i32); 3009 3010 return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo()); 3011 } 3012 3013 SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 3014 assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only"); 3015 3016 // We have to copy the entire va_list struct: 3017 // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte 3018 return DAG.getMemcpy(Op.getOperand(0), Op, 3019 Op.getOperand(1), Op.getOperand(2), 3020 DAG.getConstant(12, SDLoc(Op), MVT::i32), 8, false, true, 3021 false, MachinePointerInfo(), MachinePointerInfo()); 3022 } 3023 3024 SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, 3025 SelectionDAG &DAG) const { 3026 return Op.getOperand(0); 3027 } 3028 3029 SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, 3030 SelectionDAG &DAG) const { 3031 SDValue Chain = Op.getOperand(0); 3032 SDValue Trmp = Op.getOperand(1); // trampoline 3033 SDValue FPtr = Op.getOperand(2); // nested function 3034 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 3035 SDLoc dl(Op); 3036 3037 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3038 bool isPPC64 = (PtrVT == MVT::i64); 3039 Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); 3040 3041 TargetLowering::ArgListTy Args; 3042 TargetLowering::ArgListEntry Entry; 3043 3044 Entry.Ty = IntPtrTy; 3045 Entry.Node = Trmp; Args.push_back(Entry); 3046 3047 // TrampSize == (isPPC64 ? 48 : 40); 3048 Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl, 3049 isPPC64 ? 
MVT::i64 : MVT::i32); 3050 Args.push_back(Entry); 3051 3052 Entry.Node = FPtr; Args.push_back(Entry); 3053 Entry.Node = Nest; Args.push_back(Entry); 3054 3055 // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) 3056 TargetLowering::CallLoweringInfo CLI(DAG); 3057 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( 3058 CallingConv::C, Type::getVoidTy(*DAG.getContext()), 3059 DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args)); 3060 3061 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 3062 return CallResult.second; 3063 } 3064 3065 SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 3066 MachineFunction &MF = DAG.getMachineFunction(); 3067 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 3068 EVT PtrVT = getPointerTy(MF.getDataLayout()); 3069 3070 SDLoc dl(Op); 3071 3072 if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) { 3073 // vastart just stores the address of the VarArgsFrameIndex slot into the 3074 // memory location argument. 3075 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 3076 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 3077 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), 3078 MachinePointerInfo(SV)); 3079 } 3080 3081 // For the 32-bit SVR4 ABI we follow the layout of the va_list struct. 3082 // We suppose the given va_list is already allocated. 3083 // 3084 // typedef struct { 3085 // char gpr; /* index into the array of 8 GPRs 3086 // * stored in the register save area 3087 // * gpr=0 corresponds to r3, 3088 // * gpr=1 to r4, etc. 3089 // */ 3090 // char fpr; /* index into the array of 8 FPRs 3091 // * stored in the register save area 3092 // * fpr=0 corresponds to f1, 3093 // * fpr=1 to f2, etc. 
3094 // */ 3095 // char *overflow_arg_area; 3096 // /* location on stack that holds 3097 // * the next overflow argument 3098 // */ 3099 // char *reg_save_area; 3100 // /* where r3:r10 and f1:f8 (if saved) 3101 // * are stored 3102 // */ 3103 // } va_list[1]; 3104 3105 SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32); 3106 SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32); 3107 SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(), 3108 PtrVT); 3109 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 3110 PtrVT); 3111 3112 uint64_t FrameOffset = PtrVT.getSizeInBits()/8; 3113 SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT); 3114 3115 uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1; 3116 SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT); 3117 3118 uint64_t FPROffset = 1; 3119 SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT); 3120 3121 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 3122 3123 // Store first byte : number of int regs 3124 SDValue firstStore = 3125 DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1), 3126 MachinePointerInfo(SV), MVT::i8); 3127 uint64_t nextOffset = FPROffset; 3128 SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1), 3129 ConstFPROffset); 3130 3131 // Store second byte : number of float regs 3132 SDValue secondStore = 3133 DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr, 3134 MachinePointerInfo(SV, nextOffset), MVT::i8); 3135 nextOffset += StackOffset; 3136 nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset); 3137 3138 // Store second word : arguments given on stack 3139 SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr, 3140 MachinePointerInfo(SV, nextOffset)); 3141 nextOffset += FrameOffset; 3142 nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset); 3143 3144 // Store third word : arguments given in registers 3145 return DAG.getStore(thirdStore, dl, FR, nextPtr, 3146 MachinePointerInfo(SV, nextOffset)); 3147 } 3148 3149 /// FPR - The set of FP registers that should be allocated for arguments, 3150 /// on Darwin. 3151 static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, 3152 PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, 3153 PPC::F11, PPC::F12, PPC::F13}; 3154 3155 /// QFPR - The set of QPX registers that should be allocated for arguments. 3156 static const MCPhysReg QFPR[] = { 3157 PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, 3158 PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13}; 3159 3160 /// CalculateStackSlotSize - Calculates the size reserved for this argument on 3161 /// the stack. 3162 static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, 3163 unsigned PtrByteSize) { 3164 unsigned ArgSize = ArgVT.getStoreSize(); 3165 if (Flags.isByVal()) 3166 ArgSize = Flags.getByValSize(); 3167 3168 // Round up to multiples of the pointer size, except for array members, 3169 // which are always packed. 3170 if (!Flags.isInConsecutiveRegs()) 3171 ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 3172 3173 return ArgSize; 3174 } 3175 3176 /// CalculateStackSlotAlignment - Calculates the alignment of this argument 3177 /// on the stack. 
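/// For illustration: Altivec/VSX vector arguments such as v4f32 are padded to
/// a 16-byte boundary and QPX vectors such as v4f64 to a 32-byte boundary,
/// while most other arguments fall back to the pointer size; byval and split
/// array arguments are then adjusted as described in the body below.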
3178 static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, 3179 ISD::ArgFlagsTy Flags, 3180 unsigned PtrByteSize) { 3181 unsigned Align = PtrByteSize; 3182 3183 // Altivec parameters are padded to a 16 byte boundary. 3184 if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || 3185 ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || 3186 ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || 3187 ArgVT == MVT::v1i128 || ArgVT == MVT::f128) 3188 Align = 16; 3189 // QPX vector types stored in double-precision are padded to a 32 byte 3190 // boundary. 3191 else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1) 3192 Align = 32; 3193 3194 // ByVal parameters are aligned as requested. 3195 if (Flags.isByVal()) { 3196 unsigned BVAlign = Flags.getByValAlign(); 3197 if (BVAlign > PtrByteSize) { 3198 if (BVAlign % PtrByteSize != 0) 3199 llvm_unreachable( 3200 "ByVal alignment is not a multiple of the pointer size"); 3201 3202 Align = BVAlign; 3203 } 3204 } 3205 3206 // Array members are always packed to their original alignment. 3207 if (Flags.isInConsecutiveRegs()) { 3208 // If the array member was split into multiple registers, the first 3209 // needs to be aligned to the size of the full type. (Except for 3210 // ppcf128, which is only aligned as its f64 components.) 3211 if (Flags.isSplit() && OrigVT != MVT::ppcf128) 3212 Align = OrigVT.getStoreSize(); 3213 else 3214 Align = ArgVT.getStoreSize(); 3215 } 3216 3217 return Align; 3218 } 3219 3220 /// CalculateStackSlotUsed - Return whether this argument will use its 3221 /// stack slot (instead of being passed in registers). ArgOffset, 3222 /// AvailableFPRs, and AvailableVRs must hold the current argument 3223 /// position, and will be updated to account for this argument. 3224 static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, 3225 ISD::ArgFlagsTy Flags, 3226 unsigned PtrByteSize, 3227 unsigned LinkageSize, 3228 unsigned ParamAreaSize, 3229 unsigned &ArgOffset, 3230 unsigned &AvailableFPRs, 3231 unsigned &AvailableVRs, bool HasQPX) { 3232 bool UseMemory = false; 3233 3234 // Respect alignment of argument on the stack. 3235 unsigned Align = 3236 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); 3237 ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; 3238 // If there's no space left in the argument save area, we must 3239 // use memory (this check also catches zero-sized arguments). 3240 if (ArgOffset >= LinkageSize + ParamAreaSize) 3241 UseMemory = true; 3242 3243 // Allocate argument on the stack. 3244 ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); 3245 if (Flags.isInConsecutiveRegsLast()) 3246 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 3247 // If we overran the argument save area, we must use memory 3248 // (this check catches arguments passed partially in memory) 3249 if (ArgOffset > LinkageSize + ParamAreaSize) 3250 UseMemory = true; 3251 3252 // However, if the argument is actually passed in an FPR or a VR, 3253 // we don't use memory after all. 3254 if (!Flags.isByVal()) { 3255 if (ArgVT == MVT::f32 || ArgVT == MVT::f64 || 3256 // QPX registers overlap with the scalar FP registers. 
3257 (HasQPX && (ArgVT == MVT::v4f32 || 3258 ArgVT == MVT::v4f64 || 3259 ArgVT == MVT::v4i1))) 3260 if (AvailableFPRs > 0) { 3261 --AvailableFPRs; 3262 return false; 3263 } 3264 if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || 3265 ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || 3266 ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || 3267 ArgVT == MVT::v1i128 || ArgVT == MVT::f128) 3268 if (AvailableVRs > 0) { 3269 --AvailableVRs; 3270 return false; 3271 } 3272 } 3273 3274 return UseMemory; 3275 } 3276 3277 /// EnsureStackAlignment - Round stack frame size up from NumBytes to 3278 /// ensure minimum alignment required for target. 3279 static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering, 3280 unsigned NumBytes) { 3281 unsigned TargetAlign = Lowering->getStackAlignment(); 3282 unsigned AlignMask = TargetAlign - 1; 3283 NumBytes = (NumBytes + AlignMask) & ~AlignMask; 3284 return NumBytes; 3285 } 3286 3287 SDValue PPCTargetLowering::LowerFormalArguments( 3288 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3289 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3290 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3291 if (Subtarget.isSVR4ABI()) { 3292 if (Subtarget.isPPC64()) 3293 return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, 3294 dl, DAG, InVals); 3295 else 3296 return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, 3297 dl, DAG, InVals); 3298 } else { 3299 return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, 3300 dl, DAG, InVals); 3301 } 3302 } 3303 3304 SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( 3305 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3306 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3307 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3308 3309 // 32-bit SVR4 ABI Stack Frame Layout: 3310 // +-----------------------------------+ 3311 // +--> | Back chain | 3312 // | +-----------------------------------+ 3313 // | | Floating-point register save area | 3314 // | +-----------------------------------+ 3315 // | | General register save area | 3316 // | +-----------------------------------+ 3317 // | | CR save word | 3318 // | +-----------------------------------+ 3319 // | | VRSAVE save word | 3320 // | +-----------------------------------+ 3321 // | | Alignment padding | 3322 // | +-----------------------------------+ 3323 // | | Vector register save area | 3324 // | +-----------------------------------+ 3325 // | | Local variable space | 3326 // | +-----------------------------------+ 3327 // | | Parameter list area | 3328 // | +-----------------------------------+ 3329 // | | LR save word | 3330 // | +-----------------------------------+ 3331 // SP--> +--- | Back chain | 3332 // +-----------------------------------+ 3333 // 3334 // Specifications: 3335 // System V Application Binary Interface PowerPC Processor Supplement 3336 // AltiVec Technology Programming Interface Manual 3337 3338 MachineFunction &MF = DAG.getMachineFunction(); 3339 MachineFrameInfo &MFI = MF.getFrameInfo(); 3340 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 3341 3342 EVT PtrVT = getPointerTy(MF.getDataLayout()); 3343 // Potential tail calls could cause overwriting of argument stack slots. 3344 bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && 3345 (CallConv == CallingConv::Fast)); 3346 unsigned PtrByteSize = 4; 3347 3348 // Assign locations to all of the incoming arguments. 
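  // (Illustrative example, assuming hardware FP rather than SPE/soft-float:
  // for a callee such as "int f(int a, double b)", the CC_PPC32_SVR4 analysis
  // below would typically assign 'a' to R3 and 'b' to F1, so neither argument
  // needs a stack slot.)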
3349 SmallVector<CCValAssign, 16> ArgLocs; 3350 PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 3351 *DAG.getContext()); 3352 3353 // Reserve space for the linkage area on the stack. 3354 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); 3355 CCInfo.AllocateStack(LinkageSize, PtrByteSize); 3356 if (useSoftFloat() || hasSPE()) 3357 CCInfo.PreAnalyzeFormalArguments(Ins); 3358 3359 CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); 3360 CCInfo.clearWasPPCF128(); 3361 3362 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3363 CCValAssign &VA = ArgLocs[i]; 3364 3365 // Arguments stored in registers. 3366 if (VA.isRegLoc()) { 3367 const TargetRegisterClass *RC; 3368 EVT ValVT = VA.getValVT(); 3369 3370 switch (ValVT.getSimpleVT().SimpleTy) { 3371 default: 3372 llvm_unreachable("ValVT not supported by formal arguments Lowering"); 3373 case MVT::i1: 3374 case MVT::i32: 3375 RC = &PPC::GPRCRegClass; 3376 break; 3377 case MVT::f32: 3378 if (Subtarget.hasP8Vector()) 3379 RC = &PPC::VSSRCRegClass; 3380 else if (Subtarget.hasSPE()) 3381 RC = &PPC::SPE4RCRegClass; 3382 else 3383 RC = &PPC::F4RCRegClass; 3384 break; 3385 case MVT::f64: 3386 if (Subtarget.hasVSX()) 3387 RC = &PPC::VSFRCRegClass; 3388 else if (Subtarget.hasSPE()) 3389 RC = &PPC::SPERCRegClass; 3390 else 3391 RC = &PPC::F8RCRegClass; 3392 break; 3393 case MVT::v16i8: 3394 case MVT::v8i16: 3395 case MVT::v4i32: 3396 RC = &PPC::VRRCRegClass; 3397 break; 3398 case MVT::v4f32: 3399 RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass; 3400 break; 3401 case MVT::v2f64: 3402 case MVT::v2i64: 3403 RC = &PPC::VRRCRegClass; 3404 break; 3405 case MVT::v4f64: 3406 RC = &PPC::QFRCRegClass; 3407 break; 3408 case MVT::v4i1: 3409 RC = &PPC::QBRCRegClass; 3410 break; 3411 } 3412 3413 // Transform the arguments stored in physical registers into virtual ones. 3414 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3415 SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, 3416 ValVT == MVT::i1 ? MVT::i32 : ValVT); 3417 3418 if (ValVT == MVT::i1) 3419 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue); 3420 3421 InVals.push_back(ArgValue); 3422 } else { 3423 // Argument stored in memory. 3424 assert(VA.isMemLoc()); 3425 3426 // Get the extended size of the argument type in stack 3427 unsigned ArgSize = VA.getLocVT().getStoreSize(); 3428 // Get the actual size of the argument type 3429 unsigned ObjSize = VA.getValVT().getStoreSize(); 3430 unsigned ArgOffset = VA.getLocMemOffset(); 3431 // Stack objects in PPC32 are right justified. 3432 ArgOffset += ArgSize - ObjSize; 3433 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, isImmutable); 3434 3435 // Create load nodes to retrieve arguments from the stack. 3436 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3437 InVals.push_back( 3438 DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo())); 3439 } 3440 } 3441 3442 // Assign locations to all of the incoming aggregate by value arguments. 3443 // Aggregates passed by value are stored in the local variable space of the 3444 // caller's stack frame, right above the parameter list area. 3445 SmallVector<CCValAssign, 16> ByValArgLocs; 3446 CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), 3447 ByValArgLocs, *DAG.getContext()); 3448 3449 // Reserve stack space for the allocations in CCInfo. 
3450 CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); 3451 3452 CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal); 3453 3454 // Area that is at least reserved in the caller of this function. 3455 unsigned MinReservedArea = CCByValInfo.getNextStackOffset(); 3456 MinReservedArea = std::max(MinReservedArea, LinkageSize); 3457 3458 // Set the size that is at least reserved in caller of this function. Tail 3459 // call optimized function's reserved stack space needs to be aligned so that 3460 // taking the difference between two stack areas will result in an aligned 3461 // stack. 3462 MinReservedArea = 3463 EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); 3464 FuncInfo->setMinReservedArea(MinReservedArea); 3465 3466 SmallVector<SDValue, 8> MemOps; 3467 3468 // If the function takes variable number of arguments, make a frame index for 3469 // the start of the first vararg value... for expansion of llvm.va_start. 3470 if (isVarArg) { 3471 static const MCPhysReg GPArgRegs[] = { 3472 PPC::R3, PPC::R4, PPC::R5, PPC::R6, 3473 PPC::R7, PPC::R8, PPC::R9, PPC::R10, 3474 }; 3475 const unsigned NumGPArgRegs = array_lengthof(GPArgRegs); 3476 3477 static const MCPhysReg FPArgRegs[] = { 3478 PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, 3479 PPC::F8 3480 }; 3481 unsigned NumFPArgRegs = array_lengthof(FPArgRegs); 3482 3483 if (useSoftFloat() || hasSPE()) 3484 NumFPArgRegs = 0; 3485 3486 FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs)); 3487 FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs)); 3488 3489 // Make room for NumGPArgRegs and NumFPArgRegs. 3490 int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 + 3491 NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8; 3492 3493 FuncInfo->setVarArgsStackOffset( 3494 MFI.CreateFixedObject(PtrVT.getSizeInBits()/8, 3495 CCInfo.getNextStackOffset(), true)); 3496 3497 FuncInfo->setVarArgsFrameIndex(MFI.CreateStackObject(Depth, 8, false)); 3498 SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 3499 3500 // The fixed integer arguments of a variadic function are stored to the 3501 // VarArgsFrameIndex on the stack so that they may be loaded by 3502 // dereferencing the result of va_next. 3503 for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) { 3504 // Get an existing live-in vreg, or add a new one. 3505 unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]); 3506 if (!VReg) 3507 VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass); 3508 3509 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 3510 SDValue Store = 3511 DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); 3512 MemOps.push_back(Store); 3513 // Increment the address by four for the next argument to store 3514 SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT); 3515 FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); 3516 } 3517 3518 // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6 3519 // is set. 3520 // The double arguments are stored to the VarArgsFrameIndex 3521 // on the stack. 3522 for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) { 3523 // Get an existing live-in vreg, or add a new one. 
3524 unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]); 3525 if (!VReg) 3526 VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass); 3527 3528 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64); 3529 SDValue Store = 3530 DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); 3531 MemOps.push_back(Store); 3532 // Increment the address by eight for the next argument to store 3533 SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl, 3534 PtrVT); 3535 FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); 3536 } 3537 } 3538 3539 if (!MemOps.empty()) 3540 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 3541 3542 return Chain; 3543 } 3544 3545 // PPC64 passes i8, i16, and i32 values in i64 registers. Promote 3546 // value to MVT::i64 and then truncate to the correct register size. 3547 SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, 3548 EVT ObjectVT, SelectionDAG &DAG, 3549 SDValue ArgVal, 3550 const SDLoc &dl) const { 3551 if (Flags.isSExt()) 3552 ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal, 3553 DAG.getValueType(ObjectVT)); 3554 else if (Flags.isZExt()) 3555 ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal, 3556 DAG.getValueType(ObjectVT)); 3557 3558 return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal); 3559 } 3560 3561 SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( 3562 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3563 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3564 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3565 // TODO: add description of PPC stack frame format, or at least some docs. 3566 // 3567 bool isELFv2ABI = Subtarget.isELFv2ABI(); 3568 bool isLittleEndian = Subtarget.isLittleEndian(); 3569 MachineFunction &MF = DAG.getMachineFunction(); 3570 MachineFrameInfo &MFI = MF.getFrameInfo(); 3571 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 3572 3573 assert(!(CallConv == CallingConv::Fast && isVarArg) && 3574 "fastcc not supported on varargs functions"); 3575 3576 EVT PtrVT = getPointerTy(MF.getDataLayout()); 3577 // Potential tail calls could cause overwriting of argument stack slots. 3578 bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && 3579 (CallConv == CallingConv::Fast)); 3580 unsigned PtrByteSize = 8; 3581 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); 3582 3583 static const MCPhysReg GPR[] = { 3584 PPC::X3, PPC::X4, PPC::X5, PPC::X6, 3585 PPC::X7, PPC::X8, PPC::X9, PPC::X10, 3586 }; 3587 static const MCPhysReg VR[] = { 3588 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, 3589 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 3590 }; 3591 3592 const unsigned Num_GPR_Regs = array_lengthof(GPR); 3593 const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; 3594 const unsigned Num_VR_Regs = array_lengthof(VR); 3595 const unsigned Num_QFPR_Regs = Num_FPR_Regs; 3596 3597 // Do a first pass over the arguments to determine whether the ABI 3598 // guarantees that our caller has allocated the parameter save area 3599 // on its stack frame. In the ELFv1 ABI, this is always the case; 3600 // in the ELFv2 ABI, it is true if this is a vararg function or if 3601 // any parameter is located in a stack slot. 
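  // For example (illustrative prototypes only): under ELFv2 a callee such as
  //   void callee(long a, long b);   // everything fits in registers
  // does not oblige its caller to allocate the parameter save area, whereas a
  // vararg callee or one that receives any argument in a stack slot does.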
3602 3603 bool HasParameterArea = !isELFv2ABI || isVarArg; 3604 unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize; 3605 unsigned NumBytes = LinkageSize; 3606 unsigned AvailableFPRs = Num_FPR_Regs; 3607 unsigned AvailableVRs = Num_VR_Regs; 3608 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 3609 if (Ins[i].Flags.isNest()) 3610 continue; 3611 3612 if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, 3613 PtrByteSize, LinkageSize, ParamAreaSize, 3614 NumBytes, AvailableFPRs, AvailableVRs, 3615 Subtarget.hasQPX())) 3616 HasParameterArea = true; 3617 } 3618 3619 // Add DAG nodes to load the arguments or copy them out of registers. On 3620 // entry to a function on PPC, the arguments start after the linkage area, 3621 // although the first ones are often in registers. 3622 3623 unsigned ArgOffset = LinkageSize; 3624 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; 3625 unsigned &QFPR_idx = FPR_idx; 3626 SmallVector<SDValue, 8> MemOps; 3627 Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); 3628 unsigned CurArgIdx = 0; 3629 for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { 3630 SDValue ArgVal; 3631 bool needsLoad = false; 3632 EVT ObjectVT = Ins[ArgNo].VT; 3633 EVT OrigVT = Ins[ArgNo].ArgVT; 3634 unsigned ObjSize = ObjectVT.getStoreSize(); 3635 unsigned ArgSize = ObjSize; 3636 ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; 3637 if (Ins[ArgNo].isOrigArg()) { 3638 std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); 3639 CurArgIdx = Ins[ArgNo].getOrigArgIndex(); 3640 } 3641 // We re-align the argument offset for each argument, except when using the 3642 // fast calling convention, when we need to make sure we do that only when 3643 // we'll actually use a stack slot. 3644 unsigned CurArgOffset, Align; 3645 auto ComputeArgOffset = [&]() { 3646 /* Respect alignment of argument on the stack. */ 3647 Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize); 3648 ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; 3649 CurArgOffset = ArgOffset; 3650 }; 3651 3652 if (CallConv != CallingConv::Fast) { 3653 ComputeArgOffset(); 3654 3655 /* Compute GPR index associated with argument offset. */ 3656 GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; 3657 GPR_idx = std::min(GPR_idx, Num_GPR_Regs); 3658 } 3659 3660 // FIXME the codegen can be much improved in some cases. 3661 // We do not have to keep everything in memory. 3662 if (Flags.isByVal()) { 3663 assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); 3664 3665 if (CallConv == CallingConv::Fast) 3666 ComputeArgOffset(); 3667 3668 // ObjSize is the true size, ArgSize rounded up to multiple of registers. 3669 ObjSize = Flags.getByValSize(); 3670 ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 3671 // Empty aggregate parameters do not take up registers. Examples: 3672 // struct { } a; 3673 // union { } b; 3674 // int c[0]; 3675 // etc. However, we have to provide a place-holder in InVals, so 3676 // pretend we have an 8-byte item at the current address for that 3677 // purpose. 3678 if (!ObjSize) { 3679 int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true); 3680 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3681 InVals.push_back(FIN); 3682 continue; 3683 } 3684 3685 // Create a stack object covering all stack doublewords occupied 3686 // by the argument. 
If the argument is (fully or partially) on 3687 // the stack, or if the argument is fully in registers but the 3688 // caller has allocated the parameter save anyway, we can refer 3689 // directly to the caller's stack frame. Otherwise, create a 3690 // local copy in our own frame. 3691 int FI; 3692 if (HasParameterArea || 3693 ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize) 3694 FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true); 3695 else 3696 FI = MFI.CreateStackObject(ArgSize, Align, false); 3697 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3698 3699 // Handle aggregates smaller than 8 bytes. 3700 if (ObjSize < PtrByteSize) { 3701 // The value of the object is its address, which differs from the 3702 // address of the enclosing doubleword on big-endian systems. 3703 SDValue Arg = FIN; 3704 if (!isLittleEndian) { 3705 SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT); 3706 Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff); 3707 } 3708 InVals.push_back(Arg); 3709 3710 if (GPR_idx != Num_GPR_Regs) { 3711 unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); 3712 FuncInfo->addLiveInAttr(VReg, Flags); 3713 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 3714 SDValue Store; 3715 3716 if (ObjSize==1 || ObjSize==2 || ObjSize==4) { 3717 EVT ObjType = (ObjSize == 1 ? MVT::i8 : 3718 (ObjSize == 2 ? MVT::i16 : MVT::i32)); 3719 Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg, 3720 MachinePointerInfo(&*FuncArg), ObjType); 3721 } else { 3722 // For sizes that don't fit a truncating store (3, 5, 6, 7), 3723 // store the whole register as-is to the parameter save area 3724 // slot. 3725 Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, 3726 MachinePointerInfo(&*FuncArg)); 3727 } 3728 3729 MemOps.push_back(Store); 3730 } 3731 // Whether we copied from a register or not, advance the offset 3732 // into the parameter save area by a full doubleword. 3733 ArgOffset += PtrByteSize; 3734 continue; 3735 } 3736 3737 // The value of the object is its address, which is the address of 3738 // its first stack doubleword. 3739 InVals.push_back(FIN); 3740 3741 // Store whatever pieces of the object are in registers to memory. 3742 for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { 3743 if (GPR_idx == Num_GPR_Regs) 3744 break; 3745 3746 unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 3747 FuncInfo->addLiveInAttr(VReg, Flags); 3748 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 3749 SDValue Addr = FIN; 3750 if (j) { 3751 SDValue Off = DAG.getConstant(j, dl, PtrVT); 3752 Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); 3753 } 3754 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr, 3755 MachinePointerInfo(&*FuncArg, j)); 3756 MemOps.push_back(Store); 3757 ++GPR_idx; 3758 } 3759 ArgOffset += ArgSize; 3760 continue; 3761 } 3762 3763 switch (ObjectVT.getSimpleVT().SimpleTy) { 3764 default: llvm_unreachable("Unhandled argument type!"); 3765 case MVT::i1: 3766 case MVT::i32: 3767 case MVT::i64: 3768 if (Flags.isNest()) { 3769 // The 'nest' parameter, if any, is passed in R11. 3770 unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass); 3771 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 3772 3773 if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) 3774 ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); 3775 3776 break; 3777 } 3778 3779 // These can be scalar arguments or elements of an integer array type 3780 // passed directly. 
Clang may use those instead of "byval" aggregate 3781 // types to avoid forcing arguments to memory unnecessarily. 3782 if (GPR_idx != Num_GPR_Regs) { 3783 unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); 3784 FuncInfo->addLiveInAttr(VReg, Flags); 3785 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 3786 3787 if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) 3788 // PPC64 passes i8, i16, and i32 values in i64 registers. Promote 3789 // value to MVT::i64 and then truncate to the correct register size. 3790 ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); 3791 } else { 3792 if (CallConv == CallingConv::Fast) 3793 ComputeArgOffset(); 3794 3795 needsLoad = true; 3796 ArgSize = PtrByteSize; 3797 } 3798 if (CallConv != CallingConv::Fast || needsLoad) 3799 ArgOffset += 8; 3800 break; 3801 3802 case MVT::f32: 3803 case MVT::f64: 3804 // These can be scalar arguments or elements of a float array type 3805 // passed directly. The latter are used to implement ELFv2 homogenous 3806 // float aggregates. 3807 if (FPR_idx != Num_FPR_Regs) { 3808 unsigned VReg; 3809 3810 if (ObjectVT == MVT::f32) 3811 VReg = MF.addLiveIn(FPR[FPR_idx], 3812 Subtarget.hasP8Vector() 3813 ? &PPC::VSSRCRegClass 3814 : &PPC::F4RCRegClass); 3815 else 3816 VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() 3817 ? &PPC::VSFRCRegClass 3818 : &PPC::F8RCRegClass); 3819 3820 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 3821 ++FPR_idx; 3822 } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) { 3823 // FIXME: We may want to re-enable this for CallingConv::Fast on the P8 3824 // once we support fp <-> gpr moves. 3825 3826 // This can only ever happen in the presence of f32 array types, 3827 // since otherwise we never run out of FPRs before running out 3828 // of GPRs. 3829 unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass); 3830 FuncInfo->addLiveInAttr(VReg, Flags); 3831 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 3832 3833 if (ObjectVT == MVT::f32) { 3834 if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0)) 3835 ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal, 3836 DAG.getConstant(32, dl, MVT::i32)); 3837 ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); 3838 } 3839 3840 ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal); 3841 } else { 3842 if (CallConv == CallingConv::Fast) 3843 ComputeArgOffset(); 3844 3845 needsLoad = true; 3846 } 3847 3848 // When passing an array of floats, the array occupies consecutive 3849 // space in the argument area; only round up to the next doubleword 3850 // at the end of the array. Otherwise, each float takes 8 bytes. 3851 if (CallConv != CallingConv::Fast || needsLoad) { 3852 ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize; 3853 ArgOffset += ArgSize; 3854 if (Flags.isInConsecutiveRegsLast()) 3855 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 3856 } 3857 break; 3858 case MVT::v4f32: 3859 case MVT::v4i32: 3860 case MVT::v8i16: 3861 case MVT::v16i8: 3862 case MVT::v2f64: 3863 case MVT::v2i64: 3864 case MVT::v1i128: 3865 case MVT::f128: 3866 if (!Subtarget.hasQPX()) { 3867 // These can be scalar arguments or elements of a vector array type 3868 // passed directly. The latter are used to implement ELFv2 homogenous 3869 // vector aggregates. 
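      // For illustration, a homogeneous vector aggregate might look like
      //   struct HVA { vector float a, b; };   // hypothetical source type
      // whose elements reach this point as consecutive vector arguments and
      // are assigned to VRs for as long as any remain available.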
3870 if (VR_idx != Num_VR_Regs) { 3871 unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); 3872 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 3873 ++VR_idx; 3874 } else { 3875 if (CallConv == CallingConv::Fast) 3876 ComputeArgOffset(); 3877 needsLoad = true; 3878 } 3879 if (CallConv != CallingConv::Fast || needsLoad) 3880 ArgOffset += 16; 3881 break; 3882 } // not QPX 3883 3884 assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 && 3885 "Invalid QPX parameter type"); 3886 LLVM_FALLTHROUGH; 3887 3888 case MVT::v4f64: 3889 case MVT::v4i1: 3890 // QPX vectors are treated like their scalar floating-point subregisters 3891 // (except that they're larger). 3892 unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 16 : 32; 3893 if (QFPR_idx != Num_QFPR_Regs) { 3894 const TargetRegisterClass *RC; 3895 switch (ObjectVT.getSimpleVT().SimpleTy) { 3896 case MVT::v4f64: RC = &PPC::QFRCRegClass; break; 3897 case MVT::v4f32: RC = &PPC::QSRCRegClass; break; 3898 default: RC = &PPC::QBRCRegClass; break; 3899 } 3900 3901 unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC); 3902 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 3903 ++QFPR_idx; 3904 } else { 3905 if (CallConv == CallingConv::Fast) 3906 ComputeArgOffset(); 3907 needsLoad = true; 3908 } 3909 if (CallConv != CallingConv::Fast || needsLoad) 3910 ArgOffset += Sz; 3911 break; 3912 } 3913 3914 // We need to load the argument to a virtual register if we determined 3915 // above that we ran out of physical registers of the appropriate type. 3916 if (needsLoad) { 3917 if (ObjSize < ArgSize && !isLittleEndian) 3918 CurArgOffset += ArgSize - ObjSize; 3919 int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable); 3920 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 3921 ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo()); 3922 } 3923 3924 InVals.push_back(ArgVal); 3925 } 3926 3927 // Area that is at least reserved in the caller of this function. 3928 unsigned MinReservedArea; 3929 if (HasParameterArea) 3930 MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize); 3931 else 3932 MinReservedArea = LinkageSize; 3933 3934 // Set the size that is at least reserved in caller of this function. Tail 3935 // call optimized functions' reserved stack space needs to be aligned so that 3936 // taking the difference between two stack areas will result in an aligned 3937 // stack. 3938 MinReservedArea = 3939 EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); 3940 FuncInfo->setMinReservedArea(MinReservedArea); 3941 3942 // If the function takes variable number of arguments, make a frame index for 3943 // the start of the first vararg value... for expansion of llvm.va_start. 3944 if (isVarArg) { 3945 int Depth = ArgOffset; 3946 3947 FuncInfo->setVarArgsFrameIndex( 3948 MFI.CreateFixedObject(PtrByteSize, Depth, true)); 3949 SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 3950 3951 // If this function is vararg, store any remaining integer argument regs 3952 // to their spots on the stack so that they may be loaded by dereferencing 3953 // the result of va_next. 
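    // (Illustrative note: on 64-bit SVR4 va_list is essentially a pointer
    // that walks the parameter save area, so a hypothetical variadic callee
    // such as
    //   long sum(int n, ...);
    // only sees variadic arguments that arrived in GPRs because the loop
    // below spills them to their home slots.)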
3954 for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; 3955 GPR_idx < Num_GPR_Regs; ++GPR_idx) { 3956 unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 3957 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 3958 SDValue Store = 3959 DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); 3960 MemOps.push_back(Store); 3961 // Increment the address by four for the next argument to store 3962 SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT); 3963 FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); 3964 } 3965 } 3966 3967 if (!MemOps.empty()) 3968 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 3969 3970 return Chain; 3971 } 3972 3973 SDValue PPCTargetLowering::LowerFormalArguments_Darwin( 3974 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3975 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3976 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3977 // TODO: add description of PPC stack frame format, or at least some docs. 3978 // 3979 MachineFunction &MF = DAG.getMachineFunction(); 3980 MachineFrameInfo &MFI = MF.getFrameInfo(); 3981 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 3982 3983 EVT PtrVT = getPointerTy(MF.getDataLayout()); 3984 bool isPPC64 = PtrVT == MVT::i64; 3985 // Potential tail calls could cause overwriting of argument stack slots. 3986 bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && 3987 (CallConv == CallingConv::Fast)); 3988 unsigned PtrByteSize = isPPC64 ? 8 : 4; 3989 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); 3990 unsigned ArgOffset = LinkageSize; 3991 // Area that is at least reserved in caller of this function. 3992 unsigned MinReservedArea = ArgOffset; 3993 3994 static const MCPhysReg GPR_32[] = { // 32-bit registers. 3995 PPC::R3, PPC::R4, PPC::R5, PPC::R6, 3996 PPC::R7, PPC::R8, PPC::R9, PPC::R10, 3997 }; 3998 static const MCPhysReg GPR_64[] = { // 64-bit registers. 3999 PPC::X3, PPC::X4, PPC::X5, PPC::X6, 4000 PPC::X7, PPC::X8, PPC::X9, PPC::X10, 4001 }; 4002 static const MCPhysReg VR[] = { 4003 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, 4004 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 4005 }; 4006 4007 const unsigned Num_GPR_Regs = array_lengthof(GPR_32); 4008 const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; 4009 const unsigned Num_VR_Regs = array_lengthof( VR); 4010 4011 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; 4012 4013 const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; 4014 4015 // In 32-bit non-varargs functions, the stack space for vectors is after the 4016 // stack space for non-vectors. We do not use this space unless we have 4017 // too many vectors to fit in registers, something that only occurs in 4018 // constructed examples:), but we have to walk the arglist to figure 4019 // that out...for the pathological case, compute VecArgOffset as the 4020 // start of the vector parameter area. Computing VecArgOffset is the 4021 // entire point of the following loop. 4022 unsigned VecArgOffset = ArgOffset; 4023 if (!isVarArg && !isPPC64) { 4024 for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; 4025 ++ArgNo) { 4026 EVT ObjectVT = Ins[ArgNo].VT; 4027 ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; 4028 4029 if (Flags.isByVal()) { 4030 // ObjSize is the true size, ArgSize rounded up to multiple of regs. 
4031 unsigned ObjSize = Flags.getByValSize(); 4032 unsigned ArgSize = 4033 ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 4034 VecArgOffset += ArgSize; 4035 continue; 4036 } 4037 4038 switch(ObjectVT.getSimpleVT().SimpleTy) { 4039 default: llvm_unreachable("Unhandled argument type!"); 4040 case MVT::i1: 4041 case MVT::i32: 4042 case MVT::f32: 4043 VecArgOffset += 4; 4044 break; 4045 case MVT::i64: // PPC64 4046 case MVT::f64: 4047 // FIXME: We are guaranteed to be !isPPC64 at this point. 4048 // Does MVT::i64 apply? 4049 VecArgOffset += 8; 4050 break; 4051 case MVT::v4f32: 4052 case MVT::v4i32: 4053 case MVT::v8i16: 4054 case MVT::v16i8: 4055 // Nothing to do, we're only looking at Nonvector args here. 4056 break; 4057 } 4058 } 4059 } 4060 // We've found where the vector parameter area in memory is. Skip the 4061 // first 12 parameters; these don't use that memory. 4062 VecArgOffset = ((VecArgOffset+15)/16)*16; 4063 VecArgOffset += 12*16; 4064 4065 // Add DAG nodes to load the arguments or copy them out of registers. On 4066 // entry to a function on PPC, the arguments start after the linkage area, 4067 // although the first ones are often in registers. 4068 4069 SmallVector<SDValue, 8> MemOps; 4070 unsigned nAltivecParamsAtEnd = 0; 4071 Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); 4072 unsigned CurArgIdx = 0; 4073 for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { 4074 SDValue ArgVal; 4075 bool needsLoad = false; 4076 EVT ObjectVT = Ins[ArgNo].VT; 4077 unsigned ObjSize = ObjectVT.getSizeInBits()/8; 4078 unsigned ArgSize = ObjSize; 4079 ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; 4080 if (Ins[ArgNo].isOrigArg()) { 4081 std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); 4082 CurArgIdx = Ins[ArgNo].getOrigArgIndex(); 4083 } 4084 unsigned CurArgOffset = ArgOffset; 4085 4086 // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary. 4087 if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 || 4088 ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) { 4089 if (isVarArg || isPPC64) { 4090 MinReservedArea = ((MinReservedArea+15)/16)*16; 4091 MinReservedArea += CalculateStackSlotSize(ObjectVT, 4092 Flags, 4093 PtrByteSize); 4094 } else nAltivecParamsAtEnd++; 4095 } else 4096 // Calculate min reserved area. 4097 MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT, 4098 Flags, 4099 PtrByteSize); 4100 4101 // FIXME the codegen can be much improved in some cases. 4102 // We do not have to keep everything in memory. 4103 if (Flags.isByVal()) { 4104 assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); 4105 4106 // ObjSize is the true size, ArgSize rounded up to multiple of registers. 4107 ObjSize = Flags.getByValSize(); 4108 ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 4109 // Objects of size 1 and 2 are right justified, everything else is 4110 // left justified. This means the memory address is adjusted forwards. 4111 if (ObjSize==1 || ObjSize==2) { 4112 CurArgOffset = CurArgOffset + (4 - ObjSize); 4113 } 4114 // The value of the object is its address. 
4115 int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, false, true); 4116 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4117 InVals.push_back(FIN); 4118 if (ObjSize==1 || ObjSize==2) { 4119 if (GPR_idx != Num_GPR_Regs) { 4120 unsigned VReg; 4121 if (isPPC64) 4122 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 4123 else 4124 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 4125 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 4126 EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16; 4127 SDValue Store = 4128 DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, 4129 MachinePointerInfo(&*FuncArg), ObjType); 4130 MemOps.push_back(Store); 4131 ++GPR_idx; 4132 } 4133 4134 ArgOffset += PtrByteSize; 4135 4136 continue; 4137 } 4138 for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { 4139 // Store whatever pieces of the object are in registers 4140 // to memory. ArgOffset will be the address of the beginning 4141 // of the object. 4142 if (GPR_idx != Num_GPR_Regs) { 4143 unsigned VReg; 4144 if (isPPC64) 4145 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 4146 else 4147 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 4148 int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true); 4149 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4150 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 4151 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, 4152 MachinePointerInfo(&*FuncArg, j)); 4153 MemOps.push_back(Store); 4154 ++GPR_idx; 4155 ArgOffset += PtrByteSize; 4156 } else { 4157 ArgOffset += ArgSize - (ArgOffset-CurArgOffset); 4158 break; 4159 } 4160 } 4161 continue; 4162 } 4163 4164 switch (ObjectVT.getSimpleVT().SimpleTy) { 4165 default: llvm_unreachable("Unhandled argument type!"); 4166 case MVT::i1: 4167 case MVT::i32: 4168 if (!isPPC64) { 4169 if (GPR_idx != Num_GPR_Regs) { 4170 unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 4171 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); 4172 4173 if (ObjectVT == MVT::i1) 4174 ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal); 4175 4176 ++GPR_idx; 4177 } else { 4178 needsLoad = true; 4179 ArgSize = PtrByteSize; 4180 } 4181 // All int arguments reserve stack space in the Darwin ABI. 4182 ArgOffset += PtrByteSize; 4183 break; 4184 } 4185 LLVM_FALLTHROUGH; 4186 case MVT::i64: // PPC64 4187 if (GPR_idx != Num_GPR_Regs) { 4188 unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 4189 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 4190 4191 if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) 4192 // PPC64 passes i8, i16, and i32 values in i64 registers. Promote 4193 // value to MVT::i64 and then truncate to the correct register size. 4194 ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); 4195 4196 ++GPR_idx; 4197 } else { 4198 needsLoad = true; 4199 ArgSize = PtrByteSize; 4200 } 4201 // All int arguments reserve stack space in the Darwin ABI. 4202 ArgOffset += 8; 4203 break; 4204 4205 case MVT::f32: 4206 case MVT::f64: 4207 // Every 4 bytes of argument space consumes one of the GPRs available for 4208 // argument passing. 
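    // For example, a double argument on 32-bit Darwin occupies 8 bytes of the
    // parameter area and therefore shadows two GPRs even though the value
    // itself arrives in an FPR; the code below advances GPR_idx accordingly.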
4209 if (GPR_idx != Num_GPR_Regs) { 4210 ++GPR_idx; 4211 if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64) 4212 ++GPR_idx; 4213 } 4214 if (FPR_idx != Num_FPR_Regs) { 4215 unsigned VReg; 4216 4217 if (ObjectVT == MVT::f32) 4218 VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass); 4219 else 4220 VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass); 4221 4222 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 4223 ++FPR_idx; 4224 } else { 4225 needsLoad = true; 4226 } 4227 4228 // All FP arguments reserve stack space in the Darwin ABI. 4229 ArgOffset += isPPC64 ? 8 : ObjSize; 4230 break; 4231 case MVT::v4f32: 4232 case MVT::v4i32: 4233 case MVT::v8i16: 4234 case MVT::v16i8: 4235 // Note that vector arguments in registers don't reserve stack space, 4236 // except in varargs functions. 4237 if (VR_idx != Num_VR_Regs) { 4238 unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); 4239 ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); 4240 if (isVarArg) { 4241 while ((ArgOffset % 16) != 0) { 4242 ArgOffset += PtrByteSize; 4243 if (GPR_idx != Num_GPR_Regs) 4244 GPR_idx++; 4245 } 4246 ArgOffset += 16; 4247 GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64? 4248 } 4249 ++VR_idx; 4250 } else { 4251 if (!isVarArg && !isPPC64) { 4252 // Vectors go after all the nonvectors. 4253 CurArgOffset = VecArgOffset; 4254 VecArgOffset += 16; 4255 } else { 4256 // Vectors are aligned. 4257 ArgOffset = ((ArgOffset+15)/16)*16; 4258 CurArgOffset = ArgOffset; 4259 ArgOffset += 16; 4260 } 4261 needsLoad = true; 4262 } 4263 break; 4264 } 4265 4266 // We need to load the argument to a virtual register if we determined above 4267 // that we ran out of physical registers of the appropriate type. 4268 if (needsLoad) { 4269 int FI = MFI.CreateFixedObject(ObjSize, 4270 CurArgOffset + (ArgSize - ObjSize), 4271 isImmutable); 4272 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4273 ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo()); 4274 } 4275 4276 InVals.push_back(ArgVal); 4277 } 4278 4279 // Allow for Altivec parameters at the end, if needed. 4280 if (nAltivecParamsAtEnd) { 4281 MinReservedArea = ((MinReservedArea+15)/16)*16; 4282 MinReservedArea += 16*nAltivecParamsAtEnd; 4283 } 4284 4285 // Area that is at least reserved in the caller of this function. 4286 MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize); 4287 4288 // Set the size that is at least reserved in caller of this function. Tail 4289 // call optimized functions' reserved stack space needs to be aligned so that 4290 // taking the difference between two stack areas will result in an aligned 4291 // stack. 4292 MinReservedArea = 4293 EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); 4294 FuncInfo->setMinReservedArea(MinReservedArea); 4295 4296 // If the function takes variable number of arguments, make a frame index for 4297 // the start of the first vararg value... for expansion of llvm.va_start. 4298 if (isVarArg) { 4299 int Depth = ArgOffset; 4300 4301 FuncInfo->setVarArgsFrameIndex( 4302 MFI.CreateFixedObject(PtrVT.getSizeInBits()/8, 4303 Depth, true)); 4304 SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 4305 4306 // If this function is vararg, store any remaining integer argument regs 4307 // to their spots on the stack so that they may be loaded by dereferencing 4308 // the result of va_next. 
4309 for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) { 4310 unsigned VReg; 4311 4312 if (isPPC64) 4313 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); 4314 else 4315 VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); 4316 4317 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); 4318 SDValue Store = 4319 DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); 4320 MemOps.push_back(Store); 4321 // Increment the address by four for the next argument to store 4322 SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT); 4323 FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); 4324 } 4325 } 4326 4327 if (!MemOps.empty()) 4328 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 4329 4330 return Chain; 4331 } 4332 4333 /// CalculateTailCallSPDiff - Get the amount the stack pointer has to be 4334 /// adjusted to accommodate the arguments for the tailcall. 4335 static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall, 4336 unsigned ParamSize) { 4337 4338 if (!isTailCall) return 0; 4339 4340 PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>(); 4341 unsigned CallerMinReservedArea = FI->getMinReservedArea(); 4342 int SPDiff = (int)CallerMinReservedArea - (int)ParamSize; 4343 // Remember only if the new adjustment is bigger. 4344 if (SPDiff < FI->getTailCallSPDelta()) 4345 FI->setTailCallSPDelta(SPDiff); 4346 4347 return SPDiff; 4348 } 4349 4350 static bool isFunctionGlobalAddress(SDValue Callee); 4351 4352 static bool 4353 callsShareTOCBase(const Function *Caller, SDValue Callee, 4354 const TargetMachine &TM) { 4355 // If !G, Callee can be an external symbol. 4356 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 4357 if (!G) 4358 return false; 4359 4360 // The medium and large code models are expected to provide a sufficiently 4361 // large TOC to provide all data addressing needs of a module with a 4362 // single TOC. Since each module will be addressed with a single TOC then we 4363 // only need to check that caller and callee don't cross dso boundaries. 4364 if (CodeModel::Medium == TM.getCodeModel() || 4365 CodeModel::Large == TM.getCodeModel()) 4366 return TM.shouldAssumeDSOLocal(*Caller->getParent(), G->getGlobal()); 4367 4368 // Otherwise we need to ensure callee and caller are in the same section, 4369 // since the linker may allocate multiple TOCs, and we don't know which 4370 // sections will belong to the same TOC base. 4371 4372 const GlobalValue *GV = G->getGlobal(); 4373 if (!GV->isStrongDefinitionForLinker()) 4374 return false; 4375 4376 // Any explicitly-specified sections and section prefixes must also match. 4377 // Also, if we're using -ffunction-sections, then each function is always in 4378 // a different section (the same is true for COMDAT functions). 4379 if (TM.getFunctionSections() || GV->hasComdat() || Caller->hasComdat() || 4380 GV->getSection() != Caller->getSection()) 4381 return false; 4382 if (const auto *F = dyn_cast<Function>(GV)) { 4383 if (F->getSectionPrefix() != Caller->getSectionPrefix()) 4384 return false; 4385 } 4386 4387 // If the callee might be interposed, then we can't assume the ultimate call 4388 // target will be in the same section. Even in cases where we can assume that 4389 // interposition won't happen, in any case where the linker might insert a 4390 // stub to allow for interposition, we must generate code as though 4391 // interposition might occur. 
To understand why this matters, consider a
4392 // situation where: a -> b -> c where the arrows indicate calls. b and c are
4393 // in the same section, but a is in a different module (i.e. has a different
4394 // TOC base pointer). If the linker allows for interposition between b and c,
4395 // then it will generate a stub for the call edge between b and c which will
4396 // save the TOC pointer into the designated stack slot allocated by b. If we
4397 // return true here, and therefore allow a tail call between b and c, that
4398 // stack slot won't exist and the b -> c stub will end up saving b's TOC base
4399 // pointer into the stack slot allocated by a (where the a -> b stub saved
4400 // a's TOC base pointer). If we're not considering a tail call, but rather,
4401 // whether a nop is needed after the call instruction in b, because the linker
4402 // will insert a stub, it might complain about a missing nop if we omit it
4403 // (although many don't complain in this case).
4404 if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV))
4405 return false;
4406
4407 return true;
4408 }
4409
4410 static bool
4411 needStackSlotPassParameters(const PPCSubtarget &Subtarget,
4412 const SmallVectorImpl<ISD::OutputArg> &Outs) {
4413 assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64());
4414
4415 const unsigned PtrByteSize = 8;
4416 const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
4417
4418 static const MCPhysReg GPR[] = {
4419 PPC::X3, PPC::X4, PPC::X5, PPC::X6,
4420 PPC::X7, PPC::X8, PPC::X9, PPC::X10,
4421 };
4422 static const MCPhysReg VR[] = {
4423 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
4424 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
4425 };
4426
4427 const unsigned NumGPRs = array_lengthof(GPR);
4428 const unsigned NumFPRs = 13;
4429 const unsigned NumVRs = array_lengthof(VR);
4430 const unsigned ParamAreaSize = NumGPRs * PtrByteSize;
4431
4432 unsigned NumBytes = LinkageSize;
4433 unsigned AvailableFPRs = NumFPRs;
4434 unsigned AvailableVRs = NumVRs;
4435
4436 for (const ISD::OutputArg& Param : Outs) {
4437 if (Param.Flags.isNest()) continue;
4438
4439 if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags,
4440 PtrByteSize, LinkageSize, ParamAreaSize,
4441 NumBytes, AvailableFPRs, AvailableVRs,
4442 Subtarget.hasQPX()))
4443 return true;
4444 }
4445 return false;
4446 }
4447
4448 static bool
4449 hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) {
4450 if (CS.arg_size() != CallerFn->arg_size())
4451 return false;
4452
4453 ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin();
4454 ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end();
4455 Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
4456
4457 for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
4458 const Value* CalleeArg = *CalleeArgIter;
4459 const Value* CallerArg = &(*CallerArgIter);
4460 if (CalleeArg == CallerArg)
4461 continue;
4462
4463 // e.g. @caller([4 x i64] %a, [4 x i64] %b) {
4464 // tail call @callee([4 x i64] undef, [4 x i64] %b)
4465 // }
4466 // 1st argument of callee is undef and has the same type as caller.
4467 if (CalleeArg->getType() == CallerArg->getType() &&
4468 isa<UndefValue>(CalleeArg))
4469 continue;
4470
4471 return false;
4472 }
4473
4474 return true;
4475 }
4476
4477 // Returns true if TCO is possible between the caller's and callee's
4478 // calling conventions.
4479 static bool
4480 areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
4481 CallingConv::ID CalleeCC) {
4482 // Tail calls are possible with fastcc and ccc.
4483 auto isTailCallableCC = [] (CallingConv::ID CC){
4484 return CC == CallingConv::C || CC == CallingConv::Fast;
4485 };
4486 if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC))
4487 return false;
4488
4489 // We can safely tail call both fastcc and ccc callees from a c calling
4490 // convention caller. If the caller is fastcc, we may have less stack space
4491 // than a non-fastcc caller with the same signature so disable tail-calls in
4492 // that case.
4493 return CallerCC == CallingConv::C || CallerCC == CalleeCC;
4494 }
4495
4496 bool
4497 PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
4498 SDValue Callee,
4499 CallingConv::ID CalleeCC,
4500 ImmutableCallSite CS,
4501 bool isVarArg,
4502 const SmallVectorImpl<ISD::OutputArg> &Outs,
4503 const SmallVectorImpl<ISD::InputArg> &Ins,
4504 SelectionDAG& DAG) const {
4505 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
4506
4507 if (DisableSCO && !TailCallOpt) return false;
4508
4509 // Variadic argument functions are not supported.
4510 if (isVarArg) return false;
4511
4512 auto &Caller = DAG.getMachineFunction().getFunction();
4513 // Check that the calling conventions are compatible for tco.
4514 if (!areCallingConvEligibleForTCO_64SVR4(Caller.getCallingConv(), CalleeCC))
4515 return false;
4516
4517 // A caller with any byval parameter is not supported.
4518 if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
4519 return false;
4520
4521 // A callee with any byval parameter is not supported either.
4522 // Note: This is a quick work around, because in some cases, e.g.
4523 // caller's stack size > callee's stack size, we are still able to apply
4524 // sibling call optimization. For example, gcc is able to do SCO for caller1
4525 // in the following example, but not for caller2.
4526 // struct test {
4527 // long int a;
4528 // char ary[56];
4529 // } gTest;
4530 // __attribute__((noinline)) int callee(struct test v, struct test *b) {
4531 // b->a = v.a;
4532 // return 0;
4533 // }
4534 // void caller1(struct test a, struct test c, struct test *b) {
4535 // callee(gTest, b); }
4536 // void caller2(struct test *b) { callee(gTest, b); }
4537 if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
4538 return false;
4539
4540 // If callee and caller use different calling conventions, we cannot pass
4541 // parameters on stack since offsets for the parameter area may be different.
4542 if (Caller.getCallingConv() != CalleeCC &&
4543 needStackSlotPassParameters(Subtarget, Outs))
4544 return false;
4545
4546 // No TCO/SCO on indirect calls because the caller has to restore its TOC.
4547 if (!isFunctionGlobalAddress(Callee) &&
4548 !isa<ExternalSymbolSDNode>(Callee))
4549 return false;
4550
4551 // If the caller and callee potentially have different TOC bases then we
4552 // cannot tail call since we need to restore the TOC pointer after the call.
4553 // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
4554 if (!callsShareTOCBase(&Caller, Callee, getTargetMachine()))
4555 return false;
4556
4557 // TCO allows altering callee ABI, so we don't have to check further.
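// (Every requirement checked above -- non-variadic, compatible calling
// conventions, no byval parameters on either side, a direct callee, and a
// shared TOC base -- already holds at this point.)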
4558 if (CalleeCC == CallingConv::Fast && TailCallOpt) 4559 return true; 4560 4561 if (DisableSCO) return false; 4562 4563 // If callee use the same argument list that caller is using, then we can 4564 // apply SCO on this case. If it is not, then we need to check if callee needs 4565 // stack for passing arguments. 4566 if (!hasSameArgumentList(&Caller, CS) && 4567 needStackSlotPassParameters(Subtarget, Outs)) { 4568 return false; 4569 } 4570 4571 return true; 4572 } 4573 4574 /// IsEligibleForTailCallOptimization - Check whether the call is eligible 4575 /// for tail call optimization. Targets which want to do tail call 4576 /// optimization should implement this function. 4577 bool 4578 PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 4579 CallingConv::ID CalleeCC, 4580 bool isVarArg, 4581 const SmallVectorImpl<ISD::InputArg> &Ins, 4582 SelectionDAG& DAG) const { 4583 if (!getTargetMachine().Options.GuaranteedTailCallOpt) 4584 return false; 4585 4586 // Variable argument functions are not supported. 4587 if (isVarArg) 4588 return false; 4589 4590 MachineFunction &MF = DAG.getMachineFunction(); 4591 CallingConv::ID CallerCC = MF.getFunction().getCallingConv(); 4592 if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) { 4593 // Functions containing by val parameters are not supported. 4594 for (unsigned i = 0; i != Ins.size(); i++) { 4595 ISD::ArgFlagsTy Flags = Ins[i].Flags; 4596 if (Flags.isByVal()) return false; 4597 } 4598 4599 // Non-PIC/GOT tail calls are supported. 4600 if (getTargetMachine().getRelocationModel() != Reloc::PIC_) 4601 return true; 4602 4603 // At the moment we can only do local tail calls (in same module, hidden 4604 // or protected) if we are generating PIC. 4605 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 4606 return G->getGlobal()->hasHiddenVisibility() 4607 || G->getGlobal()->hasProtectedVisibility(); 4608 } 4609 4610 return false; 4611 } 4612 4613 /// isCallCompatibleAddress - Return the immediate to use if the specified 4614 /// 32-bit value is representable in the immediate field of a BxA instruction. 4615 static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) { 4616 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 4617 if (!C) return nullptr; 4618 4619 int Addr = C->getZExtValue(); 4620 if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero. 4621 SignExtend32<26>(Addr) != Addr) 4622 return nullptr; // Top 6 bits have to be sext of immediate. 4623 4624 return DAG 4625 .getConstant( 4626 (int)C->getZExtValue() >> 2, SDLoc(Op), 4627 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout())) 4628 .getNode(); 4629 } 4630 4631 namespace { 4632 4633 struct TailCallArgumentInfo { 4634 SDValue Arg; 4635 SDValue FrameIdxOp; 4636 int FrameIdx = 0; 4637 4638 TailCallArgumentInfo() = default; 4639 }; 4640 4641 } // end anonymous namespace 4642 4643 /// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot. 4644 static void StoreTailCallArgumentsToStackSlot( 4645 SelectionDAG &DAG, SDValue Chain, 4646 const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs, 4647 SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) { 4648 for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) { 4649 SDValue Arg = TailCallArgs[i].Arg; 4650 SDValue FIN = TailCallArgs[i].FrameIdxOp; 4651 int FI = TailCallArgs[i].FrameIdx; 4652 // Store relative to framepointer. 
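// Each argument was assigned a fixed frame index in CalculateTailCallArgDest,
// so the store below can use MachinePointerInfo::getFixedStack for precise
// alias information.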
4653 MemOpChains.push_back(DAG.getStore( 4654 Chain, dl, Arg, FIN, 4655 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); 4656 } 4657 } 4658 4659 /// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to 4660 /// the appropriate stack slot for the tail call optimized function call. 4661 static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain, 4662 SDValue OldRetAddr, SDValue OldFP, 4663 int SPDiff, const SDLoc &dl) { 4664 if (SPDiff) { 4665 // Calculate the new stack slot for the return address. 4666 MachineFunction &MF = DAG.getMachineFunction(); 4667 const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); 4668 const PPCFrameLowering *FL = Subtarget.getFrameLowering(); 4669 bool isPPC64 = Subtarget.isPPC64(); 4670 int SlotSize = isPPC64 ? 8 : 4; 4671 int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset(); 4672 int NewRetAddr = MF.getFrameInfo().CreateFixedObject(SlotSize, 4673 NewRetAddrLoc, true); 4674 EVT VT = isPPC64 ? MVT::i64 : MVT::i32; 4675 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT); 4676 Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx, 4677 MachinePointerInfo::getFixedStack(MF, NewRetAddr)); 4678 4679 // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack 4680 // slot as the FP is never overwritten. 4681 if (Subtarget.isDarwinABI()) { 4682 int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset(); 4683 int NewFPIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, NewFPLoc, 4684 true); 4685 SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT); 4686 Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx, 4687 MachinePointerInfo::getFixedStack( 4688 DAG.getMachineFunction(), NewFPIdx)); 4689 } 4690 } 4691 return Chain; 4692 } 4693 4694 /// CalculateTailCallArgDest - Remember Argument for later processing. Calculate 4695 /// the position of the argument. 4696 static void 4697 CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64, 4698 SDValue Arg, int SPDiff, unsigned ArgOffset, 4699 SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) { 4700 int Offset = ArgOffset + SPDiff; 4701 uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8; 4702 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); 4703 EVT VT = isPPC64 ? MVT::i64 : MVT::i32; 4704 SDValue FIN = DAG.getFrameIndex(FI, VT); 4705 TailCallArgumentInfo Info; 4706 Info.Arg = Arg; 4707 Info.FrameIdxOp = FIN; 4708 Info.FrameIdx = FI; 4709 TailCallArguments.push_back(Info); 4710 } 4711 4712 /// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address 4713 /// stack slot. Returns the chain as result and the loaded frame pointers in 4714 /// LROpOut/FPOpout. Used when tail calling. 4715 SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr( 4716 SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut, 4717 SDValue &FPOpOut, const SDLoc &dl) const { 4718 if (SPDiff) { 4719 // Load the LR and FP stack slot for later adjusting. 4720 EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; 4721 LROpOut = getReturnAddrFrameIndex(DAG); 4722 LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo()); 4723 Chain = SDValue(LROpOut.getNode(), 1); 4724 4725 // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack 4726 // slot as the FP is never overwritten. 
4727 if (Subtarget.isDarwinABI()) { 4728 FPOpOut = getFramePointerFrameIndex(DAG); 4729 FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo()); 4730 Chain = SDValue(FPOpOut.getNode(), 1); 4731 } 4732 } 4733 return Chain; 4734 } 4735 4736 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 4737 /// by "Src" to address "Dst" of size "Size". Alignment information is 4738 /// specified by the specific parameter attribute. The copy will be passed as 4739 /// a byval function parameter. 4740 /// Sometimes what we are copying is the end of a larger object, the part that 4741 /// does not fit in registers. 4742 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, 4743 SDValue Chain, ISD::ArgFlagsTy Flags, 4744 SelectionDAG &DAG, const SDLoc &dl) { 4745 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); 4746 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 4747 false, false, false, MachinePointerInfo(), 4748 MachinePointerInfo()); 4749 } 4750 4751 /// LowerMemOpCallTo - Store the argument to the stack or remember it in case of 4752 /// tail calls. 4753 static void LowerMemOpCallTo( 4754 SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg, 4755 SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64, 4756 bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains, 4757 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) { 4758 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 4759 if (!isTailCall) { 4760 if (isVector) { 4761 SDValue StackPtr; 4762 if (isPPC64) 4763 StackPtr = DAG.getRegister(PPC::X1, MVT::i64); 4764 else 4765 StackPtr = DAG.getRegister(PPC::R1, MVT::i32); 4766 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, 4767 DAG.getConstant(ArgOffset, dl, PtrVT)); 4768 } 4769 MemOpChains.push_back( 4770 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo())); 4771 // Calculate and remember argument location. 4772 } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset, 4773 TailCallArguments); 4774 } 4775 4776 static void 4777 PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain, 4778 const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp, 4779 SDValue FPOp, 4780 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) { 4781 // Emit a sequence of copyto/copyfrom virtual registers for arguments that 4782 // might overwrite each other in case of tail call optimization. 4783 SmallVector<SDValue, 8> MemOpChains2; 4784 // Do not flag preceding copytoreg stuff together with the following stuff. 4785 InFlag = SDValue(); 4786 StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments, 4787 MemOpChains2, dl); 4788 if (!MemOpChains2.empty()) 4789 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); 4790 4791 // Store the return address to the appropriate stack slot. 4792 Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl); 4793 4794 // Emit callseq_end just before tailcall node. 4795 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), 4796 DAG.getIntPtrConstant(0, dl, true), InFlag, dl); 4797 InFlag = Chain.getValue(1); 4798 } 4799 4800 // Is this global address that of a function that can be called by name? (as 4801 // opposed to something that must hold a descriptor for an indirect call). 
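// TLS global addresses are rejected explicitly: a call to a TLS symbol is
// really an indirect call through a thread-specific pointer, so it must not be
// treated as a directly-callable function here.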
4802 static bool isFunctionGlobalAddress(SDValue Callee) { 4803 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 4804 if (Callee.getOpcode() == ISD::GlobalTLSAddress || 4805 Callee.getOpcode() == ISD::TargetGlobalTLSAddress) 4806 return false; 4807 4808 return G->getGlobal()->getValueType()->isFunctionTy(); 4809 } 4810 4811 return false; 4812 } 4813 4814 static unsigned 4815 PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, 4816 SDValue CallSeqStart, const SDLoc &dl, int SPDiff, bool isTailCall, 4817 bool isPatchPoint, bool hasNest, 4818 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, 4819 SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys, 4820 ImmutableCallSite CS, const PPCSubtarget &Subtarget) { 4821 bool isPPC64 = Subtarget.isPPC64(); 4822 bool isSVR4ABI = Subtarget.isSVR4ABI(); 4823 bool isELFv2ABI = Subtarget.isELFv2ABI(); 4824 4825 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 4826 NodeTys.push_back(MVT::Other); // Returns a chain 4827 NodeTys.push_back(MVT::Glue); // Returns a flag for retval copy to use. 4828 4829 unsigned CallOpc = PPCISD::CALL; 4830 4831 bool needIndirectCall = true; 4832 if (!isSVR4ABI || !isPPC64) 4833 if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) { 4834 // If this is an absolute destination address, use the munged value. 4835 Callee = SDValue(Dest, 0); 4836 needIndirectCall = false; 4837 } 4838 4839 // PC-relative references to external symbols should go through $stub, unless 4840 // we're building with the leopard linker or later, which automatically 4841 // synthesizes these stubs. 4842 const TargetMachine &TM = DAG.getTarget(); 4843 const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); 4844 const GlobalValue *GV = nullptr; 4845 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) 4846 GV = G->getGlobal(); 4847 bool Local = TM.shouldAssumeDSOLocal(*Mod, GV); 4848 bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64; 4849 4850 if (isFunctionGlobalAddress(Callee)) { 4851 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee); 4852 // A call to a TLS address is actually an indirect call to a 4853 // thread-specific pointer. 4854 unsigned OpFlags = 0; 4855 if (UsePlt) 4856 OpFlags = PPCII::MO_PLT; 4857 4858 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, 4859 // every direct call is) turn it into a TargetGlobalAddress / 4860 // TargetExternalSymbol node so that legalize doesn't hack it. 4861 Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, 4862 Callee.getValueType(), 0, OpFlags); 4863 needIndirectCall = false; 4864 } 4865 4866 if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 4867 unsigned char OpFlags = 0; 4868 4869 if (UsePlt) 4870 OpFlags = PPCII::MO_PLT; 4871 4872 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(), 4873 OpFlags); 4874 needIndirectCall = false; 4875 } 4876 4877 if (isPatchPoint) { 4878 // We'll form an invalid direct call when lowering a patchpoint; the full 4879 // sequence for an indirect call is complicated, and many of the 4880 // instructions introduced might have side effects (and, thus, can't be 4881 // removed later). The call itself will be removed as soon as the 4882 // argument/return lowering is complete, so the fact that it has the wrong 4883 // kind of operands should not really matter. 4884 needIndirectCall = false; 4885 } 4886 4887 if (needIndirectCall) { 4888 // Otherwise, this is an indirect call. 
We have to use a MTCTR/BCTRL pair 4889 // to do the call, we can't use PPCISD::CALL. 4890 SDValue MTCTROps[] = {Chain, Callee, InFlag}; 4891 4892 if (isSVR4ABI && isPPC64 && !isELFv2ABI) { 4893 // Function pointers in the 64-bit SVR4 ABI do not point to the function 4894 // entry point, but to the function descriptor (the function entry point 4895 // address is part of the function descriptor though). 4896 // The function descriptor is a three doubleword structure with the 4897 // following fields: function entry point, TOC base address and 4898 // environment pointer. 4899 // Thus for a call through a function pointer, the following actions need 4900 // to be performed: 4901 // 1. Save the TOC of the caller in the TOC save area of its stack 4902 // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()). 4903 // 2. Load the address of the function entry point from the function 4904 // descriptor. 4905 // 3. Load the TOC of the callee from the function descriptor into r2. 4906 // 4. Load the environment pointer from the function descriptor into 4907 // r11. 4908 // 5. Branch to the function entry point address. 4909 // 6. On return of the callee, the TOC of the caller needs to be 4910 // restored (this is done in FinishCall()). 4911 // 4912 // The loads are scheduled at the beginning of the call sequence, and the 4913 // register copies are flagged together to ensure that no other 4914 // operations can be scheduled in between. E.g. without flagging the 4915 // copies together, a TOC access in the caller could be scheduled between 4916 // the assignment of the callee TOC and the branch to the callee, which 4917 // results in the TOC access going through the TOC of the callee instead 4918 // of going through the TOC of the caller, which leads to incorrect code. 4919 4920 // Load the address of the function entry point from the function 4921 // descriptor. 4922 SDValue LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-1); 4923 if (LDChain.getValueType() == MVT::Glue) 4924 LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-2); 4925 4926 auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors() 4927 ? (MachineMemOperand::MODereferenceable | 4928 MachineMemOperand::MOInvariant) 4929 : MachineMemOperand::MONone; 4930 4931 MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr); 4932 SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI, 4933 /* Alignment = */ 8, MMOFlags); 4934 4935 // Load environment pointer into r11. 4936 SDValue PtrOff = DAG.getIntPtrConstant(16, dl); 4937 SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff); 4938 SDValue LoadEnvPtr = 4939 DAG.getLoad(MVT::i64, dl, LDChain, AddPtr, MPI.getWithOffset(16), 4940 /* Alignment = */ 8, MMOFlags); 4941 4942 SDValue TOCOff = DAG.getIntPtrConstant(8, dl); 4943 SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff); 4944 SDValue TOCPtr = 4945 DAG.getLoad(MVT::i64, dl, LDChain, AddTOC, MPI.getWithOffset(8), 4946 /* Alignment = */ 8, MMOFlags); 4947 4948 setUsesTOCBasePtr(DAG); 4949 SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr, 4950 InFlag); 4951 Chain = TOCVal.getValue(0); 4952 InFlag = TOCVal.getValue(1); 4953 4954 // If the function call has an explicit 'nest' parameter, it takes the 4955 // place of the environment pointer. 
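// In that case the copy of LoadEnvPtr into X11 below is skipped, because the
// nest argument already occupies X11.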
4956 if (!hasNest) { 4957 SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr, 4958 InFlag); 4959 4960 Chain = EnvVal.getValue(0); 4961 InFlag = EnvVal.getValue(1); 4962 } 4963 4964 MTCTROps[0] = Chain; 4965 MTCTROps[1] = LoadFuncPtr; 4966 MTCTROps[2] = InFlag; 4967 } 4968 4969 Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys, 4970 makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2)); 4971 InFlag = Chain.getValue(1); 4972 4973 NodeTys.clear(); 4974 NodeTys.push_back(MVT::Other); 4975 NodeTys.push_back(MVT::Glue); 4976 Ops.push_back(Chain); 4977 CallOpc = PPCISD::BCTRL; 4978 Callee.setNode(nullptr); 4979 // Add use of X11 (holding environment pointer) 4980 if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest) 4981 Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); 4982 // Add CTR register as callee so a bctr can be emitted later. 4983 if (isTailCall) 4984 Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT)); 4985 } 4986 4987 // If this is a direct call, pass the chain and the callee. 4988 if (Callee.getNode()) { 4989 Ops.push_back(Chain); 4990 Ops.push_back(Callee); 4991 } 4992 // If this is a tail call add stack pointer delta. 4993 if (isTailCall) 4994 Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32)); 4995 4996 // Add argument registers to the end of the list so that they are known live 4997 // into the call. 4998 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 4999 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 5000 RegsToPass[i].second.getValueType())); 5001 5002 // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live 5003 // into the call. 5004 // We do need to reserve X2 to appease the verifier for the PATCHPOINT. 5005 if (isSVR4ABI && isPPC64) { 5006 setUsesTOCBasePtr(DAG); 5007 5008 // We cannot add X2 as an operand here for PATCHPOINT, because there is no 5009 // way to mark dependencies as implicit here. We will add the X2 dependency 5010 // in EmitInstrWithCustomInserter. 5011 if (!isPatchPoint) 5012 Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); 5013 } 5014 5015 return CallOpc; 5016 } 5017 5018 SDValue PPCTargetLowering::LowerCallResult( 5019 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 5020 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 5021 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 5022 SmallVector<CCValAssign, 16> RVLocs; 5023 CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 5024 *DAG.getContext()); 5025 5026 CCRetInfo.AnalyzeCallResult( 5027 Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold) 5028 ? RetCC_PPC_Cold 5029 : RetCC_PPC); 5030 5031 // Copy all of the result registers out of their specified physreg. 
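// Each CopyFromReg is glued to the previous one through InFlag, and any
// promotion applied by the calling convention (AExt/SExt/ZExt) is undone with
// a truncate (preceded by an AssertSext/AssertZext for the signed/unsigned
// cases) so the caller sees the original value type.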
5032 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 5033 CCValAssign &VA = RVLocs[i]; 5034 assert(VA.isRegLoc() && "Can only return in registers!"); 5035 5036 SDValue Val = DAG.getCopyFromReg(Chain, dl, 5037 VA.getLocReg(), VA.getLocVT(), InFlag); 5038 Chain = Val.getValue(1); 5039 InFlag = Val.getValue(2); 5040 5041 switch (VA.getLocInfo()) { 5042 default: llvm_unreachable("Unknown loc info!"); 5043 case CCValAssign::Full: break; 5044 case CCValAssign::AExt: 5045 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); 5046 break; 5047 case CCValAssign::ZExt: 5048 Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val, 5049 DAG.getValueType(VA.getValVT())); 5050 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); 5051 break; 5052 case CCValAssign::SExt: 5053 Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val, 5054 DAG.getValueType(VA.getValVT())); 5055 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); 5056 break; 5057 } 5058 5059 InVals.push_back(Val); 5060 } 5061 5062 return Chain; 5063 } 5064 5065 SDValue PPCTargetLowering::FinishCall( 5066 CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg, 5067 bool isPatchPoint, bool hasNest, SelectionDAG &DAG, 5068 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue InFlag, 5069 SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff, 5070 unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins, 5071 SmallVectorImpl<SDValue> &InVals, ImmutableCallSite CS) const { 5072 std::vector<EVT> NodeTys; 5073 SmallVector<SDValue, 8> Ops; 5074 unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl, 5075 SPDiff, isTailCall, isPatchPoint, hasNest, 5076 RegsToPass, Ops, NodeTys, CS, Subtarget); 5077 5078 // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls 5079 if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) 5080 Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32)); 5081 5082 // When performing tail call optimization the callee pops its arguments off 5083 // the stack. Account for this here so these bytes can be pushed back on in 5084 // PPCFrameLowering::eliminateCallFramePseudoInstr. 5085 int BytesCalleePops = 5086 (CallConv == CallingConv::Fast && 5087 getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0; 5088 5089 // Add a register mask operand representing the call-preserved registers. 5090 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); 5091 const uint32_t *Mask = 5092 TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv); 5093 assert(Mask && "Missing call preserved mask for calling convention"); 5094 Ops.push_back(DAG.getRegisterMask(Mask)); 5095 5096 if (InFlag.getNode()) 5097 Ops.push_back(InFlag); 5098 5099 // Emit tail call. 5100 if (isTailCall) { 5101 assert(((Callee.getOpcode() == ISD::Register && 5102 cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) || 5103 Callee.getOpcode() == ISD::TargetExternalSymbol || 5104 Callee.getOpcode() == ISD::TargetGlobalAddress || 5105 isa<ConstantSDNode>(Callee)) && 5106 "Expecting an global address, external symbol, absolute value or register"); 5107 5108 DAG.getMachineFunction().getFrameInfo().setHasTailCall(); 5109 return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops); 5110 } 5111 5112 // Add a NOP immediately after the branch instruction when using the 64-bit 5113 // SVR4 ABI. 
At link time, if caller and callee are in a different module and 5114 // thus have a different TOC, the call will be replaced with a call to a stub 5115 // function which saves the current TOC, loads the TOC of the callee and 5116 // branches to the callee. The NOP will be replaced with a load instruction 5117 // which restores the TOC of the caller from the TOC save slot of the current 5118 // stack frame. If caller and callee belong to the same module (and have the 5119 // same TOC), the NOP will remain unchanged. 5120 5121 MachineFunction &MF = DAG.getMachineFunction(); 5122 if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() && 5123 !isPatchPoint) { 5124 if (CallOpc == PPCISD::BCTRL) { 5125 // This is a call through a function pointer. 5126 // Restore the caller TOC from the save area into R2. 5127 // See PrepareCall() for more information about calls through function 5128 // pointers in the 64-bit SVR4 ABI. 5129 // We are using a target-specific load with r2 hard coded, because the 5130 // result of a target-independent load would never go directly into r2, 5131 // since r2 is a reserved register (which prevents the register allocator 5132 // from allocating it), resulting in an additional register being 5133 // allocated and an unnecessary move instruction being generated. 5134 CallOpc = PPCISD::BCTRL_LOAD_TOC; 5135 5136 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 5137 SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); 5138 unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); 5139 SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); 5140 SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); 5141 5142 // The address needs to go after the chain input but before the flag (or 5143 // any other variadic arguments). 5144 Ops.insert(std::next(Ops.begin()), AddTOC); 5145 } else if (CallOpc == PPCISD::CALL && 5146 !callsShareTOCBase(&MF.getFunction(), Callee, DAG.getTarget())) { 5147 // Otherwise insert NOP for non-local calls. 
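// (callsShareTOCBase returned false, so the linker may route this call through
// a stub that clobbers the TOC pointer in R2; the nop gives it room to patch
// in a TOC restore, as described above.)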
5148 CallOpc = PPCISD::CALL_NOP; 5149 } 5150 } 5151 5152 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); 5153 InFlag = Chain.getValue(1); 5154 5155 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), 5156 DAG.getIntPtrConstant(BytesCalleePops, dl, true), 5157 InFlag, dl); 5158 if (!Ins.empty()) 5159 InFlag = Chain.getValue(1); 5160 5161 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 5162 Ins, dl, DAG, InVals); 5163 } 5164 5165 SDValue 5166 PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 5167 SmallVectorImpl<SDValue> &InVals) const { 5168 SelectionDAG &DAG = CLI.DAG; 5169 SDLoc &dl = CLI.DL; 5170 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 5171 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 5172 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 5173 SDValue Chain = CLI.Chain; 5174 SDValue Callee = CLI.Callee; 5175 bool &isTailCall = CLI.IsTailCall; 5176 CallingConv::ID CallConv = CLI.CallConv; 5177 bool isVarArg = CLI.IsVarArg; 5178 bool isPatchPoint = CLI.IsPatchPoint; 5179 ImmutableCallSite CS = CLI.CS; 5180 5181 if (isTailCall) { 5182 if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall())) 5183 isTailCall = false; 5184 else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) 5185 isTailCall = 5186 IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS, 5187 isVarArg, Outs, Ins, DAG); 5188 else 5189 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, 5190 Ins, DAG); 5191 if (isTailCall) { 5192 ++NumTailCalls; 5193 if (!getTargetMachine().Options.GuaranteedTailCallOpt) 5194 ++NumSiblingCalls; 5195 5196 assert(isa<GlobalAddressSDNode>(Callee) && 5197 "Callee should be an llvm::Function object."); 5198 LLVM_DEBUG( 5199 const GlobalValue *GV = 5200 cast<GlobalAddressSDNode>(Callee)->getGlobal(); 5201 const unsigned Width = 5202 80 - strlen("TCO caller: ") - strlen(", callee linkage: 0, 0"); 5203 dbgs() << "TCO caller: " 5204 << left_justify(DAG.getMachineFunction().getName(), Width) 5205 << ", callee linkage: " << GV->getVisibility() << ", " 5206 << GV->getLinkage() << "\n"); 5207 } 5208 } 5209 5210 if (!isTailCall && CS && CS.isMustTailCall()) 5211 report_fatal_error("failed to perform tail call elimination on a call " 5212 "site marked musttail"); 5213 5214 // When long calls (i.e. indirect calls) are always used, calls are always 5215 // made via function pointer. If we have a function name, first translate it 5216 // into a pointer. 
5217 if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Callee) && 5218 !isTailCall) 5219 Callee = LowerGlobalAddress(Callee, DAG); 5220 5221 if (Subtarget.isSVR4ABI()) { 5222 if (Subtarget.isPPC64()) 5223 return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, 5224 isTailCall, isPatchPoint, Outs, OutVals, Ins, 5225 dl, DAG, InVals, CS); 5226 else 5227 return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, 5228 isTailCall, isPatchPoint, Outs, OutVals, Ins, 5229 dl, DAG, InVals, CS); 5230 } 5231 5232 return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg, 5233 isTailCall, isPatchPoint, Outs, OutVals, Ins, 5234 dl, DAG, InVals, CS); 5235 } 5236 5237 SDValue PPCTargetLowering::LowerCall_32SVR4( 5238 SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, 5239 bool isTailCall, bool isPatchPoint, 5240 const SmallVectorImpl<ISD::OutputArg> &Outs, 5241 const SmallVectorImpl<SDValue> &OutVals, 5242 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 5243 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, 5244 ImmutableCallSite CS) const { 5245 // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description 5246 // of the 32-bit SVR4 ABI stack frame layout. 5247 5248 assert((CallConv == CallingConv::C || 5249 CallConv == CallingConv::Cold || 5250 CallConv == CallingConv::Fast) && "Unknown calling convention!"); 5251 5252 unsigned PtrByteSize = 4; 5253 5254 MachineFunction &MF = DAG.getMachineFunction(); 5255 5256 // Mark this function as potentially containing a function that contains a 5257 // tail call. As a consequence the frame pointer will be used for dynamicalloc 5258 // and restoring the callers stack pointer in this functions epilog. This is 5259 // done because by tail calling the called function might overwrite the value 5260 // in this function's (MF) stack pointer stack slot 0(SP). 5261 if (getTargetMachine().Options.GuaranteedTailCallOpt && 5262 CallConv == CallingConv::Fast) 5263 MF.getInfo<PPCFunctionInfo>()->setHasFastCall(); 5264 5265 // Count how many bytes are to be pushed on the stack, including the linkage 5266 // area, parameter list area and the part of the local variable space which 5267 // contains copies of aggregates which are passed by value. 5268 5269 // Assign locations to all of the outgoing arguments. 5270 SmallVector<CCValAssign, 16> ArgLocs; 5271 PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); 5272 5273 // Reserve space for the linkage area on the stack. 5274 CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(), 5275 PtrByteSize); 5276 if (useSoftFloat()) 5277 CCInfo.PreAnalyzeCallOperands(Outs); 5278 5279 if (isVarArg) { 5280 // Handle fixed and variable vector arguments differently. 5281 // Fixed vector arguments go into registers as long as registers are 5282 // available. Variable vector arguments always go into memory. 
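// Outs[i].IsFixed distinguishes the two cases below: fixed operands are
// analyzed with CC_PPC32_SVR4, while the variadic ones use
// CC_PPC32_SVR4_VarArg.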
5283 unsigned NumArgs = Outs.size(); 5284 5285 for (unsigned i = 0; i != NumArgs; ++i) { 5286 MVT ArgVT = Outs[i].VT; 5287 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 5288 bool Result; 5289 5290 if (Outs[i].IsFixed) { 5291 Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, 5292 CCInfo); 5293 } else { 5294 Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full, 5295 ArgFlags, CCInfo); 5296 } 5297 5298 if (Result) { 5299 #ifndef NDEBUG 5300 errs() << "Call operand #" << i << " has unhandled type " 5301 << EVT(ArgVT).getEVTString() << "\n"; 5302 #endif 5303 llvm_unreachable(nullptr); 5304 } 5305 } 5306 } else { 5307 // All arguments are treated the same. 5308 CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4); 5309 } 5310 CCInfo.clearWasPPCF128(); 5311 5312 // Assign locations to all of the outgoing aggregate by value arguments. 5313 SmallVector<CCValAssign, 16> ByValArgLocs; 5314 CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext()); 5315 5316 // Reserve stack space for the allocations in CCInfo. 5317 CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); 5318 5319 CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal); 5320 5321 // Size of the linkage area, parameter list area and the part of the local 5322 // space variable where copies of aggregates which are passed by value are 5323 // stored. 5324 unsigned NumBytes = CCByValInfo.getNextStackOffset(); 5325 5326 // Calculate by how many bytes the stack has to be adjusted in case of tail 5327 // call optimization. 5328 int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); 5329 5330 // Adjust the stack pointer for the new arguments... 5331 // These operations are automatically eliminated by the prolog/epilog pass 5332 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); 5333 SDValue CallSeqStart = Chain; 5334 5335 // Load the return address and frame pointer so it can be moved somewhere else 5336 // later. 5337 SDValue LROp, FPOp; 5338 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); 5339 5340 // Set up a copy of the stack pointer for use loading and storing any 5341 // arguments that may not fit in the registers available for argument 5342 // passing. 5343 SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32); 5344 5345 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 5346 SmallVector<TailCallArgumentInfo, 8> TailCallArguments; 5347 SmallVector<SDValue, 8> MemOpChains; 5348 5349 bool seenFloatArg = false; 5350 // Walk the register/memloc assignments, inserting copies/loads. 5351 for (unsigned i = 0, j = 0, e = ArgLocs.size(); 5352 i != e; 5353 ++i) { 5354 CCValAssign &VA = ArgLocs[i]; 5355 SDValue Arg = OutVals[i]; 5356 ISD::ArgFlagsTy Flags = Outs[i].Flags; 5357 5358 if (Flags.isByVal()) { 5359 // Argument is an aggregate which is passed by value, thus we need to 5360 // create a copy of it in the local variable space of the current stack 5361 // frame (which is the stack frame of the caller) and pass the address of 5362 // this copy to the callee. 5363 assert((j < ByValArgLocs.size()) && "Index out of bounds!"); 5364 CCValAssign &ByValVA = ByValArgLocs[j++]; 5365 assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!"); 5366 5367 // Memory reserved in the local variable space of the callers stack frame. 
5368 unsigned LocMemOffset = ByValVA.getLocMemOffset(); 5369 5370 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 5371 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), 5372 StackPtr, PtrOff); 5373 5374 // Create a copy of the argument in the local area of the current 5375 // stack frame. 5376 SDValue MemcpyCall = 5377 CreateCopyOfByValArgument(Arg, PtrOff, 5378 CallSeqStart.getNode()->getOperand(0), 5379 Flags, DAG, dl); 5380 5381 // This must go outside the CALLSEQ_START..END. 5382 SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0, 5383 SDLoc(MemcpyCall)); 5384 DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), 5385 NewCallSeqStart.getNode()); 5386 Chain = CallSeqStart = NewCallSeqStart; 5387 5388 // Pass the address of the aggregate copy on the stack either in a 5389 // physical register or in the parameter list area of the current stack 5390 // frame to the callee. 5391 Arg = PtrOff; 5392 } 5393 5394 // When useCRBits() is true, there can be i1 arguments. 5395 // It is because getRegisterType(MVT::i1) => MVT::i1, 5396 // and for other integer types getRegisterType() => MVT::i32. 5397 // Extend i1 and ensure callee will get i32. 5398 if (Arg.getValueType() == MVT::i1) 5399 Arg = DAG.getNode(Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, 5400 dl, MVT::i32, Arg); 5401 5402 if (VA.isRegLoc()) { 5403 seenFloatArg |= VA.getLocVT().isFloatingPoint(); 5404 // Put argument in a physical register. 5405 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 5406 } else { 5407 // Put argument in the parameter list area of the current stack frame. 5408 assert(VA.isMemLoc()); 5409 unsigned LocMemOffset = VA.getLocMemOffset(); 5410 5411 if (!isTailCall) { 5412 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 5413 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()), 5414 StackPtr, PtrOff); 5415 5416 MemOpChains.push_back( 5417 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo())); 5418 } else { 5419 // Calculate and remember argument location. 5420 CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset, 5421 TailCallArguments); 5422 } 5423 } 5424 } 5425 5426 if (!MemOpChains.empty()) 5427 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 5428 5429 // Build a sequence of copy-to-reg nodes chained together with token chain 5430 // and flag operands which copy the outgoing args into the appropriate regs. 5431 SDValue InFlag; 5432 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 5433 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 5434 RegsToPass[i].second, InFlag); 5435 InFlag = Chain.getValue(1); 5436 } 5437 5438 // Set CR bit 6 to true if this is a vararg call with floating args passed in 5439 // registers. 5440 if (isVarArg) { 5441 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 5442 SDValue Ops[] = { Chain, InFlag }; 5443 5444 Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, 5445 dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 
2 : 1)); 5446 5447 InFlag = Chain.getValue(1); 5448 } 5449 5450 if (isTailCall) 5451 PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, 5452 TailCallArguments); 5453 5454 return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, 5455 /* unused except on PPC64 ELFv1 */ false, DAG, 5456 RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, 5457 NumBytes, Ins, InVals, CS); 5458 } 5459 5460 // Copy an argument into memory, being careful to do this outside the 5461 // call sequence for the call to which the argument belongs. 5462 SDValue PPCTargetLowering::createMemcpyOutsideCallSeq( 5463 SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags, 5464 SelectionDAG &DAG, const SDLoc &dl) const { 5465 SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff, 5466 CallSeqStart.getNode()->getOperand(0), 5467 Flags, DAG, dl); 5468 // The MEMCPY must go outside the CALLSEQ_START..END. 5469 int64_t FrameSize = CallSeqStart.getConstantOperandVal(1); 5470 SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0, 5471 SDLoc(MemcpyCall)); 5472 DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), 5473 NewCallSeqStart.getNode()); 5474 return NewCallSeqStart; 5475 } 5476 5477 SDValue PPCTargetLowering::LowerCall_64SVR4( 5478 SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, 5479 bool isTailCall, bool isPatchPoint, 5480 const SmallVectorImpl<ISD::OutputArg> &Outs, 5481 const SmallVectorImpl<SDValue> &OutVals, 5482 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 5483 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, 5484 ImmutableCallSite CS) const { 5485 bool isELFv2ABI = Subtarget.isELFv2ABI(); 5486 bool isLittleEndian = Subtarget.isLittleEndian(); 5487 unsigned NumOps = Outs.size(); 5488 bool hasNest = false; 5489 bool IsSibCall = false; 5490 5491 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 5492 unsigned PtrByteSize = 8; 5493 5494 MachineFunction &MF = DAG.getMachineFunction(); 5495 5496 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt) 5497 IsSibCall = true; 5498 5499 // Mark this function as potentially containing a function that contains a 5500 // tail call. As a consequence the frame pointer will be used for dynamicalloc 5501 // and restoring the callers stack pointer in this functions epilog. This is 5502 // done because by tail calling the called function might overwrite the value 5503 // in this function's (MF) stack pointer stack slot 0(SP). 5504 if (getTargetMachine().Options.GuaranteedTailCallOpt && 5505 CallConv == CallingConv::Fast) 5506 MF.getInfo<PPCFunctionInfo>()->setHasFastCall(); 5507 5508 assert(!(CallConv == CallingConv::Fast && isVarArg) && 5509 "fastcc not supported on varargs functions"); 5510 5511 // Count how many bytes are to be pushed on the stack, including the linkage 5512 // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes 5513 // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage 5514 // area is 32 bytes reserved space for [SP][CR][LR][TOC]. 
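// GPR_idx, FPR_idx and VR_idx below track how many of the eight GPRs (X3-X10),
// thirteen FPRs and twelve Altivec VRs (V2-V13) have been assigned so far;
// QFPR_idx aliases FPR_idx, so QPX arguments are counted against the same
// register pool.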
5515 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); 5516 unsigned NumBytes = LinkageSize; 5517 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; 5518 unsigned &QFPR_idx = FPR_idx; 5519 5520 static const MCPhysReg GPR[] = { 5521 PPC::X3, PPC::X4, PPC::X5, PPC::X6, 5522 PPC::X7, PPC::X8, PPC::X9, PPC::X10, 5523 }; 5524 static const MCPhysReg VR[] = { 5525 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, 5526 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 5527 }; 5528 5529 const unsigned NumGPRs = array_lengthof(GPR); 5530 const unsigned NumFPRs = useSoftFloat() ? 0 : 13; 5531 const unsigned NumVRs = array_lengthof(VR); 5532 const unsigned NumQFPRs = NumFPRs; 5533 5534 // On ELFv2, we can avoid allocating the parameter area if all the arguments 5535 // can be passed to the callee in registers. 5536 // For the fast calling convention, there is another check below. 5537 // Note: We should keep consistent with LowerFormalArguments_64SVR4() 5538 bool HasParameterArea = !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast; 5539 if (!HasParameterArea) { 5540 unsigned ParamAreaSize = NumGPRs * PtrByteSize; 5541 unsigned AvailableFPRs = NumFPRs; 5542 unsigned AvailableVRs = NumVRs; 5543 unsigned NumBytesTmp = NumBytes; 5544 for (unsigned i = 0; i != NumOps; ++i) { 5545 if (Outs[i].Flags.isNest()) continue; 5546 if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags, 5547 PtrByteSize, LinkageSize, ParamAreaSize, 5548 NumBytesTmp, AvailableFPRs, AvailableVRs, 5549 Subtarget.hasQPX())) 5550 HasParameterArea = true; 5551 } 5552 } 5553 5554 // When using the fast calling convention, we don't provide backing for 5555 // arguments that will be in registers. 5556 unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0; 5557 5558 // Avoid allocating parameter area for fastcc functions if all the arguments 5559 // can be passed in the registers. 5560 if (CallConv == CallingConv::Fast) 5561 HasParameterArea = false; 5562 5563 // Add up all the space actually used. 5564 for (unsigned i = 0; i != NumOps; ++i) { 5565 ISD::ArgFlagsTy Flags = Outs[i].Flags; 5566 EVT ArgVT = Outs[i].VT; 5567 EVT OrigVT = Outs[i].ArgVT; 5568 5569 if (Flags.isNest()) 5570 continue; 5571 5572 if (CallConv == CallingConv::Fast) { 5573 if (Flags.isByVal()) { 5574 NumGPRsUsed += (Flags.getByValSize()+7)/8; 5575 if (NumGPRsUsed > NumGPRs) 5576 HasParameterArea = true; 5577 } else { 5578 switch (ArgVT.getSimpleVT().SimpleTy) { 5579 default: llvm_unreachable("Unexpected ValueType for argument!"); 5580 case MVT::i1: 5581 case MVT::i32: 5582 case MVT::i64: 5583 if (++NumGPRsUsed <= NumGPRs) 5584 continue; 5585 break; 5586 case MVT::v4i32: 5587 case MVT::v8i16: 5588 case MVT::v16i8: 5589 case MVT::v2f64: 5590 case MVT::v2i64: 5591 case MVT::v1i128: 5592 case MVT::f128: 5593 if (++NumVRsUsed <= NumVRs) 5594 continue; 5595 break; 5596 case MVT::v4f32: 5597 // When using QPX, this is handled like a FP register, otherwise, it 5598 // is an Altivec register. 5599 if (Subtarget.hasQPX()) { 5600 if (++NumFPRsUsed <= NumFPRs) 5601 continue; 5602 } else { 5603 if (++NumVRsUsed <= NumVRs) 5604 continue; 5605 } 5606 break; 5607 case MVT::f32: 5608 case MVT::f64: 5609 case MVT::v4f64: // QPX 5610 case MVT::v4i1: // QPX 5611 if (++NumFPRsUsed <= NumFPRs) 5612 continue; 5613 break; 5614 } 5615 HasParameterArea = true; 5616 } 5617 } 5618 5619 /* Respect alignment of argument on the stack. 
*/ 5620 unsigned Align = 5621 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); 5622 NumBytes = ((NumBytes + Align - 1) / Align) * Align; 5623 5624 NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); 5625 if (Flags.isInConsecutiveRegsLast()) 5626 NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 5627 } 5628 5629 unsigned NumBytesActuallyUsed = NumBytes; 5630 5631 // In the old ELFv1 ABI, 5632 // the prolog code of the callee may store up to 8 GPR argument registers to 5633 // the stack, allowing va_start to index over them in memory if its varargs. 5634 // Because we cannot tell if this is needed on the caller side, we have to 5635 // conservatively assume that it is needed. As such, make sure we have at 5636 // least enough stack space for the caller to store the 8 GPRs. 5637 // In the ELFv2 ABI, we allocate the parameter area iff a callee 5638 // really requires memory operands, e.g. a vararg function. 5639 if (HasParameterArea) 5640 NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); 5641 else 5642 NumBytes = LinkageSize; 5643 5644 // Tail call needs the stack to be aligned. 5645 if (getTargetMachine().Options.GuaranteedTailCallOpt && 5646 CallConv == CallingConv::Fast) 5647 NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes); 5648 5649 int SPDiff = 0; 5650 5651 // Calculate by how many bytes the stack has to be adjusted in case of tail 5652 // call optimization. 5653 if (!IsSibCall) 5654 SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); 5655 5656 // To protect arguments on the stack from being clobbered in a tail call, 5657 // force all the loads to happen before doing any other lowering. 5658 if (isTailCall) 5659 Chain = DAG.getStackArgumentTokenFactor(Chain); 5660 5661 // Adjust the stack pointer for the new arguments... 5662 // These operations are automatically eliminated by the prolog/epilog pass 5663 if (!IsSibCall) 5664 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); 5665 SDValue CallSeqStart = Chain; 5666 5667 // Load the return address and frame pointer so it can be move somewhere else 5668 // later. 5669 SDValue LROp, FPOp; 5670 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); 5671 5672 // Set up a copy of the stack pointer for use loading and storing any 5673 // arguments that may not fit in the registers available for argument 5674 // passing. 5675 SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64); 5676 5677 // Figure out which arguments are going to go in registers, and which in 5678 // memory. Also, if this is a vararg function, floating point operations 5679 // must be stored to our stack, and loaded into integer regs as well, if 5680 // any integer regs are available for argument passing. 5681 unsigned ArgOffset = LinkageSize; 5682 5683 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 5684 SmallVector<TailCallArgumentInfo, 8> TailCallArguments; 5685 5686 SmallVector<SDValue, 8> MemOpChains; 5687 for (unsigned i = 0; i != NumOps; ++i) { 5688 SDValue Arg = OutVals[i]; 5689 ISD::ArgFlagsTy Flags = Outs[i].Flags; 5690 EVT ArgVT = Outs[i].VT; 5691 EVT OrigVT = Outs[i].ArgVT; 5692 5693 // PtrOff will be used to store the current argument to the stack if a 5694 // register cannot be found for it. 5695 SDValue PtrOff; 5696 5697 // We re-align the argument offset for each argument, except when using the 5698 // fast calling convention, when we need to make sure we do that only when 5699 // we'll actually use a stack slot. 
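// ComputePtrOff rounds ArgOffset up to the slot alignment and materializes
// StackPtr + ArgOffset; for the fast calling convention it is invoked lazily,
// only on the paths that actually spill the argument to the stack.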
5700 auto ComputePtrOff = [&]() { 5701 /* Respect alignment of argument on the stack. */ 5702 unsigned Align = 5703 CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); 5704 ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; 5705 5706 PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType()); 5707 5708 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); 5709 }; 5710 5711 if (CallConv != CallingConv::Fast) { 5712 ComputePtrOff(); 5713 5714 /* Compute GPR index associated with argument offset. */ 5715 GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; 5716 GPR_idx = std::min(GPR_idx, NumGPRs); 5717 } 5718 5719 // Promote integers to 64-bit values. 5720 if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) { 5721 // FIXME: Should this use ANY_EXTEND if neither sext nor zext? 5722 unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 5723 Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); 5724 } 5725 5726 // FIXME memcpy is used way more than necessary. Correctness first. 5727 // Note: "by value" is code for passing a structure by value, not 5728 // basic types. 5729 if (Flags.isByVal()) { 5730 // Note: Size includes alignment padding, so 5731 // struct x { short a; char b; } 5732 // will have Size = 4. With #pragma pack(1), it will have Size = 3. 5733 // These are the proper values we need for right-justifying the 5734 // aggregate in a parameter register. 5735 unsigned Size = Flags.getByValSize(); 5736 5737 // An empty aggregate parameter takes up no storage and no 5738 // registers. 5739 if (Size == 0) 5740 continue; 5741 5742 if (CallConv == CallingConv::Fast) 5743 ComputePtrOff(); 5744 5745 // All aggregates smaller than 8 bytes must be passed right-justified. 5746 if (Size==1 || Size==2 || Size==4) { 5747 EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32); 5748 if (GPR_idx != NumGPRs) { 5749 SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, 5750 MachinePointerInfo(), VT); 5751 MemOpChains.push_back(Load.getValue(1)); 5752 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 5753 5754 ArgOffset += PtrByteSize; 5755 continue; 5756 } 5757 } 5758 5759 if (GPR_idx == NumGPRs && Size < 8) { 5760 SDValue AddPtr = PtrOff; 5761 if (!isLittleEndian) { 5762 SDValue Const = DAG.getConstant(PtrByteSize - Size, dl, 5763 PtrOff.getValueType()); 5764 AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); 5765 } 5766 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, 5767 CallSeqStart, 5768 Flags, DAG, dl); 5769 ArgOffset += PtrByteSize; 5770 continue; 5771 } 5772 // Copy entire object into memory. There are cases where gcc-generated 5773 // code assumes it is there, even if it could be put entirely into 5774 // registers. (This is not what the doc says.) 5775 5776 // FIXME: The above statement is likely due to a misunderstanding of the 5777 // documents. All arguments must be copied into the parameter area BY 5778 // THE CALLEE in the event that the callee takes the address of any 5779 // formal argument. That has not yet been implemented. However, it is 5780 // reasonable to use the stack area as a staging area for the register 5781 // load. 5782 5783 // Skip this for small aggregates, as we will use the same slot for a 5784 // right-justified copy, below. 5785 if (Size >= 8) 5786 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, 5787 CallSeqStart, 5788 Flags, DAG, dl); 5789 5790 // When a register is available, pass a small aggregate right-justified. 
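  // For example, on a big-endian target a 3-byte aggregate is memcpy'd to
  // PtrOff + (8 - 3), so the doubleword load below leaves its bytes in the
  // low-order (rightmost) end of the GPR. On little-endian targets the copy
  // at offset 0 is already right-justified, so no displacement is needed.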
5791 if (Size < 8 && GPR_idx != NumGPRs) { 5792 // The easiest way to get this right-justified in a register 5793 // is to copy the structure into the rightmost portion of a 5794 // local variable slot, then load the whole slot into the 5795 // register. 5796 // FIXME: The memcpy seems to produce pretty awful code for 5797 // small aggregates, particularly for packed ones. 5798 // FIXME: It would be preferable to use the slot in the 5799 // parameter save area instead of a new local variable. 5800 SDValue AddPtr = PtrOff; 5801 if (!isLittleEndian) { 5802 SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType()); 5803 AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); 5804 } 5805 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, 5806 CallSeqStart, 5807 Flags, DAG, dl); 5808 5809 // Load the slot into the register. 5810 SDValue Load = 5811 DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo()); 5812 MemOpChains.push_back(Load.getValue(1)); 5813 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 5814 5815 // Done with this argument. 5816 ArgOffset += PtrByteSize; 5817 continue; 5818 } 5819 5820 // For aggregates larger than PtrByteSize, copy the pieces of the 5821 // object that fit into registers from the parameter save area. 5822 for (unsigned j=0; j<Size; j+=PtrByteSize) { 5823 SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType()); 5824 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 5825 if (GPR_idx != NumGPRs) { 5826 SDValue Load = 5827 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo()); 5828 MemOpChains.push_back(Load.getValue(1)); 5829 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 5830 ArgOffset += PtrByteSize; 5831 } else { 5832 ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize; 5833 break; 5834 } 5835 } 5836 continue; 5837 } 5838 5839 switch (Arg.getSimpleValueType().SimpleTy) { 5840 default: llvm_unreachable("Unexpected ValueType for argument!"); 5841 case MVT::i1: 5842 case MVT::i32: 5843 case MVT::i64: 5844 if (Flags.isNest()) { 5845 // The 'nest' parameter, if any, is passed in R11. 5846 RegsToPass.push_back(std::make_pair(PPC::X11, Arg)); 5847 hasNest = true; 5848 break; 5849 } 5850 5851 // These can be scalar arguments or elements of an integer array type 5852 // passed directly. Clang may use those instead of "byval" aggregate 5853 // types to avoid forcing arguments to memory unnecessarily. 5854 if (GPR_idx != NumGPRs) { 5855 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); 5856 } else { 5857 if (CallConv == CallingConv::Fast) 5858 ComputePtrOff(); 5859 5860 assert(HasParameterArea && 5861 "Parameter area must exist to pass an argument in memory."); 5862 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 5863 true, isTailCall, false, MemOpChains, 5864 TailCallArguments, dl); 5865 if (CallConv == CallingConv::Fast) 5866 ArgOffset += PtrByteSize; 5867 } 5868 if (CallConv != CallingConv::Fast) 5869 ArgOffset += PtrByteSize; 5870 break; 5871 case MVT::f32: 5872 case MVT::f64: { 5873 // These can be scalar arguments or elements of a float array type 5874 // passed directly. The latter are used to implement ELFv2 homogenous 5875 // float aggregates. 5876 5877 // Named arguments go into FPRs first, and once they overflow, the 5878 // remaining arguments go into GPRs and then the parameter save area. 5879 // Unnamed arguments for vararg functions always go to GPRs and 5880 // then the parameter save area. 
For now, put all arguments to vararg 5881 // routines always in both locations (FPR *and* GPR or stack slot). 5882 bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs; 5883 bool NeededLoad = false; 5884 5885 // First load the argument into the next available FPR. 5886 if (FPR_idx != NumFPRs) 5887 RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); 5888 5889 // Next, load the argument into GPR or stack slot if needed. 5890 if (!NeedGPROrStack) 5891 ; 5892 else if (GPR_idx != NumGPRs && CallConv != CallingConv::Fast) { 5893 // FIXME: We may want to re-enable this for CallingConv::Fast on the P8 5894 // once we support fp <-> gpr moves. 5895 5896 // In the non-vararg case, this can only ever happen in the 5897 // presence of f32 array types, since otherwise we never run 5898 // out of FPRs before running out of GPRs. 5899 SDValue ArgVal; 5900 5901 // Double values are always passed in a single GPR. 5902 if (Arg.getValueType() != MVT::f32) { 5903 ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 5904 5905 // Non-array float values are extended and passed in a GPR. 5906 } else if (!Flags.isInConsecutiveRegs()) { 5907 ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); 5908 ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); 5909 5910 // If we have an array of floats, we collect every odd element 5911 // together with its predecessor into one GPR. 5912 } else if (ArgOffset % PtrByteSize != 0) { 5913 SDValue Lo, Hi; 5914 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]); 5915 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); 5916 if (!isLittleEndian) 5917 std::swap(Lo, Hi); 5918 ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 5919 5920 // The final element, if even, goes into the first half of a GPR. 5921 } else if (Flags.isInConsecutiveRegsLast()) { 5922 ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg); 5923 ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal); 5924 if (!isLittleEndian) 5925 ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal, 5926 DAG.getConstant(32, dl, MVT::i32)); 5927 5928 // Non-final even elements are skipped; they will be handled 5929 // together the with subsequent argument on the next go-around. 5930 } else 5931 ArgVal = SDValue(); 5932 5933 if (ArgVal.getNode()) 5934 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal)); 5935 } else { 5936 if (CallConv == CallingConv::Fast) 5937 ComputePtrOff(); 5938 5939 // Single-precision floating-point values are mapped to the 5940 // second (rightmost) word of the stack doubleword. 5941 if (Arg.getValueType() == MVT::f32 && 5942 !isLittleEndian && !Flags.isInConsecutiveRegs()) { 5943 SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType()); 5944 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); 5945 } 5946 5947 assert(HasParameterArea && 5948 "Parameter area must exist to pass an argument in memory."); 5949 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 5950 true, isTailCall, false, MemOpChains, 5951 TailCallArguments, dl); 5952 5953 NeededLoad = true; 5954 } 5955 // When passing an array of floats, the array occupies consecutive 5956 // space in the argument area; only round up to the next doubleword 5957 // at the end of the array. Otherwise, each float takes 8 bytes. 5958 if (CallConv != CallingConv::Fast || NeededLoad) { 5959 ArgOffset += (Arg.getValueType() == MVT::f32 && 5960 Flags.isInConsecutiveRegs()) ? 
4 : 8; 5961 if (Flags.isInConsecutiveRegsLast()) 5962 ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; 5963 } 5964 break; 5965 } 5966 case MVT::v4f32: 5967 case MVT::v4i32: 5968 case MVT::v8i16: 5969 case MVT::v16i8: 5970 case MVT::v2f64: 5971 case MVT::v2i64: 5972 case MVT::v1i128: 5973 case MVT::f128: 5974 if (!Subtarget.hasQPX()) { 5975 // These can be scalar arguments or elements of a vector array type 5976 // passed directly. The latter are used to implement ELFv2 homogenous 5977 // vector aggregates. 5978 5979 // For a varargs call, named arguments go into VRs or on the stack as 5980 // usual; unnamed arguments always go to the stack or the corresponding 5981 // GPRs when within range. For now, we always put the value in both 5982 // locations (or even all three). 5983 if (isVarArg) { 5984 assert(HasParameterArea && 5985 "Parameter area must exist if we have a varargs call."); 5986 // We could elide this store in the case where the object fits 5987 // entirely in R registers. Maybe later. 5988 SDValue Store = 5989 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); 5990 MemOpChains.push_back(Store); 5991 if (VR_idx != NumVRs) { 5992 SDValue Load = 5993 DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo()); 5994 MemOpChains.push_back(Load.getValue(1)); 5995 RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load)); 5996 } 5997 ArgOffset += 16; 5998 for (unsigned i=0; i<16; i+=PtrByteSize) { 5999 if (GPR_idx == NumGPRs) 6000 break; 6001 SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, 6002 DAG.getConstant(i, dl, PtrVT)); 6003 SDValue Load = 6004 DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); 6005 MemOpChains.push_back(Load.getValue(1)); 6006 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6007 } 6008 break; 6009 } 6010 6011 // Non-varargs Altivec params go into VRs or on the stack. 6012 if (VR_idx != NumVRs) { 6013 RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg)); 6014 } else { 6015 if (CallConv == CallingConv::Fast) 6016 ComputePtrOff(); 6017 6018 assert(HasParameterArea && 6019 "Parameter area must exist to pass an argument in memory."); 6020 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6021 true, isTailCall, true, MemOpChains, 6022 TailCallArguments, dl); 6023 if (CallConv == CallingConv::Fast) 6024 ArgOffset += 16; 6025 } 6026 6027 if (CallConv != CallingConv::Fast) 6028 ArgOffset += 16; 6029 break; 6030 } // not QPX 6031 6032 assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 && 6033 "Invalid QPX parameter type"); 6034 6035 LLVM_FALLTHROUGH; 6036 case MVT::v4f64: 6037 case MVT::v4i1: { 6038 bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32; 6039 if (isVarArg) { 6040 assert(HasParameterArea && 6041 "Parameter area must exist if we have a varargs call."); 6042 // We could elide this store in the case where the object fits 6043 // entirely in R registers. Maybe later. 6044 SDValue Store = 6045 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); 6046 MemOpChains.push_back(Store); 6047 if (QFPR_idx != NumQFPRs) { 6048 SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store, 6049 PtrOff, MachinePointerInfo()); 6050 MemOpChains.push_back(Load.getValue(1)); 6051 RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load)); 6052 } 6053 ArgOffset += (IsF32 ? 16 : 32); 6054 for (unsigned i = 0; i < (IsF32 ? 
16U : 32U); i += PtrByteSize) { 6055 if (GPR_idx == NumGPRs) 6056 break; 6057 SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, 6058 DAG.getConstant(i, dl, PtrVT)); 6059 SDValue Load = 6060 DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); 6061 MemOpChains.push_back(Load.getValue(1)); 6062 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6063 } 6064 break; 6065 } 6066 6067 // Non-varargs QPX params go into registers or on the stack. 6068 if (QFPR_idx != NumQFPRs) { 6069 RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg)); 6070 } else { 6071 if (CallConv == CallingConv::Fast) 6072 ComputePtrOff(); 6073 6074 assert(HasParameterArea && 6075 "Parameter area must exist to pass an argument in memory."); 6076 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6077 true, isTailCall, true, MemOpChains, 6078 TailCallArguments, dl); 6079 if (CallConv == CallingConv::Fast) 6080 ArgOffset += (IsF32 ? 16 : 32); 6081 } 6082 6083 if (CallConv != CallingConv::Fast) 6084 ArgOffset += (IsF32 ? 16 : 32); 6085 break; 6086 } 6087 } 6088 } 6089 6090 assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) && 6091 "mismatch in size of parameter area"); 6092 (void)NumBytesActuallyUsed; 6093 6094 if (!MemOpChains.empty()) 6095 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 6096 6097 // Check if this is an indirect call (MTCTR/BCTRL). 6098 // See PrepareCall() for more information about calls through function 6099 // pointers in the 64-bit SVR4 ABI. 6100 if (!isTailCall && !isPatchPoint && 6101 !isFunctionGlobalAddress(Callee) && 6102 !isa<ExternalSymbolSDNode>(Callee)) { 6103 // Load r2 into a virtual register and store it to the TOC save area. 6104 setUsesTOCBasePtr(DAG); 6105 SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64); 6106 // TOC save area offset. 6107 unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); 6108 SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); 6109 SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); 6110 Chain = DAG.getStore( 6111 Val.getValue(1), dl, Val, AddPtr, 6112 MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset)); 6113 // In the ELFv2 ABI, R12 must contain the address of an indirect callee. 6114 // This does not mean the MTCTR instruction must use R12; it's easier 6115 // to model this as an extra parameter, so do that. 6116 if (isELFv2ABI && !isPatchPoint) 6117 RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee)); 6118 } 6119 6120 // Build a sequence of copy-to-reg nodes chained together with token chain 6121 // and flag operands which copy the outgoing args into the appropriate regs. 
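  // Each CopyToReg below consumes the glue value produced by the previous
  // one, so the argument copies form one glued sequence that stays attached
  // to the call emitted by FinishCall.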
6122   SDValue InFlag;
6123   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
6124     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
6125                              RegsToPass[i].second, InFlag);
6126     InFlag = Chain.getValue(1);
6127   }
6128 
6129   if (isTailCall && !IsSibCall)
6130     PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
6131                     TailCallArguments);
6132 
6133   return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest,
6134                     DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee,
6135                     SPDiff, NumBytes, Ins, InVals, CS);
6136 }
6137 
6138 SDValue PPCTargetLowering::LowerCall_Darwin(
6139     SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
6140     bool isTailCall, bool isPatchPoint,
6141     const SmallVectorImpl<ISD::OutputArg> &Outs,
6142     const SmallVectorImpl<SDValue> &OutVals,
6143     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
6144     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
6145     ImmutableCallSite CS) const {
6146   unsigned NumOps = Outs.size();
6147 
6148   EVT PtrVT = getPointerTy(DAG.getDataLayout());
6149   bool isPPC64 = PtrVT == MVT::i64;
6150   unsigned PtrByteSize = isPPC64 ? 8 : 4;
6151 
6152   MachineFunction &MF = DAG.getMachineFunction();
6153 
6154   // Mark this function as potentially containing a tail call. As a
6155   // consequence, the frame pointer will be used for dynamic stack allocation
6156   // and for restoring the caller's stack pointer in this function's epilogue.
6157   // This is done because a tail-called function might overwrite the value in
6158   // this function's (MF) stack pointer stack slot 0(SP).
6159   if (getTargetMachine().Options.GuaranteedTailCallOpt &&
6160       CallConv == CallingConv::Fast)
6161     MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
6162 
6163   // Count how many bytes are to be pushed on the stack, including the linkage
6164   // area and the parameter passing area. We start with 24/48 bytes, which is
6165   // pre-reserved space for [SP][CR][LR][3 x unused].
6166   unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
6167   unsigned NumBytes = LinkageSize;
6168 
6169   // Add up all the space actually used.
6170   // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
6171   // they all go in registers, but we must reserve stack space for them for
6172   // possible use by the caller. In varargs or 64-bit calls, parameters are
6173   // assigned stack space in order, with padding so Altivec parameters are
6174   // 16-byte aligned.
6175   unsigned nAltivecParamsAtEnd = 0;
6176   for (unsigned i = 0; i != NumOps; ++i) {
6177     ISD::ArgFlagsTy Flags = Outs[i].Flags;
6178     EVT ArgVT = Outs[i].VT;
6179     // Varargs Altivec parameters are padded to a 16-byte boundary.
6180     if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
6181         ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
6182         ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
6183       if (!isVarArg && !isPPC64) {
6184         // Non-varargs Altivec parameters go after all the non-Altivec
6185         // parameters; handle those later so we know how much padding we need.
6186         nAltivecParamsAtEnd++;
6187         continue;
6188       }
6189       // Varargs and 64-bit Altivec parameters are padded to a 16-byte boundary.
6190       NumBytes = ((NumBytes+15)/16)*16;
6191     }
6192     NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
6193   }
6194 
6195   // Allow for Altivec parameters at the end, if needed.
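  // For example, if NumBytes is 52 after the loop above and two Altivec
  // parameters were deferred to the end, NumBytes is first rounded up to 64
  // and then grows by 2 * 16 to 96, one 16-byte-aligned slot per vector.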
6196 if (nAltivecParamsAtEnd) { 6197 NumBytes = ((NumBytes+15)/16)*16; 6198 NumBytes += 16*nAltivecParamsAtEnd; 6199 } 6200 6201 // The prolog code of the callee may store up to 8 GPR argument registers to 6202 // the stack, allowing va_start to index over them in memory if its varargs. 6203 // Because we cannot tell if this is needed on the caller side, we have to 6204 // conservatively assume that it is needed. As such, make sure we have at 6205 // least enough stack space for the caller to store the 8 GPRs. 6206 NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); 6207 6208 // Tail call needs the stack to be aligned. 6209 if (getTargetMachine().Options.GuaranteedTailCallOpt && 6210 CallConv == CallingConv::Fast) 6211 NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes); 6212 6213 // Calculate by how many bytes the stack has to be adjusted in case of tail 6214 // call optimization. 6215 int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); 6216 6217 // To protect arguments on the stack from being clobbered in a tail call, 6218 // force all the loads to happen before doing any other lowering. 6219 if (isTailCall) 6220 Chain = DAG.getStackArgumentTokenFactor(Chain); 6221 6222 // Adjust the stack pointer for the new arguments... 6223 // These operations are automatically eliminated by the prolog/epilog pass 6224 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); 6225 SDValue CallSeqStart = Chain; 6226 6227 // Load the return address and frame pointer so it can be move somewhere else 6228 // later. 6229 SDValue LROp, FPOp; 6230 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); 6231 6232 // Set up a copy of the stack pointer for use loading and storing any 6233 // arguments that may not fit in the registers available for argument 6234 // passing. 6235 SDValue StackPtr; 6236 if (isPPC64) 6237 StackPtr = DAG.getRegister(PPC::X1, MVT::i64); 6238 else 6239 StackPtr = DAG.getRegister(PPC::R1, MVT::i32); 6240 6241 // Figure out which arguments are going to go in registers, and which in 6242 // memory. Also, if this is a vararg function, floating point operations 6243 // must be stored to our stack, and loaded into integer regs as well, if 6244 // any integer regs are available for argument passing. 6245 unsigned ArgOffset = LinkageSize; 6246 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; 6247 6248 static const MCPhysReg GPR_32[] = { // 32-bit registers. 6249 PPC::R3, PPC::R4, PPC::R5, PPC::R6, 6250 PPC::R7, PPC::R8, PPC::R9, PPC::R10, 6251 }; 6252 static const MCPhysReg GPR_64[] = { // 64-bit registers. 6253 PPC::X3, PPC::X4, PPC::X5, PPC::X6, 6254 PPC::X7, PPC::X8, PPC::X9, PPC::X10, 6255 }; 6256 static const MCPhysReg VR[] = { 6257 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, 6258 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 6259 }; 6260 const unsigned NumGPRs = array_lengthof(GPR_32); 6261 const unsigned NumFPRs = 13; 6262 const unsigned NumVRs = array_lengthof(VR); 6263 6264 const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; 6265 6266 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 6267 SmallVector<TailCallArgumentInfo, 8> TailCallArguments; 6268 6269 SmallVector<SDValue, 8> MemOpChains; 6270 for (unsigned i = 0; i != NumOps; ++i) { 6271 SDValue Arg = OutVals[i]; 6272 ISD::ArgFlagsTy Flags = Outs[i].Flags; 6273 6274 // PtrOff will be used to store the current argument to the stack if a 6275 // register cannot be found for it. 
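  // For example, on 32-bit Darwin the parameter save area starts at R1 + 24
  // (the pre-reserved linkage area noted above), so each argument's PtrOff is
  // the stack pointer plus its accumulated ArgOffset.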
6276 SDValue PtrOff; 6277 6278 PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType()); 6279 6280 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); 6281 6282 // On PPC64, promote integers to 64-bit values. 6283 if (isPPC64 && Arg.getValueType() == MVT::i32) { 6284 // FIXME: Should this use ANY_EXTEND if neither sext nor zext? 6285 unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 6286 Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); 6287 } 6288 6289 // FIXME memcpy is used way more than necessary. Correctness first. 6290 // Note: "by value" is code for passing a structure by value, not 6291 // basic types. 6292 if (Flags.isByVal()) { 6293 unsigned Size = Flags.getByValSize(); 6294 // Very small objects are passed right-justified. Everything else is 6295 // passed left-justified. 6296 if (Size==1 || Size==2) { 6297 EVT VT = (Size==1) ? MVT::i8 : MVT::i16; 6298 if (GPR_idx != NumGPRs) { 6299 SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, 6300 MachinePointerInfo(), VT); 6301 MemOpChains.push_back(Load.getValue(1)); 6302 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6303 6304 ArgOffset += PtrByteSize; 6305 } else { 6306 SDValue Const = DAG.getConstant(PtrByteSize - Size, dl, 6307 PtrOff.getValueType()); 6308 SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); 6309 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, 6310 CallSeqStart, 6311 Flags, DAG, dl); 6312 ArgOffset += PtrByteSize; 6313 } 6314 continue; 6315 } 6316 // Copy entire object into memory. There are cases where gcc-generated 6317 // code assumes it is there, even if it could be put entirely into 6318 // registers. (This is not what the doc says.) 6319 Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, 6320 CallSeqStart, 6321 Flags, DAG, dl); 6322 6323 // For small aggregates (Darwin only) and aggregates >= PtrByteSize, 6324 // copy the pieces of the object that fit into registers from the 6325 // parameter save area. 
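  // For example, with PtrByteSize == 4 a 12-byte aggregate is loaded below as
  // three words; if only one GPR remains, the first word goes in that register
  // and ArgOffset then skips past the remaining 8 bytes, which stay in the
  // parameter save area copy made above.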
6326 for (unsigned j=0; j<Size; j+=PtrByteSize) { 6327 SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType()); 6328 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 6329 if (GPR_idx != NumGPRs) { 6330 SDValue Load = 6331 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo()); 6332 MemOpChains.push_back(Load.getValue(1)); 6333 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6334 ArgOffset += PtrByteSize; 6335 } else { 6336 ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize; 6337 break; 6338 } 6339 } 6340 continue; 6341 } 6342 6343 switch (Arg.getSimpleValueType().SimpleTy) { 6344 default: llvm_unreachable("Unexpected ValueType for argument!"); 6345 case MVT::i1: 6346 case MVT::i32: 6347 case MVT::i64: 6348 if (GPR_idx != NumGPRs) { 6349 if (Arg.getValueType() == MVT::i1) 6350 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg); 6351 6352 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); 6353 } else { 6354 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6355 isPPC64, isTailCall, false, MemOpChains, 6356 TailCallArguments, dl); 6357 } 6358 ArgOffset += PtrByteSize; 6359 break; 6360 case MVT::f32: 6361 case MVT::f64: 6362 if (FPR_idx != NumFPRs) { 6363 RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); 6364 6365 if (isVarArg) { 6366 SDValue Store = 6367 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); 6368 MemOpChains.push_back(Store); 6369 6370 // Float varargs are always shadowed in available integer registers 6371 if (GPR_idx != NumGPRs) { 6372 SDValue Load = 6373 DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo()); 6374 MemOpChains.push_back(Load.getValue(1)); 6375 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6376 } 6377 if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){ 6378 SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType()); 6379 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); 6380 SDValue Load = 6381 DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo()); 6382 MemOpChains.push_back(Load.getValue(1)); 6383 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6384 } 6385 } else { 6386 // If we have any FPRs remaining, we may also have GPRs remaining. 6387 // Args passed in FPRs consume either 1 (f32) or 2 (f64) available 6388 // GPRs. 6389 if (GPR_idx != NumGPRs) 6390 ++GPR_idx; 6391 if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && 6392 !isPPC64) // PPC64 has 64-bit GPR's obviously :) 6393 ++GPR_idx; 6394 } 6395 } else 6396 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6397 isPPC64, isTailCall, false, MemOpChains, 6398 TailCallArguments, dl); 6399 if (isPPC64) 6400 ArgOffset += 8; 6401 else 6402 ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8; 6403 break; 6404 case MVT::v4f32: 6405 case MVT::v4i32: 6406 case MVT::v8i16: 6407 case MVT::v16i8: 6408 if (isVarArg) { 6409 // These go aligned on the stack, or in the corresponding R registers 6410 // when within range. The Darwin PPC ABI doc claims they also go in 6411 // V registers; in fact gcc does this only for arguments that are 6412 // prototyped, not for those that match the ... We do it for all 6413 // arguments, seems to work. 6414 while (ArgOffset % 16 !=0) { 6415 ArgOffset += PtrByteSize; 6416 if (GPR_idx != NumGPRs) 6417 GPR_idx++; 6418 } 6419 // We could elide this store in the case where the object fits 6420 // entirely in R registers. Maybe later. 
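  // Illustratively, for a varargs v4i32 argument the 16-byte value is stored
  // to its (16-byte-aligned) stack slot and then reloaded both into the next
  // VR, if one is free, and into up to 16 / PtrByteSize GPRs, so the callee
  // can find it wherever it looks.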
6421 PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, 6422 DAG.getConstant(ArgOffset, dl, PtrVT)); 6423 SDValue Store = 6424 DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); 6425 MemOpChains.push_back(Store); 6426 if (VR_idx != NumVRs) { 6427 SDValue Load = 6428 DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo()); 6429 MemOpChains.push_back(Load.getValue(1)); 6430 RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load)); 6431 } 6432 ArgOffset += 16; 6433 for (unsigned i=0; i<16; i+=PtrByteSize) { 6434 if (GPR_idx == NumGPRs) 6435 break; 6436 SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, 6437 DAG.getConstant(i, dl, PtrVT)); 6438 SDValue Load = 6439 DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); 6440 MemOpChains.push_back(Load.getValue(1)); 6441 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); 6442 } 6443 break; 6444 } 6445 6446 // Non-varargs Altivec params generally go in registers, but have 6447 // stack space allocated at the end. 6448 if (VR_idx != NumVRs) { 6449 // Doesn't have GPR space allocated. 6450 RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg)); 6451 } else if (nAltivecParamsAtEnd==0) { 6452 // We are emitting Altivec params in order. 6453 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6454 isPPC64, isTailCall, true, MemOpChains, 6455 TailCallArguments, dl); 6456 ArgOffset += 16; 6457 } 6458 break; 6459 } 6460 } 6461 // If all Altivec parameters fit in registers, as they usually do, 6462 // they get stack space following the non-Altivec parameters. We 6463 // don't track this here because nobody below needs it. 6464 // If there are more Altivec parameters than fit in registers emit 6465 // the stores here. 6466 if (!isVarArg && nAltivecParamsAtEnd > NumVRs) { 6467 unsigned j = 0; 6468 // Offset is aligned; skip 1st 12 params which go in V registers. 6469 ArgOffset = ((ArgOffset+15)/16)*16; 6470 ArgOffset += 12*16; 6471 for (unsigned i = 0; i != NumOps; ++i) { 6472 SDValue Arg = OutVals[i]; 6473 EVT ArgType = Outs[i].VT; 6474 if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 || 6475 ArgType==MVT::v8i16 || ArgType==MVT::v16i8) { 6476 if (++j > NumVRs) { 6477 SDValue PtrOff; 6478 // We are emitting Altivec params in order. 6479 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, 6480 isPPC64, isTailCall, true, MemOpChains, 6481 TailCallArguments, dl); 6482 ArgOffset += 16; 6483 } 6484 } 6485 } 6486 } 6487 6488 if (!MemOpChains.empty()) 6489 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 6490 6491 // On Darwin, R12 must contain the address of an indirect callee. This does 6492 // not mean the MTCTR instruction must use R12; it's easier to model this as 6493 // an extra parameter, so do that. 6494 if (!isTailCall && 6495 !isFunctionGlobalAddress(Callee) && 6496 !isa<ExternalSymbolSDNode>(Callee) && 6497 !isBLACompatibleAddress(Callee, DAG)) 6498 RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 : 6499 PPC::R12), Callee)); 6500 6501 // Build a sequence of copy-to-reg nodes chained together with token chain 6502 // and flag operands which copy the outgoing args into the appropriate regs. 
6503 SDValue InFlag; 6504 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 6505 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 6506 RegsToPass[i].second, InFlag); 6507 InFlag = Chain.getValue(1); 6508 } 6509 6510 if (isTailCall) 6511 PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, 6512 TailCallArguments); 6513 6514 return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, 6515 /* unused except on PPC64 ELFv1 */ false, DAG, 6516 RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, 6517 NumBytes, Ins, InVals, CS); 6518 } 6519 6520 bool 6521 PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 6522 MachineFunction &MF, bool isVarArg, 6523 const SmallVectorImpl<ISD::OutputArg> &Outs, 6524 LLVMContext &Context) const { 6525 SmallVector<CCValAssign, 16> RVLocs; 6526 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 6527 return CCInfo.CheckReturn( 6528 Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold) 6529 ? RetCC_PPC_Cold 6530 : RetCC_PPC); 6531 } 6532 6533 SDValue 6534 PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 6535 bool isVarArg, 6536 const SmallVectorImpl<ISD::OutputArg> &Outs, 6537 const SmallVectorImpl<SDValue> &OutVals, 6538 const SDLoc &dl, SelectionDAG &DAG) const { 6539 SmallVector<CCValAssign, 16> RVLocs; 6540 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 6541 *DAG.getContext()); 6542 CCInfo.AnalyzeReturn(Outs, 6543 (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold) 6544 ? RetCC_PPC_Cold 6545 : RetCC_PPC); 6546 6547 SDValue Flag; 6548 SmallVector<SDValue, 4> RetOps(1, Chain); 6549 6550 // Copy the result values into the output registers. 6551 for (unsigned i = 0; i != RVLocs.size(); ++i) { 6552 CCValAssign &VA = RVLocs[i]; 6553 assert(VA.isRegLoc() && "Can only return in registers!"); 6554 6555 SDValue Arg = OutVals[i]; 6556 6557 switch (VA.getLocInfo()) { 6558 default: llvm_unreachable("Unknown loc info!"); 6559 case CCValAssign::Full: break; 6560 case CCValAssign::AExt: 6561 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 6562 break; 6563 case CCValAssign::ZExt: 6564 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 6565 break; 6566 case CCValAssign::SExt: 6567 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 6568 break; 6569 } 6570 6571 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 6572 Flag = Chain.getValue(1); 6573 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 6574 } 6575 6576 const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); 6577 const MCPhysReg *I = 6578 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 6579 if (I) { 6580 for (; *I; ++I) { 6581 6582 if (PPC::G8RCRegClass.contains(*I)) 6583 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 6584 else if (PPC::F8RCRegClass.contains(*I)) 6585 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 6586 else if (PPC::CRRCRegClass.contains(*I)) 6587 RetOps.push_back(DAG.getRegister(*I, MVT::i1)); 6588 else if (PPC::VRRCRegClass.contains(*I)) 6589 RetOps.push_back(DAG.getRegister(*I, MVT::Other)); 6590 else 6591 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 6592 } 6593 } 6594 6595 RetOps[0] = Chain; // Update chain. 6596 6597 // Add the flag if we have it. 
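  // At this point RetOps holds the chain, one Register node per value
  // returned in a register (plus any callee-saved registers returned via
  // copy), and, below, the optional glue; the RET_FLAG node built from it is
  // what ultimately becomes the function's blr.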
6598   if (Flag.getNode())
6599     RetOps.push_back(Flag);
6600 
6601   return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
6602 }
6603 
6604 SDValue
6605 PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
6606                                                 SelectionDAG &DAG) const {
6607   SDLoc dl(Op);
6608 
6609   // Get the correct type for integers.
6610   EVT IntVT = Op.getValueType();
6611 
6612   // Get the inputs.
6613   SDValue Chain = Op.getOperand(0);
6614   SDValue FPSIdx = getFramePointerFrameIndex(DAG);
6615   // Build a DYNAREAOFFSET node.
6616   SDValue Ops[2] = {Chain, FPSIdx};
6617   SDVTList VTs = DAG.getVTList(IntVT);
6618   return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops);
6619 }
6620 
6621 SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
6622                                              SelectionDAG &DAG) const {
6623   // When we pop the dynamic allocation we need to restore the SP link.
6624   SDLoc dl(Op);
6625 
6626   // Get the correct type for pointers.
6627   EVT PtrVT = getPointerTy(DAG.getDataLayout());
6628 
6629   // Construct the stack pointer operand.
6630   bool isPPC64 = Subtarget.isPPC64();
6631   unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
6632   SDValue StackPtr = DAG.getRegister(SP, PtrVT);
6633 
6634   // Get the operands for the STACKRESTORE.
6635   SDValue Chain = Op.getOperand(0);
6636   SDValue SaveSP = Op.getOperand(1);
6637 
6638   // Load the old link SP.
6639   SDValue LoadLinkSP =
6640       DAG.getLoad(PtrVT, dl, Chain, StackPtr, MachinePointerInfo());
6641 
6642   // Restore the stack pointer.
6643   Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);
6644 
6645   // Store the old link SP.
6646   return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo());
6647 }
6648 
6649 SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
6650   MachineFunction &MF = DAG.getMachineFunction();
6651   bool isPPC64 = Subtarget.isPPC64();
6652   EVT PtrVT = getPointerTy(MF.getDataLayout());
6653 
6654   // Get the current return address save index. This identifies the fixed
6655   // stack object created at the return address (LR) save offset.
6656   PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
6657   int RASI = FI->getReturnAddrSaveIndex();
6658 
6659   // If the return address save index hasn't been defined yet.
6660   if (!RASI) {
6661     // Find out the fixed offset of the return address save area.
6662     int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
6663     // Allocate the frame index for the return address save area.
6664     RASI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, LROffset, false);
6665     // Save the result.
6666     FI->setReturnAddrSaveIndex(RASI);
6667   }
6668   return DAG.getFrameIndex(RASI, PtrVT);
6669 }
6670 
6671 SDValue
6672 PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
6673   MachineFunction &MF = DAG.getMachineFunction();
6674   bool isPPC64 = Subtarget.isPPC64();
6675   EVT PtrVT = getPointerTy(MF.getDataLayout());
6676 
6677   // Get the current frame pointer save index. The users of this index will be
6678   // primarily DYNALLOC instructions.
6679   PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
6680   int FPSI = FI->getFramePointerSaveIndex();
6681 
6682   // If the frame pointer save index hasn't been defined yet.
6683   if (!FPSI) {
6684     // Find out the fixed offset of the frame pointer save area.
6685     int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
6686     // Allocate the frame index for the frame pointer save area.
6687     FPSI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
6688     // Save the result.
6689 FI->setFramePointerSaveIndex(FPSI); 6690 } 6691 return DAG.getFrameIndex(FPSI, PtrVT); 6692 } 6693 6694 SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 6695 SelectionDAG &DAG) const { 6696 // Get the inputs. 6697 SDValue Chain = Op.getOperand(0); 6698 SDValue Size = Op.getOperand(1); 6699 SDLoc dl(Op); 6700 6701 // Get the correct type for pointers. 6702 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 6703 // Negate the size. 6704 SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT, 6705 DAG.getConstant(0, dl, PtrVT), Size); 6706 // Construct a node for the frame pointer save index. 6707 SDValue FPSIdx = getFramePointerFrameIndex(DAG); 6708 // Build a DYNALLOC node. 6709 SDValue Ops[3] = { Chain, NegSize, FPSIdx }; 6710 SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other); 6711 return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops); 6712 } 6713 6714 SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op, 6715 SelectionDAG &DAG) const { 6716 MachineFunction &MF = DAG.getMachineFunction(); 6717 6718 bool isPPC64 = Subtarget.isPPC64(); 6719 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 6720 6721 int FI = MF.getFrameInfo().CreateFixedObject(isPPC64 ? 8 : 4, 0, false); 6722 return DAG.getFrameIndex(FI, PtrVT); 6723 } 6724 6725 SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, 6726 SelectionDAG &DAG) const { 6727 SDLoc DL(Op); 6728 return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL, 6729 DAG.getVTList(MVT::i32, MVT::Other), 6730 Op.getOperand(0), Op.getOperand(1)); 6731 } 6732 6733 SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, 6734 SelectionDAG &DAG) const { 6735 SDLoc DL(Op); 6736 return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other, 6737 Op.getOperand(0), Op.getOperand(1)); 6738 } 6739 6740 SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 6741 if (Op.getValueType().isVector()) 6742 return LowerVectorLoad(Op, DAG); 6743 6744 assert(Op.getValueType() == MVT::i1 && 6745 "Custom lowering only for i1 loads"); 6746 6747 // First, load 8 bits into 32 bits, then truncate to 1 bit. 6748 6749 SDLoc dl(Op); 6750 LoadSDNode *LD = cast<LoadSDNode>(Op); 6751 6752 SDValue Chain = LD->getChain(); 6753 SDValue BasePtr = LD->getBasePtr(); 6754 MachineMemOperand *MMO = LD->getMemOperand(); 6755 6756 SDValue NewLD = 6757 DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain, 6758 BasePtr, MVT::i8, MMO); 6759 SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD); 6760 6761 SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) }; 6762 return DAG.getMergeValues(Ops, dl); 6763 } 6764 6765 SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 6766 if (Op.getOperand(1).getValueType().isVector()) 6767 return LowerVectorStore(Op, DAG); 6768 6769 assert(Op.getOperand(1).getValueType() == MVT::i1 && 6770 "Custom lowering only for i1 stores"); 6771 6772 // First, zero extend to 32 bits, then use a truncating store to 8 bits. 
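  // Roughly, the DAG produced here is
  //   (truncstore<i8> (zero_extend %val), %ptr)
  // so an i1 value always occupies a full byte in memory.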
6773 6774 SDLoc dl(Op); 6775 StoreSDNode *ST = cast<StoreSDNode>(Op); 6776 6777 SDValue Chain = ST->getChain(); 6778 SDValue BasePtr = ST->getBasePtr(); 6779 SDValue Value = ST->getValue(); 6780 MachineMemOperand *MMO = ST->getMemOperand(); 6781 6782 Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()), 6783 Value); 6784 return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO); 6785 } 6786 6787 // FIXME: Remove this once the ANDI glue bug is fixed: 6788 SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { 6789 assert(Op.getValueType() == MVT::i1 && 6790 "Custom lowering only for i1 results"); 6791 6792 SDLoc DL(Op); 6793 return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, DL, MVT::i1, 6794 Op.getOperand(0)); 6795 } 6796 6797 /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when 6798 /// possible. 6799 SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 6800 // Not FP? Not a fsel. 6801 if (!Op.getOperand(0).getValueType().isFloatingPoint() || 6802 !Op.getOperand(2).getValueType().isFloatingPoint()) 6803 return Op; 6804 6805 // We might be able to do better than this under some circumstances, but in 6806 // general, fsel-based lowering of select is a finite-math-only optimization. 6807 // For more information, see section F.3 of the 2.06 ISA specification. 6808 if (!DAG.getTarget().Options.NoInfsFPMath || 6809 !DAG.getTarget().Options.NoNaNsFPMath) 6810 return Op; 6811 // TODO: Propagate flags from the select rather than global settings. 6812 SDNodeFlags Flags; 6813 Flags.setNoInfs(true); 6814 Flags.setNoNaNs(true); 6815 6816 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 6817 6818 EVT ResVT = Op.getValueType(); 6819 EVT CmpVT = Op.getOperand(0).getValueType(); 6820 SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); 6821 SDValue TV = Op.getOperand(2), FV = Op.getOperand(3); 6822 SDLoc dl(Op); 6823 6824 // If the RHS of the comparison is a 0.0, we don't need to do the 6825 // subtraction at all. 6826 SDValue Sel1; 6827 if (isFloatingPointZero(RHS)) 6828 switch (CC) { 6829 default: break; // SETUO etc aren't handled by fsel. 
6830 case ISD::SETNE: 6831 std::swap(TV, FV); 6832 LLVM_FALLTHROUGH; 6833 case ISD::SETEQ: 6834 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits 6835 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); 6836 Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); 6837 if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits 6838 Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); 6839 return DAG.getNode(PPCISD::FSEL, dl, ResVT, 6840 DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV); 6841 case ISD::SETULT: 6842 case ISD::SETLT: 6843 std::swap(TV, FV); // fsel is natively setge, swap operands for setlt 6844 LLVM_FALLTHROUGH; 6845 case ISD::SETOGE: 6846 case ISD::SETGE: 6847 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits 6848 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); 6849 return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); 6850 case ISD::SETUGT: 6851 case ISD::SETGT: 6852 std::swap(TV, FV); // fsel is natively setge, swap operands for setlt 6853 LLVM_FALLTHROUGH; 6854 case ISD::SETOLE: 6855 case ISD::SETLE: 6856 if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits 6857 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); 6858 return DAG.getNode(PPCISD::FSEL, dl, ResVT, 6859 DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV); 6860 } 6861 6862 SDValue Cmp; 6863 switch (CC) { 6864 default: break; // SETUO etc aren't handled by fsel. 6865 case ISD::SETNE: 6866 std::swap(TV, FV); 6867 LLVM_FALLTHROUGH; 6868 case ISD::SETEQ: 6869 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); 6870 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6871 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6872 Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); 6873 if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits 6874 Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); 6875 return DAG.getNode(PPCISD::FSEL, dl, ResVT, 6876 DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV); 6877 case ISD::SETULT: 6878 case ISD::SETLT: 6879 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); 6880 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6881 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6882 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); 6883 case ISD::SETOGE: 6884 case ISD::SETGE: 6885 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); 6886 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6887 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6888 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); 6889 case ISD::SETUGT: 6890 case ISD::SETGT: 6891 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); 6892 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6893 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6894 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); 6895 case ISD::SETOLE: 6896 case ISD::SETLE: 6897 Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags); 6898 if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits 6899 Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); 6900 return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); 6901 } 6902 return Op; 6903 } 6904 6905 void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, 6906 SelectionDAG &DAG, 6907 const SDLoc &dl) const { 6908 assert(Op.getOperand(0).getValueType().isFloatingPoint()); 6909 SDValue Src = Op.getOperand(0); 6910 if (Src.getValueType() == 
MVT::f32) 6911 Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); 6912 6913 SDValue Tmp; 6914 switch (Op.getSimpleValueType().SimpleTy) { 6915 default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); 6916 case MVT::i32: 6917 Tmp = DAG.getNode( 6918 Op.getOpcode() == ISD::FP_TO_SINT 6919 ? PPCISD::FCTIWZ 6920 : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), 6921 dl, MVT::f64, Src); 6922 break; 6923 case MVT::i64: 6924 assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && 6925 "i64 FP_TO_UINT is supported only with FPCVT"); 6926 Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : 6927 PPCISD::FCTIDUZ, 6928 dl, MVT::f64, Src); 6929 break; 6930 } 6931 6932 // Convert the FP value to an int value through memory. 6933 bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() && 6934 (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); 6935 SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); 6936 int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex(); 6937 MachinePointerInfo MPI = 6938 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); 6939 6940 // Emit a store to the stack slot. 6941 SDValue Chain; 6942 if (i32Stack) { 6943 MachineFunction &MF = DAG.getMachineFunction(); 6944 MachineMemOperand *MMO = 6945 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4); 6946 SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr }; 6947 Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, 6948 DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO); 6949 } else 6950 Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI); 6951 6952 // Result is a load from the stack slot. If loading 4 bytes, make sure to 6953 // add in a bias on big endian. 6954 if (Op.getValueType() == MVT::i32 && !i32Stack) { 6955 FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, 6956 DAG.getConstant(4, dl, FIPtr.getValueType())); 6957 MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4); 6958 } 6959 6960 RLI.Chain = Chain; 6961 RLI.Ptr = FIPtr; 6962 RLI.MPI = MPI; 6963 } 6964 6965 /// Custom lowers floating point to integer conversions to use 6966 /// the direct move instructions available in ISA 2.07 to avoid the 6967 /// need for load/store combinations. 6968 SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op, 6969 SelectionDAG &DAG, 6970 const SDLoc &dl) const { 6971 assert(Op.getOperand(0).getValueType().isFloatingPoint()); 6972 SDValue Src = Op.getOperand(0); 6973 6974 if (Src.getValueType() == MVT::f32) 6975 Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); 6976 6977 SDValue Tmp; 6978 switch (Op.getSimpleValueType().SimpleTy) { 6979 default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); 6980 case MVT::i32: 6981 Tmp = DAG.getNode( 6982 Op.getOpcode() == ISD::FP_TO_SINT 6983 ? PPCISD::FCTIWZ 6984 : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), 6985 dl, MVT::f64, Src); 6986 Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp); 6987 break; 6988 case MVT::i64: 6989 assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && 6990 "i64 FP_TO_UINT is supported only with FPCVT"); 6991 Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? 
PPCISD::FCTIDZ : 6992 PPCISD::FCTIDUZ, 6993 dl, MVT::f64, Src); 6994 Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp); 6995 break; 6996 } 6997 return Tmp; 6998 } 6999 7000 SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, 7001 const SDLoc &dl) const { 7002 7003 // FP to INT conversions are legal for f128. 7004 if (EnableQuadPrecision && (Op->getOperand(0).getValueType() == MVT::f128)) 7005 return Op; 7006 7007 // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on 7008 // PPC (the libcall is not available). 7009 if (Op.getOperand(0).getValueType() == MVT::ppcf128) { 7010 if (Op.getValueType() == MVT::i32) { 7011 if (Op.getOpcode() == ISD::FP_TO_SINT) { 7012 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, 7013 MVT::f64, Op.getOperand(0), 7014 DAG.getIntPtrConstant(0, dl)); 7015 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, 7016 MVT::f64, Op.getOperand(0), 7017 DAG.getIntPtrConstant(1, dl)); 7018 7019 // Add the two halves of the long double in round-to-zero mode. 7020 SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi); 7021 7022 // Now use a smaller FP_TO_SINT. 7023 return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res); 7024 } 7025 if (Op.getOpcode() == ISD::FP_TO_UINT) { 7026 const uint64_t TwoE31[] = {0x41e0000000000000LL, 0}; 7027 APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31)); 7028 SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128); 7029 // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X 7030 // FIXME: generated code sucks. 7031 // TODO: Are there fast-math-flags to propagate to this FSUB? 7032 SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128, 7033 Op.getOperand(0), Tmp); 7034 True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True); 7035 True = DAG.getNode(ISD::ADD, dl, MVT::i32, True, 7036 DAG.getConstant(0x80000000, dl, MVT::i32)); 7037 SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, 7038 Op.getOperand(0)); 7039 return DAG.getSelectCC(dl, Op.getOperand(0), Tmp, True, False, 7040 ISD::SETGE); 7041 } 7042 } 7043 7044 return SDValue(); 7045 } 7046 7047 if (Subtarget.hasDirectMove() && Subtarget.isPPC64()) 7048 return LowerFP_TO_INTDirectMove(Op, DAG, dl); 7049 7050 ReuseLoadInfo RLI; 7051 LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); 7052 7053 return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI, 7054 RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges); 7055 } 7056 7057 // We're trying to insert a regular store, S, and then a load, L. If the 7058 // incoming value, O, is a load, we might just be able to have our load use the 7059 // address used by O. However, we don't know if anything else will store to 7060 // that address before we can load from it. To prevent this situation, we need 7061 // to insert our load, L, into the chain as a peer of O. To do this, we give L 7062 // the same chain operand as O, we create a token factor from the chain results 7063 // of O and L, and we replace all uses of O's chain result with that token 7064 // factor (see spliceIntoChain below for this last part). 
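// Illustratively, if the original use of the chain was
//   O.chain  -->  (users of O's chain result)
// then after spliceIntoChain the users instead see
//   TokenFactor(O.chain, L.chain)  -->  (users of O's chain result)
// so anything that was ordered after O is now also ordered after the new
// load L.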
7065 bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT, 7066 ReuseLoadInfo &RLI, 7067 SelectionDAG &DAG, 7068 ISD::LoadExtType ET) const { 7069 SDLoc dl(Op); 7070 if (ET == ISD::NON_EXTLOAD && 7071 (Op.getOpcode() == ISD::FP_TO_UINT || 7072 Op.getOpcode() == ISD::FP_TO_SINT) && 7073 isOperationLegalOrCustom(Op.getOpcode(), 7074 Op.getOperand(0).getValueType())) { 7075 7076 LowerFP_TO_INTForReuse(Op, RLI, DAG, dl); 7077 return true; 7078 } 7079 7080 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op); 7081 if (!LD || LD->getExtensionType() != ET || LD->isVolatile() || 7082 LD->isNonTemporal()) 7083 return false; 7084 if (LD->getMemoryVT() != MemVT) 7085 return false; 7086 7087 RLI.Ptr = LD->getBasePtr(); 7088 if (LD->isIndexed() && !LD->getOffset().isUndef()) { 7089 assert(LD->getAddressingMode() == ISD::PRE_INC && 7090 "Non-pre-inc AM on PPC?"); 7091 RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr, 7092 LD->getOffset()); 7093 } 7094 7095 RLI.Chain = LD->getChain(); 7096 RLI.MPI = LD->getPointerInfo(); 7097 RLI.IsDereferenceable = LD->isDereferenceable(); 7098 RLI.IsInvariant = LD->isInvariant(); 7099 RLI.Alignment = LD->getAlignment(); 7100 RLI.AAInfo = LD->getAAInfo(); 7101 RLI.Ranges = LD->getRanges(); 7102 7103 RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1); 7104 return true; 7105 } 7106 7107 // Given the head of the old chain, ResChain, insert a token factor containing 7108 // it and NewResChain, and make users of ResChain now be users of that token 7109 // factor. 7110 // TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead. 7111 void PPCTargetLowering::spliceIntoChain(SDValue ResChain, 7112 SDValue NewResChain, 7113 SelectionDAG &DAG) const { 7114 if (!ResChain) 7115 return; 7116 7117 SDLoc dl(NewResChain); 7118 7119 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 7120 NewResChain, DAG.getUNDEF(MVT::Other)); 7121 assert(TF.getNode() != NewResChain.getNode() && 7122 "A new TF really is required here"); 7123 7124 DAG.ReplaceAllUsesOfValueWith(ResChain, TF); 7125 DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain); 7126 } 7127 7128 /// Analyze profitability of direct move 7129 /// prefer float load to int load plus direct move 7130 /// when there is no integer use of int load 7131 bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const { 7132 SDNode *Origin = Op.getOperand(0).getNode(); 7133 if (Origin->getOpcode() != ISD::LOAD) 7134 return true; 7135 7136 // If there is no LXSIBZX/LXSIHZX, like Power8, 7137 // prefer direct move if the memory size is 1 or 2 bytes. 7138 MachineMemOperand *MMO = cast<LoadSDNode>(Origin)->getMemOperand(); 7139 if (!Subtarget.hasP9Vector() && MMO->getSize() <= 2) 7140 return true; 7141 7142 for (SDNode::use_iterator UI = Origin->use_begin(), 7143 UE = Origin->use_end(); 7144 UI != UE; ++UI) { 7145 7146 // Only look at the users of the loaded value. 7147 if (UI.getUse().get().getResNo() != 0) 7148 continue; 7149 7150 if (UI->getOpcode() != ISD::SINT_TO_FP && 7151 UI->getOpcode() != ISD::UINT_TO_FP) 7152 return true; 7153 } 7154 7155 return false; 7156 } 7157 7158 /// Custom lowers integer to floating point conversions to use 7159 /// the direct move instructions available in ISA 2.07 to avoid the 7160 /// need for load/store combinations. 
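/// For example, a signed i32 -> f64 conversion becomes, in DAG terms,
///   (PPCISD::FCFID (PPCISD::MTVSRA %src))
/// instead of a store to a stack temporary followed by an FPR load.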
7161 SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op, 7162 SelectionDAG &DAG, 7163 const SDLoc &dl) const { 7164 assert((Op.getValueType() == MVT::f32 || 7165 Op.getValueType() == MVT::f64) && 7166 "Invalid floating point type as target of conversion"); 7167 assert(Subtarget.hasFPCVT() && 7168 "Int to FP conversions with direct moves require FPCVT"); 7169 SDValue FP; 7170 SDValue Src = Op.getOperand(0); 7171 bool SinglePrec = Op.getValueType() == MVT::f32; 7172 bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32; 7173 bool Signed = Op.getOpcode() == ISD::SINT_TO_FP; 7174 unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) : 7175 (SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU); 7176 7177 if (WordInt) { 7178 FP = DAG.getNode(Signed ? PPCISD::MTVSRA : PPCISD::MTVSRZ, 7179 dl, MVT::f64, Src); 7180 FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); 7181 } 7182 else { 7183 FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src); 7184 FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); 7185 } 7186 7187 return FP; 7188 } 7189 7190 static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) { 7191 7192 EVT VecVT = Vec.getValueType(); 7193 assert(VecVT.isVector() && "Expected a vector type."); 7194 assert(VecVT.getSizeInBits() < 128 && "Vector is already full width."); 7195 7196 EVT EltVT = VecVT.getVectorElementType(); 7197 unsigned WideNumElts = 128 / EltVT.getSizeInBits(); 7198 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts); 7199 7200 unsigned NumConcat = WideNumElts / VecVT.getVectorNumElements(); 7201 SmallVector<SDValue, 16> Ops(NumConcat); 7202 Ops[0] = Vec; 7203 SDValue UndefVec = DAG.getUNDEF(VecVT); 7204 for (unsigned i = 1; i < NumConcat; ++i) 7205 Ops[i] = UndefVec; 7206 7207 return DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Ops); 7208 } 7209 7210 SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG, 7211 const SDLoc &dl) const { 7212 7213 unsigned Opc = Op.getOpcode(); 7214 assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP) && 7215 "Unexpected conversion type"); 7216 assert((Op.getValueType() == MVT::v2f64 || Op.getValueType() == MVT::v4f32) && 7217 "Supports conversions to v2f64/v4f32 only."); 7218 7219 bool SignedConv = Opc == ISD::SINT_TO_FP; 7220 bool FourEltRes = Op.getValueType() == MVT::v4f32; 7221 7222 SDValue Wide = widenVec(DAG, Op.getOperand(0), dl); 7223 EVT WideVT = Wide.getValueType(); 7224 unsigned WideNumElts = WideVT.getVectorNumElements(); 7225 MVT IntermediateVT = FourEltRes ? MVT::v4i32 : MVT::v2i64; 7226 7227 SmallVector<int, 16> ShuffV; 7228 for (unsigned i = 0; i < WideNumElts; ++i) 7229 ShuffV.push_back(i + WideNumElts); 7230 7231 int Stride = FourEltRes ? WideNumElts / 4 : WideNumElts / 2; 7232 int SaveElts = FourEltRes ? 4 : 2; 7233 if (Subtarget.isLittleEndian()) 7234 for (int i = 0; i < SaveElts; i++) 7235 ShuffV[i * Stride] = i; 7236 else 7237 for (int i = 1; i <= SaveElts; i++) 7238 ShuffV[i * Stride - 1] = i - 1; 7239 7240 SDValue ShuffleSrc2 = 7241 SignedConv ? DAG.getUNDEF(WideVT) : DAG.getConstant(0, dl, WideVT); 7242 SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV); 7243 unsigned ExtendOp = 7244 SignedConv ? 
(unsigned)PPCISD::SExtVElems : (unsigned)ISD::BITCAST; 7245 7246 SDValue Extend; 7247 if (!Subtarget.hasP9Altivec() && SignedConv) { 7248 Arrange = DAG.getBitcast(IntermediateVT, Arrange); 7249 Extend = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, IntermediateVT, Arrange, 7250 DAG.getValueType(Op.getOperand(0).getValueType())); 7251 } else 7252 Extend = DAG.getNode(ExtendOp, dl, IntermediateVT, Arrange); 7253 7254 return DAG.getNode(Opc, dl, Op.getValueType(), Extend); 7255 } 7256 7257 SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, 7258 SelectionDAG &DAG) const { 7259 SDLoc dl(Op); 7260 7261 EVT InVT = Op.getOperand(0).getValueType(); 7262 EVT OutVT = Op.getValueType(); 7263 if (OutVT.isVector() && OutVT.isFloatingPoint() && 7264 isOperationCustom(Op.getOpcode(), InVT)) 7265 return LowerINT_TO_FPVector(Op, DAG, dl); 7266 7267 // Conversions to f128 are legal. 7268 if (EnableQuadPrecision && (Op.getValueType() == MVT::f128)) 7269 return Op; 7270 7271 if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) { 7272 if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64) 7273 return SDValue(); 7274 7275 SDValue Value = Op.getOperand(0); 7276 // The values are now known to be -1 (false) or 1 (true). To convert this 7277 // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). 7278 // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 7279 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); 7280 7281 SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); 7282 7283 Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); 7284 7285 if (Op.getValueType() != MVT::v4f64) 7286 Value = DAG.getNode(ISD::FP_ROUND, dl, 7287 Op.getValueType(), Value, 7288 DAG.getIntPtrConstant(1, dl)); 7289 return Value; 7290 } 7291 7292 // Don't handle ppc_fp128 here; let it be lowered to a libcall. 7293 if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) 7294 return SDValue(); 7295 7296 if (Op.getOperand(0).getValueType() == MVT::i1) 7297 return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0), 7298 DAG.getConstantFP(1.0, dl, Op.getValueType()), 7299 DAG.getConstantFP(0.0, dl, Op.getValueType())); 7300 7301 // If we have direct moves, we can do all the conversion, skip the store/load 7302 // however, without FPCVT we can't do most conversions. 7303 if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) && 7304 Subtarget.isPPC64() && Subtarget.hasFPCVT()) 7305 return LowerINT_TO_FPDirectMove(Op, DAG, dl); 7306 7307 assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && 7308 "UINT_TO_FP is supported only with FPCVT"); 7309 7310 // If we have FCFIDS, then use it when converting to single-precision. 7311 // Otherwise, convert to double-precision and then round. 7312 unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 7313 ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS 7314 : PPCISD::FCFIDS) 7315 : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU 7316 : PPCISD::FCFID); 7317 MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 7318 ? MVT::f32 7319 : MVT::f64; 7320 7321 if (Op.getOperand(0).getValueType() == MVT::i64) { 7322 SDValue SINT = Op.getOperand(0); 7323 // When converting to single-precision, we actually need to convert 7324 // to double-precision first and then round to single-precision. 7325 // To avoid double-rounding effects during that operation, we have 7326 // to prepare the input operand. 
Bits that might be truncated when 7327 // converting to double-precision are replaced by a bit that won't 7328 // be lost at this stage, but is below the single-precision rounding 7329 // position. 7330 // 7331 // However, if -enable-unsafe-fp-math is in effect, accept double 7332 // rounding to avoid the extra overhead. 7333 if (Op.getValueType() == MVT::f32 && 7334 !Subtarget.hasFPCVT() && 7335 !DAG.getTarget().Options.UnsafeFPMath) { 7336 7337 // Twiddle input to make sure the low 11 bits are zero. (If this 7338 // is the case, we are guaranteed the value will fit into the 53 bit 7339 // mantissa of an IEEE double-precision value without rounding.) 7340 // If any of those low 11 bits were not zero originally, make sure 7341 // bit 12 (value 2048) is set instead, so that the final rounding 7342 // to single-precision gets the correct result. 7343 SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64, 7344 SINT, DAG.getConstant(2047, dl, MVT::i64)); 7345 Round = DAG.getNode(ISD::ADD, dl, MVT::i64, 7346 Round, DAG.getConstant(2047, dl, MVT::i64)); 7347 Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT); 7348 Round = DAG.getNode(ISD::AND, dl, MVT::i64, 7349 Round, DAG.getConstant(-2048, dl, MVT::i64)); 7350 7351 // However, we cannot use that value unconditionally: if the magnitude 7352 // of the input value is small, the bit-twiddling we did above might 7353 // end up visibly changing the output. Fortunately, in that case, we 7354 // don't need to twiddle bits since the original input will convert 7355 // exactly to double-precision floating-point already. Therefore, 7356 // construct a conditional to use the original value if the top 11 7357 // bits are all sign-bit copies, and use the rounded value computed 7358 // above otherwise. 7359 SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64, 7360 SINT, DAG.getConstant(53, dl, MVT::i32)); 7361 Cond = DAG.getNode(ISD::ADD, dl, MVT::i64, 7362 Cond, DAG.getConstant(1, dl, MVT::i64)); 7363 Cond = DAG.getSetCC(dl, MVT::i32, 7364 Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT); 7365 7366 SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT); 7367 } 7368 7369 ReuseLoadInfo RLI; 7370 SDValue Bits; 7371 7372 MachineFunction &MF = DAG.getMachineFunction(); 7373 if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) { 7374 Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI, 7375 RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges); 7376 spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); 7377 } else if (Subtarget.hasLFIWAX() && 7378 canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) { 7379 MachineMemOperand *MMO = 7380 MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, 7381 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7382 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7383 Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl, 7384 DAG.getVTList(MVT::f64, MVT::Other), 7385 Ops, MVT::i32, MMO); 7386 spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); 7387 } else if (Subtarget.hasFPCVT() && 7388 canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) { 7389 MachineMemOperand *MMO = 7390 MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, 7391 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7392 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7393 Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl, 7394 DAG.getVTList(MVT::f64, MVT::Other), 7395 Ops, MVT::i32, MMO); 7396 spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG); 7397 } else if (((Subtarget.hasLFIWAX() && 7398 SINT.getOpcode() == ISD::SIGN_EXTEND) || 7399 
(Subtarget.hasFPCVT() && 7400 SINT.getOpcode() == ISD::ZERO_EXTEND)) && 7401 SINT.getOperand(0).getValueType() == MVT::i32) { 7402 MachineFrameInfo &MFI = MF.getFrameInfo(); 7403 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 7404 7405 int FrameIdx = MFI.CreateStackObject(4, 4, false); 7406 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 7407 7408 SDValue Store = 7409 DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, 7410 MachinePointerInfo::getFixedStack( 7411 DAG.getMachineFunction(), FrameIdx)); 7412 7413 assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && 7414 "Expected an i32 store"); 7415 7416 RLI.Ptr = FIdx; 7417 RLI.Chain = Store; 7418 RLI.MPI = 7419 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 7420 RLI.Alignment = 4; 7421 7422 MachineMemOperand *MMO = 7423 MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, 7424 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7425 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7426 Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ? 7427 PPCISD::LFIWZX : PPCISD::LFIWAX, 7428 dl, DAG.getVTList(MVT::f64, MVT::Other), 7429 Ops, MVT::i32, MMO); 7430 } else 7431 Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); 7432 7433 SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); 7434 7435 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) 7436 FP = DAG.getNode(ISD::FP_ROUND, dl, 7437 MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); 7438 return FP; 7439 } 7440 7441 assert(Op.getOperand(0).getValueType() == MVT::i32 && 7442 "Unhandled INT_TO_FP type in custom expander!"); 7443 // Since we only generate this in 64-bit mode, we can take advantage of 7444 // 64-bit registers. In particular, sign extend the input value into the 7445 // 64-bit register with extsw, store the WHOLE 64-bit value into the stack 7446 // then lfd it and fcfid it. 7447 MachineFunction &MF = DAG.getMachineFunction(); 7448 MachineFrameInfo &MFI = MF.getFrameInfo(); 7449 EVT PtrVT = getPointerTy(MF.getDataLayout()); 7450 7451 SDValue Ld; 7452 if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { 7453 ReuseLoadInfo RLI; 7454 bool ReusingLoad; 7455 if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI, 7456 DAG))) { 7457 int FrameIdx = MFI.CreateStackObject(4, 4, false); 7458 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 7459 7460 SDValue Store = 7461 DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, 7462 MachinePointerInfo::getFixedStack( 7463 DAG.getMachineFunction(), FrameIdx)); 7464 7465 assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && 7466 "Expected an i32 store"); 7467 7468 RLI.Ptr = FIdx; 7469 RLI.Chain = Store; 7470 RLI.MPI = 7471 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 7472 RLI.Alignment = 4; 7473 } 7474 7475 MachineMemOperand *MMO = 7476 MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, 7477 RLI.Alignment, RLI.AAInfo, RLI.Ranges); 7478 SDValue Ops[] = { RLI.Chain, RLI.Ptr }; 7479 Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ? 
7480 PPCISD::LFIWZX : PPCISD::LFIWAX, 7481 dl, DAG.getVTList(MVT::f64, MVT::Other), 7482 Ops, MVT::i32, MMO); 7483 if (ReusingLoad) 7484 spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG); 7485 } else { 7486 assert(Subtarget.isPPC64() && 7487 "i32->FP without LFIWAX supported only on PPC64"); 7488 7489 int FrameIdx = MFI.CreateStackObject(4, 4, false); 7490 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 7491 7492 SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, 7493 Op.getOperand(0)); 7494 7495 // STD the extended value into the stack slot. 7496 SDValue Store = DAG.getStore( 7497 DAG.getEntryNode(), dl, Ext64, FIdx, 7498 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); 7499 7500 // Load the value as a double. 7501 Ld = DAG.getLoad( 7502 MVT::f64, dl, Store, FIdx, 7503 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); 7504 } 7505 7506 // FCFID it and return it. 7507 SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld); 7508 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) 7509 FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, 7510 DAG.getIntPtrConstant(0, dl)); 7511 return FP; 7512 } 7513 7514 SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 7515 SelectionDAG &DAG) const { 7516 SDLoc dl(Op); 7517 /* 7518 The rounding mode is in bits 30:31 of FPSCR, and has the following 7519 settings: 7520 00 Round to nearest 7521 01 Round to 0 7522 10 Round to +inf 7523 11 Round to -inf 7524 7525 FLT_ROUNDS, on the other hand, expects the following: 7526 -1 Undefined 7527 0 Round to 0 7528 1 Round to nearest 7529 2 Round to +inf 7530 3 Round to -inf 7531 7532 To perform the conversion, we do: 7533 ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1)) 7534 */ 7535 7536 MachineFunction &MF = DAG.getMachineFunction(); 7537 EVT VT = Op.getValueType(); 7538 EVT PtrVT = getPointerTy(MF.getDataLayout()); 7539 7540 // Save FP Control Word to register 7541 EVT NodeTys[] = { 7542 MVT::f64, // return register 7543 MVT::Glue // unused in this context 7544 }; 7545 SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None); 7546 7547 // Save FP register to stack slot 7548 int SSFI = MF.getFrameInfo().CreateStackObject(8, 8, false); 7549 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); 7550 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot, 7551 MachinePointerInfo()); 7552 7553 // Load FP Control Word from low 32 bits of stack slot. 7554 SDValue Four = DAG.getConstant(4, dl, PtrVT); 7555 SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four); 7556 SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo()); 7557 7558 // Transform as necessary 7559 SDValue CWD1 = 7560 DAG.getNode(ISD::AND, dl, MVT::i32, 7561 CWD, DAG.getConstant(3, dl, MVT::i32)); 7562 SDValue CWD2 = 7563 DAG.getNode(ISD::SRL, dl, MVT::i32, 7564 DAG.getNode(ISD::AND, dl, MVT::i32, 7565 DAG.getNode(ISD::XOR, dl, MVT::i32, 7566 CWD, DAG.getConstant(3, dl, MVT::i32)), 7567 DAG.getConstant(3, dl, MVT::i32)), 7568 DAG.getConstant(1, dl, MVT::i32)); 7569 7570 SDValue RetVal = 7571 DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2); 7572 7573 return DAG.getNode((VT.getSizeInBits() < 16 ?
7574 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 7575 } 7576 7577 SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { 7578 EVT VT = Op.getValueType(); 7579 unsigned BitWidth = VT.getSizeInBits(); 7580 SDLoc dl(Op); 7581 assert(Op.getNumOperands() == 3 && 7582 VT == Op.getOperand(1).getValueType() && 7583 "Unexpected SHL!"); 7584 7585 // Expand into a bunch of logical ops. Note that these ops 7586 // depend on the PPC behavior for oversized shift amounts. 7587 SDValue Lo = Op.getOperand(0); 7588 SDValue Hi = Op.getOperand(1); 7589 SDValue Amt = Op.getOperand(2); 7590 EVT AmtVT = Amt.getValueType(); 7591 7592 SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, 7593 DAG.getConstant(BitWidth, dl, AmtVT), Amt); 7594 SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt); 7595 SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1); 7596 SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3); 7597 SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, 7598 DAG.getConstant(-BitWidth, dl, AmtVT)); 7599 SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5); 7600 SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); 7601 SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt); 7602 SDValue OutOps[] = { OutLo, OutHi }; 7603 return DAG.getMergeValues(OutOps, dl); 7604 } 7605 7606 SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { 7607 EVT VT = Op.getValueType(); 7608 SDLoc dl(Op); 7609 unsigned BitWidth = VT.getSizeInBits(); 7610 assert(Op.getNumOperands() == 3 && 7611 VT == Op.getOperand(1).getValueType() && 7612 "Unexpected SRL!"); 7613 7614 // Expand into a bunch of logical ops. Note that these ops 7615 // depend on the PPC behavior for oversized shift amounts. 7616 SDValue Lo = Op.getOperand(0); 7617 SDValue Hi = Op.getOperand(1); 7618 SDValue Amt = Op.getOperand(2); 7619 EVT AmtVT = Amt.getValueType(); 7620 7621 SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, 7622 DAG.getConstant(BitWidth, dl, AmtVT), Amt); 7623 SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); 7624 SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); 7625 SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); 7626 SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, 7627 DAG.getConstant(-BitWidth, dl, AmtVT)); 7628 SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5); 7629 SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); 7630 SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt); 7631 SDValue OutOps[] = { OutLo, OutHi }; 7632 return DAG.getMergeValues(OutOps, dl); 7633 } 7634 7635 SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { 7636 SDLoc dl(Op); 7637 EVT VT = Op.getValueType(); 7638 unsigned BitWidth = VT.getSizeInBits(); 7639 assert(Op.getNumOperands() == 3 && 7640 VT == Op.getOperand(1).getValueType() && 7641 "Unexpected SRA!"); 7642 7643 // Expand into a bunch of logical ops, followed by a select_cc. 
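// While Amt <= BitWidth (i.e. Tmp5 <= 0 below), the low word is
// (Lo >> Amt) | (Hi << (BitWidth - Amt)); once Amt exceeds BitWidth, it is
// instead Hi shifted arithmetically by Amt - BitWidth. The final select_cc
// on Tmp5 picks between those two cases.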
7644 SDValue Lo = Op.getOperand(0); 7645 SDValue Hi = Op.getOperand(1); 7646 SDValue Amt = Op.getOperand(2); 7647 EVT AmtVT = Amt.getValueType(); 7648 7649 SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, 7650 DAG.getConstant(BitWidth, dl, AmtVT), Amt); 7651 SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); 7652 SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); 7653 SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); 7654 SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, 7655 DAG.getConstant(-BitWidth, dl, AmtVT)); 7656 SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5); 7657 SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt); 7658 SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT), 7659 Tmp4, Tmp6, ISD::SETLE); 7660 SDValue OutOps[] = { OutLo, OutHi }; 7661 return DAG.getMergeValues(OutOps, dl); 7662 } 7663 7664 //===----------------------------------------------------------------------===// 7665 // Vector related lowering. 7666 // 7667 7668 /// BuildSplatI - Build a canonical splati of Val with an element size of 7669 /// SplatSize. Cast the result to VT. 7670 static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT, 7671 SelectionDAG &DAG, const SDLoc &dl) { 7672 assert(Val >= -16 && Val <= 15 && "vsplti is out of range!"); 7673 7674 static const MVT VTys[] = { // canonical VT to use for each size. 7675 MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32 7676 }; 7677 7678 EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1]; 7679 7680 // Force vspltis[hw] -1 to vspltisb -1 to canonicalize. 7681 if (Val == -1) 7682 SplatSize = 1; 7683 7684 EVT CanonicalVT = VTys[SplatSize-1]; 7685 7686 // Build a canonical splat for this value. 7687 return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT)); 7688 } 7689 7690 /// BuildIntrinsicOp - Return a unary operator intrinsic node with the 7691 /// specified intrinsic ID. 7692 static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG, 7693 const SDLoc &dl, EVT DestVT = MVT::Other) { 7694 if (DestVT == MVT::Other) DestVT = Op.getValueType(); 7695 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, 7696 DAG.getConstant(IID, dl, MVT::i32), Op); 7697 } 7698 7699 /// BuildIntrinsicOp - Return a binary operator intrinsic node with the 7700 /// specified intrinsic ID. 7701 static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS, 7702 SelectionDAG &DAG, const SDLoc &dl, 7703 EVT DestVT = MVT::Other) { 7704 if (DestVT == MVT::Other) DestVT = LHS.getValueType(); 7705 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, 7706 DAG.getConstant(IID, dl, MVT::i32), LHS, RHS); 7707 } 7708 7709 /// BuildIntrinsicOp - Return a ternary operator intrinsic node with the 7710 /// specified intrinsic ID. 7711 static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1, 7712 SDValue Op2, SelectionDAG &DAG, const SDLoc &dl, 7713 EVT DestVT = MVT::Other) { 7714 if (DestVT == MVT::Other) DestVT = Op0.getValueType(); 7715 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, 7716 DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2); 7717 } 7718 7719 /// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified 7720 /// amount. The result has the specified value type. 7721 static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT, 7722 SelectionDAG &DAG, const SDLoc &dl) { 7723 // Force LHS/RHS to be the right type. 
7724 LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS); 7725 RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS); 7726 7727 int Ops[16]; 7728 for (unsigned i = 0; i != 16; ++i) 7729 Ops[i] = i + Amt; 7730 SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops); 7731 return DAG.getNode(ISD::BITCAST, dl, VT, T); 7732 } 7733 7734 /// Do we have an efficient pattern in a .td file for this node? 7735 /// 7736 /// \param V - pointer to the BuildVectorSDNode being matched 7737 /// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves? 7738 /// 7739 /// There are some patterns where it is beneficial to keep a BUILD_VECTOR 7740 /// node as a BUILD_VECTOR node rather than expanding it. The patterns where 7741 /// the opposite is true (expansion is beneficial) are: 7742 /// - The node builds a vector out of integers that are not 32 or 64-bits 7743 /// - The node builds a vector out of constants 7744 /// - The node is a "load-and-splat" 7745 /// In all other cases, we will choose to keep the BUILD_VECTOR. 7746 static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V, 7747 bool HasDirectMove, 7748 bool HasP8Vector) { 7749 EVT VecVT = V->getValueType(0); 7750 bool RightType = VecVT == MVT::v2f64 || 7751 (HasP8Vector && VecVT == MVT::v4f32) || 7752 (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32)); 7753 if (!RightType) 7754 return false; 7755 7756 bool IsSplat = true; 7757 bool IsLoad = false; 7758 SDValue Op0 = V->getOperand(0); 7759 7760 // This function is called in a block that confirms the node is not a constant 7761 // splat. So a constant BUILD_VECTOR here means the vector is built out of 7762 // different constants. 7763 if (V->isConstant()) 7764 return false; 7765 for (int i = 0, e = V->getNumOperands(); i < e; ++i) { 7766 if (V->getOperand(i).isUndef()) 7767 return false; 7768 // We want to expand nodes that represent load-and-splat even if the 7769 // loaded value is a floating point truncation or conversion to int. 7770 if (V->getOperand(i).getOpcode() == ISD::LOAD || 7771 (V->getOperand(i).getOpcode() == ISD::FP_ROUND && 7772 V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) || 7773 (V->getOperand(i).getOpcode() == ISD::FP_TO_SINT && 7774 V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) || 7775 (V->getOperand(i).getOpcode() == ISD::FP_TO_UINT && 7776 V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD)) 7777 IsLoad = true; 7778 // If the operands are different or the input is not a load and has more 7779 // uses than just this BV node, then it isn't a splat. 7780 if (V->getOperand(i) != Op0 || 7781 (!IsLoad && !V->isOnlyUserOf(V->getOperand(i).getNode()))) 7782 IsSplat = false; 7783 } 7784 return !(IsSplat && IsLoad); 7785 } 7786 7787 // Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128. 7788 SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { 7789 7790 SDLoc dl(Op); 7791 SDValue Op0 = Op->getOperand(0); 7792 7793 if (!EnableQuadPrecision || 7794 (Op.getValueType() != MVT::f128 ) || 7795 (Op0.getOpcode() != ISD::BUILD_PAIR) || 7796 (Op0.getOperand(0).getValueType() != MVT::i64) || 7797 (Op0.getOperand(1).getValueType() != MVT::i64)) 7798 return SDValue(); 7799 7800 return DAG.getNode(PPCISD::BUILD_FP128, dl, MVT::f128, Op0.getOperand(0), 7801 Op0.getOperand(1)); 7802 } 7803 7804 // If this is a case we can't handle, return null and let the default 7805 // expansion code take care of it. If we CAN select this case, and if it 7806 // selects to a single instruction, return Op. 
Otherwise, if we can codegen 7807 // this case more efficiently than a constant pool load, lower it to the 7808 // sequence of ops that should be used. 7809 SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, 7810 SelectionDAG &DAG) const { 7811 SDLoc dl(Op); 7812 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); 7813 assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); 7814 7815 if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) { 7816 // We first build an i32 vector, load it into a QPX register, 7817 // then convert it to a floating-point vector and compare it 7818 // to a zero vector to get the boolean result. 7819 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 7820 int FrameIdx = MFI.CreateStackObject(16, 16, false); 7821 MachinePointerInfo PtrInfo = 7822 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 7823 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 7824 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 7825 7826 assert(BVN->getNumOperands() == 4 && 7827 "BUILD_VECTOR for v4i1 does not have 4 operands"); 7828 7829 bool IsConst = true; 7830 for (unsigned i = 0; i < 4; ++i) { 7831 if (BVN->getOperand(i).isUndef()) continue; 7832 if (!isa<ConstantSDNode>(BVN->getOperand(i))) { 7833 IsConst = false; 7834 break; 7835 } 7836 } 7837 7838 if (IsConst) { 7839 Constant *One = 7840 ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0); 7841 Constant *NegOne = 7842 ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0); 7843 7844 Constant *CV[4]; 7845 for (unsigned i = 0; i < 4; ++i) { 7846 if (BVN->getOperand(i).isUndef()) 7847 CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext())); 7848 else if (isNullConstant(BVN->getOperand(i))) 7849 CV[i] = NegOne; 7850 else 7851 CV[i] = One; 7852 } 7853 7854 Constant *CP = ConstantVector::get(CV); 7855 SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), 7856 16 /* alignment */); 7857 7858 SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; 7859 SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other}); 7860 return DAG.getMemIntrinsicNode( 7861 PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32, 7862 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 7863 } 7864 7865 SmallVector<SDValue, 4> Stores; 7866 for (unsigned i = 0; i < 4; ++i) { 7867 if (BVN->getOperand(i).isUndef()) continue; 7868 7869 unsigned Offset = 4*i; 7870 SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); 7871 Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); 7872 7873 unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize(); 7874 if (StoreSize > 4) { 7875 Stores.push_back( 7876 DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx, 7877 PtrInfo.getWithOffset(Offset), MVT::i32)); 7878 } else { 7879 SDValue StoreValue = BVN->getOperand(i); 7880 if (StoreSize < 4) 7881 StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue); 7882 7883 Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx, 7884 PtrInfo.getWithOffset(Offset))); 7885 } 7886 } 7887 7888 SDValue StoreChain; 7889 if (!Stores.empty()) 7890 StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); 7891 else 7892 StoreChain = DAG.getEntryNode(); 7893 7894 // Now load from v4i32 into the QPX register; this will extend it to 7895 // v4i64 but not yet convert it to a floating point. Nevertheless, this 7896 // is typed as v4f64 because the QPX register integer states are not 7897 // explicitly represented. 
7898 7899 SDValue Ops[] = {StoreChain, 7900 DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32), 7901 FIdx}; 7902 SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other}); 7903 7904 SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, 7905 dl, VTs, Ops, MVT::v4i32, PtrInfo); 7906 LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, 7907 DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32), 7908 LoadedVect); 7909 7910 SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64); 7911 7912 return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ); 7913 } 7914 7915 // All other QPX vectors are handled by generic code. 7916 if (Subtarget.hasQPX()) 7917 return SDValue(); 7918 7919 // Check if this is a splat of a constant value. 7920 APInt APSplatBits, APSplatUndef; 7921 unsigned SplatBitSize; 7922 bool HasAnyUndefs; 7923 if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, 7924 HasAnyUndefs, 0, !Subtarget.isLittleEndian()) || 7925 SplatBitSize > 32) { 7926 // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be 7927 // lowered to VSX instructions under certain conditions. 7928 // Without VSX, there is no pattern more efficient than expanding the node. 7929 if (Subtarget.hasVSX() && 7930 haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(), 7931 Subtarget.hasP8Vector())) 7932 return Op; 7933 return SDValue(); 7934 } 7935 7936 unsigned SplatBits = APSplatBits.getZExtValue(); 7937 unsigned SplatUndef = APSplatUndef.getZExtValue(); 7938 unsigned SplatSize = SplatBitSize / 8; 7939 7940 // First, handle single instruction cases. 7941 7942 // All zeros? 7943 if (SplatBits == 0) { 7944 // Canonicalize all zero vectors to be v4i32. 7945 if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) { 7946 SDValue Z = DAG.getConstant(0, dl, MVT::v4i32); 7947 Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z); 7948 } 7949 return Op; 7950 } 7951 7952 // We have XXSPLTIB for constant splats one byte wide 7953 if (Subtarget.hasP9Vector() && SplatSize == 1) { 7954 // This is a splat of 1-byte elements with some elements potentially undef. 7955 // Rather than trying to match undef in the SDAG patterns, ensure that all 7956 // elements are the same constant. 7957 if (HasAnyUndefs || ISD::isBuildVectorAllOnes(BVN)) { 7958 SmallVector<SDValue, 16> Ops(16, DAG.getConstant(SplatBits, 7959 dl, MVT::i32)); 7960 SDValue NewBV = DAG.getBuildVector(MVT::v16i8, dl, Ops); 7961 if (Op.getValueType() != MVT::v16i8) 7962 return DAG.getBitcast(Op.getValueType(), NewBV); 7963 return NewBV; 7964 } 7965 7966 // BuildVectorSDNode::isConstantSplat() is actually pretty smart. It'll 7967 // detect that constant splats like v8i16: 0xABAB are really just splats 7968 // of a 1-byte constant. In this case, we need to convert the node to a 7969 // splat of v16i8 and a bitcast. 7970 if (Op.getValueType() != MVT::v16i8) 7971 return DAG.getBitcast(Op.getValueType(), 7972 DAG.getConstant(SplatBits, dl, MVT::v16i8)); 7973 7974 return Op; 7975 } 7976 7977 // If the sign extended value is in the range [-16,15], use VSPLTI[bhw]. 7978 int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >> 7979 (32-SplatBitSize)); 7980 if (SextVal >= -16 && SextVal <= 15) 7981 return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl); 7982 7983 // Two instruction sequences. 
7984 7985 // If this value is in the range [-32,30] and is even, use: 7986 // VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2) 7987 // If this value is in the range [17,31] and is odd, use: 7988 // VSPLTI[bhw](val-16) - VSPLTI[bhw](-16) 7989 // If this value is in the range [-31,-17] and is odd, use: 7990 // VSPLTI[bhw](val+16) + VSPLTI[bhw](-16) 7991 // Note the last two are three-instruction sequences. 7992 if (SextVal >= -32 && SextVal <= 31) { 7993 // To avoid having these optimizations undone by constant folding, 7994 // we convert to a pseudo that will be expanded later into one of 7995 // the above forms. 7996 SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32); 7997 EVT VT = (SplatSize == 1 ? MVT::v16i8 : 7998 (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32)); 7999 SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32); 8000 SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize); 8001 if (VT == Op.getValueType()) 8002 return RetVal; 8003 else 8004 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal); 8005 } 8006 8007 // If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is 8008 // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important 8009 // for fneg/fabs. 8010 if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) { 8011 // Make -1 and vspltisw -1: 8012 SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl); 8013 8014 // Make the VSLW intrinsic, computing 0x8000_0000. 8015 SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV, 8016 OnesV, DAG, dl); 8017 8018 // xor by OnesV to invert it. 8019 Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV); 8020 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); 8021 } 8022 8023 // Check to see if this is a wide variety of vsplti*, binop self cases. 8024 static const signed char SplatCsts[] = { 8025 -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, 8026 -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16 8027 }; 8028 8029 for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) { 8030 // Indirect through the SplatCsts array so that we favor 'vsplti -1' for 8031 // cases which are ambiguous (e.g. formation of 0x8000_0000). 'vsplti -1' 8032 int i = SplatCsts[idx]; 8033 8034 // Figure out what shift amount will be used by altivec if shifted by i in 8035 // this splat size. 8036 unsigned TypeShiftAmt = i & (SplatBitSize-1); 8037 8038 // vsplti + shl self. 8039 if (SextVal == (int)((unsigned)i << TypeShiftAmt)) { 8040 SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); 8041 static const unsigned IIDs[] = { // Intrinsic to use for each size. 8042 Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0, 8043 Intrinsic::ppc_altivec_vslw 8044 }; 8045 Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); 8046 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); 8047 } 8048 8049 // vsplti + srl self. 8050 if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { 8051 SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); 8052 static const unsigned IIDs[] = { // Intrinsic to use for each size. 8053 Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0, 8054 Intrinsic::ppc_altivec_vsrw 8055 }; 8056 Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); 8057 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); 8058 } 8059 8060 // vsplti + sra self. 
8061 if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { 8062 SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); 8063 static const unsigned IIDs[] = { // Intrinsic to use for each size. 8064 Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0, 8065 Intrinsic::ppc_altivec_vsraw 8066 }; 8067 Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); 8068 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); 8069 } 8070 8071 // vsplti + rol self. 8072 if (SextVal == (int)(((unsigned)i << TypeShiftAmt) | 8073 ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) { 8074 SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); 8075 static const unsigned IIDs[] = { // Intrinsic to use for each size. 8076 Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0, 8077 Intrinsic::ppc_altivec_vrlw 8078 }; 8079 Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); 8080 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); 8081 } 8082 8083 // t = vsplti c, result = vsldoi t, t, 1 8084 if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) { 8085 SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); 8086 unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1; 8087 return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); 8088 } 8089 // t = vsplti c, result = vsldoi t, t, 2 8090 if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) { 8091 SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); 8092 unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2; 8093 return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); 8094 } 8095 // t = vsplti c, result = vsldoi t, t, 3 8096 if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) { 8097 SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); 8098 unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3; 8099 return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl); 8100 } 8101 } 8102 8103 return SDValue(); 8104 } 8105 8106 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 8107 /// the specified operations to build the shuffle. 
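/// Each PFEntry packs: bits 31-30 the cost consulted by the caller, bits
/// 29-26 the operation (the OpNum enum below), bits 25-13 the LHS table
/// index and bits 12-0 the RHS table index. The indices are base-9 encodings
/// of four 4-byte element selectors, where digits 0-7 select an input element
/// and 8 means undef.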
8108 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 8109 SDValue RHS, SelectionDAG &DAG, 8110 const SDLoc &dl) { 8111 unsigned OpNum = (PFEntry >> 26) & 0x0F; 8112 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 8113 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 8114 8115 enum { 8116 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 8117 OP_VMRGHW, 8118 OP_VMRGLW, 8119 OP_VSPLTISW0, 8120 OP_VSPLTISW1, 8121 OP_VSPLTISW2, 8122 OP_VSPLTISW3, 8123 OP_VSLDOI4, 8124 OP_VSLDOI8, 8125 OP_VSLDOI12 8126 }; 8127 8128 if (OpNum == OP_COPY) { 8129 if (LHSID == (1*9+2)*9+3) return LHS; 8130 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 8131 return RHS; 8132 } 8133 8134 SDValue OpLHS, OpRHS; 8135 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 8136 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 8137 8138 int ShufIdxs[16]; 8139 switch (OpNum) { 8140 default: llvm_unreachable("Unknown i32 permute!"); 8141 case OP_VMRGHW: 8142 ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3; 8143 ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19; 8144 ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7; 8145 ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23; 8146 break; 8147 case OP_VMRGLW: 8148 ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11; 8149 ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27; 8150 ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15; 8151 ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31; 8152 break; 8153 case OP_VSPLTISW0: 8154 for (unsigned i = 0; i != 16; ++i) 8155 ShufIdxs[i] = (i&3)+0; 8156 break; 8157 case OP_VSPLTISW1: 8158 for (unsigned i = 0; i != 16; ++i) 8159 ShufIdxs[i] = (i&3)+4; 8160 break; 8161 case OP_VSPLTISW2: 8162 for (unsigned i = 0; i != 16; ++i) 8163 ShufIdxs[i] = (i&3)+8; 8164 break; 8165 case OP_VSPLTISW3: 8166 for (unsigned i = 0; i != 16; ++i) 8167 ShufIdxs[i] = (i&3)+12; 8168 break; 8169 case OP_VSLDOI4: 8170 return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl); 8171 case OP_VSLDOI8: 8172 return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl); 8173 case OP_VSLDOI12: 8174 return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl); 8175 } 8176 EVT VT = OpLHS.getValueType(); 8177 OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS); 8178 OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS); 8179 SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs); 8180 return DAG.getNode(ISD::BITCAST, dl, VT, T); 8181 } 8182 8183 /// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled 8184 /// by the VINSERTB instruction introduced in ISA 3.0, else just return default 8185 /// SDValue. 8186 SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N, 8187 SelectionDAG &DAG) const { 8188 const unsigned BytesInVector = 16; 8189 bool IsLE = Subtarget.isLittleEndian(); 8190 SDLoc dl(N); 8191 SDValue V1 = N->getOperand(0); 8192 SDValue V2 = N->getOperand(1); 8193 unsigned ShiftElts = 0, InsertAtByte = 0; 8194 bool Swap = false; 8195 8196 // Shifts required to get the byte we want at element 7. 
unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1, 8198 0, 15, 14, 13, 12, 11, 10, 9}; 8199 unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0, 8200 1, 2, 3, 4, 5, 6, 7, 8}; 8201 8202 ArrayRef<int> Mask = N->getMask(); 8203 int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 8204 8205 // For each mask element, find out if we're just inserting something 8206 // from V2 into V1 or vice versa. 8207 // Possible permutations inserting an element from V2 into V1: 8208 // X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 8209 // 0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 8210 // ... 8211 // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X 8212 // Inserting from V1 into V2 will be similar, except mask range will be 8213 // [16,31]. 8214 8215 bool FoundCandidate = false; 8216 // If both vector operands for the shuffle are the same vector, the mask 8217 // will contain only elements from the first one and the second one will be 8218 // undef. 8219 unsigned VINSERTBSrcElem = IsLE ? 8 : 7; 8220 // Go through the mask of bytes to find an element that's being moved 8221 // from one vector to the other. 8222 for (unsigned i = 0; i < BytesInVector; ++i) { 8223 unsigned CurrentElement = Mask[i]; 8224 // If 2nd operand is undefined, we should only look for element 7 in the 8225 // Mask. 8226 if (V2.isUndef() && CurrentElement != VINSERTBSrcElem) 8227 continue; 8228 8229 bool OtherElementsInOrder = true; 8230 // Examine the other elements in the Mask to see if they're in original 8231 // order. 8232 for (unsigned j = 0; j < BytesInVector; ++j) { 8233 if (j == i) 8234 continue; 8235 // If CurrentElement is from V1 [0,15], then we expect the rest of the Mask 8236 // to be from V2 [16,31] and vice versa, unless the 2nd operand is undefined, 8237 // in which case we always assume we're picking from the 1st operand. 8238 int MaskOffset = 8239 (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0; 8240 if (Mask[j] != OriginalOrder[j] + MaskOffset) { 8241 OtherElementsInOrder = false; 8242 break; 8243 } 8244 } 8245 // If other elements are in original order, we record the number of shifts 8246 // we need to get the element we want into element 7. Also record which byte 8247 // in the vector we should insert into. 8248 if (OtherElementsInOrder) { 8249 // If 2nd operand is undefined, we assume no shifts and no swapping. 8250 if (V2.isUndef()) { 8251 ShiftElts = 0; 8252 Swap = false; 8253 } else { 8254 // Only need the last 4 bits for shifts because operands will be swapped if CurrentElement is >= 2^4. 8255 ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF] 8256 : BigEndianShifts[CurrentElement & 0xF]; 8257 Swap = CurrentElement < BytesInVector; 8258 } 8259 InsertAtByte = IsLE ? BytesInVector - (i + 1) : i; 8260 FoundCandidate = true; 8261 break; 8262 } 8263 } 8264 8265 if (!FoundCandidate) 8266 return SDValue(); 8267 8268 // Candidate found, construct the proper SDAG sequence with VINSERTB, 8269 // optionally with VECSHL if shift is required.
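// The VECSHL below is a vsldoi-style rotate of the source vector with itself
// that brings the byte we want into the fixed element vinsertb reads from;
// VECINSERT then drops that element into byte InsertAtByte of the target.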
8270 if (Swap) 8271 std::swap(V1, V2); 8272 if (V2.isUndef()) 8273 V2 = V1; 8274 if (ShiftElts) { 8275 SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2, 8276 DAG.getConstant(ShiftElts, dl, MVT::i32)); 8277 return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl, 8278 DAG.getConstant(InsertAtByte, dl, MVT::i32)); 8279 } 8280 return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2, 8281 DAG.getConstant(InsertAtByte, dl, MVT::i32)); 8282 } 8283 8284 /// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled 8285 /// by the VINSERTH instruction introduced in ISA 3.0, else just return default 8286 /// SDValue. 8287 SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N, 8288 SelectionDAG &DAG) const { 8289 const unsigned NumHalfWords = 8; 8290 const unsigned BytesInVector = NumHalfWords * 2; 8291 // Check that the shuffle is on half-words. 8292 if (!isNByteElemShuffleMask(N, 2, 1)) 8293 return SDValue(); 8294 8295 bool IsLE = Subtarget.isLittleEndian(); 8296 SDLoc dl(N); 8297 SDValue V1 = N->getOperand(0); 8298 SDValue V2 = N->getOperand(1); 8299 unsigned ShiftElts = 0, InsertAtByte = 0; 8300 bool Swap = false; 8301 8302 // Shifts required to get the half-word we want at element 3. 8303 unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5}; 8304 unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4}; 8305 8306 uint32_t Mask = 0; 8307 uint32_t OriginalOrderLow = 0x1234567; 8308 uint32_t OriginalOrderHigh = 0x89ABCDEF; 8309 // Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a 8310 // 32-bit space, only need 4-bit nibbles per element. 8311 for (unsigned i = 0; i < NumHalfWords; ++i) { 8312 unsigned MaskShift = (NumHalfWords - 1 - i) * 4; 8313 Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift); 8314 } 8315 8316 // For each mask element, find out if we're just inserting something 8317 // from V2 into V1 or vice versa. Possible permutations inserting an element 8318 // from V2 into V1: 8319 // X, 1, 2, 3, 4, 5, 6, 7 8320 // 0, X, 2, 3, 4, 5, 6, 7 8321 // 0, 1, X, 3, 4, 5, 6, 7 8322 // 0, 1, 2, X, 4, 5, 6, 7 8323 // 0, 1, 2, 3, X, 5, 6, 7 8324 // 0, 1, 2, 3, 4, X, 6, 7 8325 // 0, 1, 2, 3, 4, 5, X, 7 8326 // 0, 1, 2, 3, 4, 5, 6, X 8327 // Inserting from V1 into V2 will be similar, except mask range will be [8,15]. 8328 8329 bool FoundCandidate = false; 8330 // Go through the mask of half-words to find an element that's being moved 8331 // from one vector to the other. 8332 for (unsigned i = 0; i < NumHalfWords; ++i) { 8333 unsigned MaskShift = (NumHalfWords - 1 - i) * 4; 8334 uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF; 8335 uint32_t MaskOtherElts = ~(0xF << MaskShift); 8336 uint32_t TargetOrder = 0x0; 8337 8338 // If both vector operands for the shuffle are the same vector, the mask 8339 // will contain only elements from the first one and the second one will be 8340 // undef. 8341 if (V2.isUndef()) { 8342 ShiftElts = 0; 8343 unsigned VINSERTHSrcElem = IsLE ? 4 : 3; 8344 TargetOrder = OriginalOrderLow; 8345 Swap = false; 8346 // Skip if not the correct element or mask of other elements don't equal 8347 // to our expected order. 8348 if (MaskOneElt == VINSERTHSrcElem && 8349 (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) { 8350 InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2; 8351 FoundCandidate = true; 8352 break; 8353 } 8354 } else { // If both operands are defined. 8355 // Target order is [8,15] if the current mask is between [0,7]. 8356 TargetOrder = 8357 (MaskOneElt < NumHalfWords) ? 
OriginalOrderHigh : OriginalOrderLow; 8358 // Skip if mask of other elements don't equal our expected order. 8359 if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) { 8360 // We only need the last 3 bits for the number of shifts. 8361 ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7] 8362 : BigEndianShifts[MaskOneElt & 0x7]; 8363 InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2; 8364 Swap = MaskOneElt < NumHalfWords; 8365 FoundCandidate = true; 8366 break; 8367 } 8368 } 8369 } 8370 8371 if (!FoundCandidate) 8372 return SDValue(); 8373 8374 // Candidate found, construct the proper SDAG sequence with VINSERTH, 8375 // optionally with VECSHL if shift is required. 8376 if (Swap) 8377 std::swap(V1, V2); 8378 if (V2.isUndef()) 8379 V2 = V1; 8380 SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 8381 if (ShiftElts) { 8382 // Double ShiftElts because we're left shifting on v16i8 type. 8383 SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2, 8384 DAG.getConstant(2 * ShiftElts, dl, MVT::i32)); 8385 SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl); 8386 SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2, 8387 DAG.getConstant(InsertAtByte, dl, MVT::i32)); 8388 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); 8389 } 8390 SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 8391 SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2, 8392 DAG.getConstant(InsertAtByte, dl, MVT::i32)); 8393 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); 8394 } 8395 8396 /// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this 8397 /// is a shuffle we can handle in a single instruction, return it. Otherwise, 8398 /// return the code it can be lowered into. Worst case, it can always be 8399 /// lowered into a vperm. 
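/// The code below tries the cheaper forms in roughly this order: the ISA 3.0
/// insert patterns (xxinsertw, vinserth, vinsertb), the VSX xxsldwi, xxpermdi,
/// byte-reverse and splat patterns, the QPX forms, the fixed Altivec shuffle
/// immediates, the perfect-shuffle table (big-endian only), and finally a
/// vperm with a constant mask vector.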
8400 SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 8401 SelectionDAG &DAG) const { 8402 SDLoc dl(Op); 8403 SDValue V1 = Op.getOperand(0); 8404 SDValue V2 = Op.getOperand(1); 8405 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 8406 EVT VT = Op.getValueType(); 8407 bool isLittleEndian = Subtarget.isLittleEndian(); 8408 8409 unsigned ShiftElts, InsertAtByte; 8410 bool Swap = false; 8411 if (Subtarget.hasP9Vector() && 8412 PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, 8413 isLittleEndian)) { 8414 if (Swap) 8415 std::swap(V1, V2); 8416 SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 8417 SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2); 8418 if (ShiftElts) { 8419 SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2, 8420 DAG.getConstant(ShiftElts, dl, MVT::i32)); 8421 SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl, 8422 DAG.getConstant(InsertAtByte, dl, MVT::i32)); 8423 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); 8424 } 8425 SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2, 8426 DAG.getConstant(InsertAtByte, dl, MVT::i32)); 8427 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); 8428 } 8429 8430 if (Subtarget.hasP9Altivec()) { 8431 SDValue NewISDNode; 8432 if ((NewISDNode = lowerToVINSERTH(SVOp, DAG))) 8433 return NewISDNode; 8434 8435 if ((NewISDNode = lowerToVINSERTB(SVOp, DAG))) 8436 return NewISDNode; 8437 } 8438 8439 if (Subtarget.hasVSX() && 8440 PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { 8441 if (Swap) 8442 std::swap(V1, V2); 8443 SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 8444 SDValue Conv2 = 8445 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2); 8446 8447 SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2, 8448 DAG.getConstant(ShiftElts, dl, MVT::i32)); 8449 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl); 8450 } 8451 8452 if (Subtarget.hasVSX() && 8453 PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { 8454 if (Swap) 8455 std::swap(V1, V2); 8456 SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); 8457 SDValue Conv2 = 8458 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? 
V1 : V2); 8459 8460 SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2, 8461 DAG.getConstant(ShiftElts, dl, MVT::i32)); 8462 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI); 8463 } 8464 8465 if (Subtarget.hasP9Vector()) { 8466 if (PPC::isXXBRHShuffleMask(SVOp)) { 8467 SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 8468 SDValue ReveHWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v8i16, Conv); 8469 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord); 8470 } else if (PPC::isXXBRWShuffleMask(SVOp)) { 8471 SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 8472 SDValue ReveWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v4i32, Conv); 8473 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord); 8474 } else if (PPC::isXXBRDShuffleMask(SVOp)) { 8475 SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); 8476 SDValue ReveDWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Conv); 8477 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord); 8478 } else if (PPC::isXXBRQShuffleMask(SVOp)) { 8479 SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1); 8480 SDValue ReveQWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v1i128, Conv); 8481 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord); 8482 } 8483 } 8484 8485 if (Subtarget.hasVSX()) { 8486 if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { 8487 int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); 8488 8489 SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 8490 SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv, 8491 DAG.getConstant(SplatIdx, dl, MVT::i32)); 8492 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat); 8493 } 8494 8495 // Left shifts of 8 bytes are actually swaps. Convert accordingly. 8496 if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) { 8497 SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); 8498 SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv); 8499 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap); 8500 } 8501 } 8502 8503 if (Subtarget.hasQPX()) { 8504 if (VT.getVectorNumElements() != 4) 8505 return SDValue(); 8506 8507 if (V2.isUndef()) V2 = V1; 8508 8509 int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp); 8510 if (AlignIdx != -1) { 8511 return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2, 8512 DAG.getConstant(AlignIdx, dl, MVT::i32)); 8513 } else if (SVOp->isSplat()) { 8514 int SplatIdx = SVOp->getSplatIndex(); 8515 if (SplatIdx >= 4) { 8516 std::swap(V1, V2); 8517 SplatIdx -= 4; 8518 } 8519 8520 return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1, 8521 DAG.getConstant(SplatIdx, dl, MVT::i32)); 8522 } 8523 8524 // Lower this into a qvgpci/qvfperm pair. 8525 8526 // Compute the qvgpci literal 8527 unsigned idx = 0; 8528 for (unsigned i = 0; i < 4; ++i) { 8529 int m = SVOp->getMaskElt(i); 8530 unsigned mm = m >= 0 ? (unsigned) m : i; 8531 idx |= mm << (3-i)*3; 8532 } 8533 8534 SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64, 8535 DAG.getConstant(idx, dl, MVT::i32)); 8536 return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3); 8537 } 8538 8539 // Cases that are handled by instructions that take permute immediates 8540 // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be 8541 // selected by the instruction selector. 
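// The single-input checks in this block pass ShuffleKind 1 (both inputs are
// the same vector); the two-input checks further down pass kind 0 on
// big-endian and kind 2 on little-endian targets.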
8542 if (V2.isUndef()) { 8543 if (PPC::isSplatShuffleMask(SVOp, 1) || 8544 PPC::isSplatShuffleMask(SVOp, 2) || 8545 PPC::isSplatShuffleMask(SVOp, 4) || 8546 PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) || 8547 PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) || 8548 PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 || 8549 PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) || 8550 PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) || 8551 PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) || 8552 PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) || 8553 PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) || 8554 PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) || 8555 (Subtarget.hasP8Altivec() && ( 8556 PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) || 8557 PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) || 8558 PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) { 8559 return Op; 8560 } 8561 } 8562 8563 // Altivec has a variety of "shuffle immediates" that take two vector inputs 8564 // and produce a fixed permutation. If any of these match, do not lower to 8565 // VPERM. 8566 unsigned int ShuffleKind = isLittleEndian ? 2 : 0; 8567 if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) || 8568 PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) || 8569 PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 || 8570 PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) || 8571 PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) || 8572 PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) || 8573 PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) || 8574 PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) || 8575 PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) || 8576 (Subtarget.hasP8Altivec() && ( 8577 PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) || 8578 PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) || 8579 PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG)))) 8580 return Op; 8581 8582 // Check to see if this is a shuffle of 4-byte values. If so, we can use our 8583 // perfect shuffle table to emit an optimal matching sequence. 8584 ArrayRef<int> PermMask = SVOp->getMask(); 8585 8586 unsigned PFIndexes[4]; 8587 bool isFourElementShuffle = true; 8588 for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number 8589 unsigned EltNo = 8; // Start out undef. 8590 for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. 8591 if (PermMask[i*4+j] < 0) 8592 continue; // Undef, ignore it. 8593 8594 unsigned ByteSource = PermMask[i*4+j]; 8595 if ((ByteSource & 3) != j) { 8596 isFourElementShuffle = false; 8597 break; 8598 } 8599 8600 if (EltNo == 8) { 8601 EltNo = ByteSource/4; 8602 } else if (EltNo != ByteSource/4) { 8603 isFourElementShuffle = false; 8604 break; 8605 } 8606 } 8607 PFIndexes[i] = EltNo; 8608 } 8609 8610 // If this shuffle can be expressed as a shuffle of 4-byte elements, use the 8611 // perfect shuffle vector to determine if it is cost effective to do this as 8612 // discrete instructions, or whether we should use a vperm. 8613 // For now, we skip this for little endian until such time as we have a 8614 // little-endian perfect shuffle table. 8615 if (isFourElementShuffle && !isLittleEndian) { 8616 // Compute the index in the perfect shuffle table. 8617 unsigned PFTableIndex = 8618 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 8619 8620 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 8621 unsigned Cost = (PFEntry >> 30); 8622 8623 // Determining when to avoid vperm is tricky. Many things affect the cost 8624 // of vperm, particularly how many times the perm mask needs to be computed. 
8625 // For example, if the perm mask can be hoisted out of a loop or is already 8626 // used (perhaps because there are multiple permutes with the same shuffle 8627 // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of 8628 // the loop requires an extra register. 8629 // 8630 // As a compromise, we only emit discrete instructions if the shuffle can be 8631 // generated in 3 or fewer operations. When we have loop information 8632 // available, if this block is within a loop, we should avoid using vperm 8633 // for 3-operation perms and use a constant pool load instead. 8634 if (Cost < 3) 8635 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 8636 } 8637 8638 // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant 8639 // vector that will get spilled to the constant pool. 8640 if (V2.isUndef()) V2 = V1; 8641 8642 // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except 8643 // that it is in input element units, not in bytes. Convert now. 8644 8645 // For little endian, the order of the input vectors is reversed, and 8646 // the permutation mask is complemented with respect to 31. This is 8647 // necessary to produce proper semantics with the big-endian-biased vperm 8648 // instruction. 8649 EVT EltVT = V1.getValueType().getVectorElementType(); 8650 unsigned BytesPerElement = EltVT.getSizeInBits()/8; 8651 8652 SmallVector<SDValue, 16> ResultMask; 8653 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { 8654 unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i]; 8655 8656 for (unsigned j = 0; j != BytesPerElement; ++j) 8657 if (isLittleEndian) 8658 ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j), 8659 dl, MVT::i32)); 8660 else 8661 ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl, 8662 MVT::i32)); 8663 } 8664 8665 SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask); 8666 if (isLittleEndian) 8667 return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), 8668 V2, V1, VPermMask); 8669 else 8670 return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), 8671 V1, V2, VPermMask); 8672 } 8673 8674 /// getVectorCompareInfo - Given an intrinsic, return false if it is not a 8675 /// vector comparison. If it is, return true and fill in Opc/isDot with 8676 /// information about the intrinsic. 8677 static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, 8678 bool &isDot, const PPCSubtarget &Subtarget) { 8679 unsigned IntrinsicID = 8680 cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue(); 8681 CompareOpc = -1; 8682 isDot = false; 8683 switch (IntrinsicID) { 8684 default: 8685 return false; 8686 // Comparison predicates. 
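  // Note: CompareOpc is the numeric opcode field of the corresponding vcmp*
  // instruction, and the *_p predicate forms also set isDot so the record
  // form (which updates CR6) gets selected.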
8687 case Intrinsic::ppc_altivec_vcmpbfp_p: 8688 CompareOpc = 966; 8689 isDot = true; 8690 break; 8691 case Intrinsic::ppc_altivec_vcmpeqfp_p: 8692 CompareOpc = 198; 8693 isDot = true; 8694 break; 8695 case Intrinsic::ppc_altivec_vcmpequb_p: 8696 CompareOpc = 6; 8697 isDot = true; 8698 break; 8699 case Intrinsic::ppc_altivec_vcmpequh_p: 8700 CompareOpc = 70; 8701 isDot = true; 8702 break; 8703 case Intrinsic::ppc_altivec_vcmpequw_p: 8704 CompareOpc = 134; 8705 isDot = true; 8706 break; 8707 case Intrinsic::ppc_altivec_vcmpequd_p: 8708 if (Subtarget.hasP8Altivec()) { 8709 CompareOpc = 199; 8710 isDot = true; 8711 } else 8712 return false; 8713 break; 8714 case Intrinsic::ppc_altivec_vcmpneb_p: 8715 case Intrinsic::ppc_altivec_vcmpneh_p: 8716 case Intrinsic::ppc_altivec_vcmpnew_p: 8717 case Intrinsic::ppc_altivec_vcmpnezb_p: 8718 case Intrinsic::ppc_altivec_vcmpnezh_p: 8719 case Intrinsic::ppc_altivec_vcmpnezw_p: 8720 if (Subtarget.hasP9Altivec()) { 8721 switch (IntrinsicID) { 8722 default: 8723 llvm_unreachable("Unknown comparison intrinsic."); 8724 case Intrinsic::ppc_altivec_vcmpneb_p: 8725 CompareOpc = 7; 8726 break; 8727 case Intrinsic::ppc_altivec_vcmpneh_p: 8728 CompareOpc = 71; 8729 break; 8730 case Intrinsic::ppc_altivec_vcmpnew_p: 8731 CompareOpc = 135; 8732 break; 8733 case Intrinsic::ppc_altivec_vcmpnezb_p: 8734 CompareOpc = 263; 8735 break; 8736 case Intrinsic::ppc_altivec_vcmpnezh_p: 8737 CompareOpc = 327; 8738 break; 8739 case Intrinsic::ppc_altivec_vcmpnezw_p: 8740 CompareOpc = 391; 8741 break; 8742 } 8743 isDot = true; 8744 } else 8745 return false; 8746 break; 8747 case Intrinsic::ppc_altivec_vcmpgefp_p: 8748 CompareOpc = 454; 8749 isDot = true; 8750 break; 8751 case Intrinsic::ppc_altivec_vcmpgtfp_p: 8752 CompareOpc = 710; 8753 isDot = true; 8754 break; 8755 case Intrinsic::ppc_altivec_vcmpgtsb_p: 8756 CompareOpc = 774; 8757 isDot = true; 8758 break; 8759 case Intrinsic::ppc_altivec_vcmpgtsh_p: 8760 CompareOpc = 838; 8761 isDot = true; 8762 break; 8763 case Intrinsic::ppc_altivec_vcmpgtsw_p: 8764 CompareOpc = 902; 8765 isDot = true; 8766 break; 8767 case Intrinsic::ppc_altivec_vcmpgtsd_p: 8768 if (Subtarget.hasP8Altivec()) { 8769 CompareOpc = 967; 8770 isDot = true; 8771 } else 8772 return false; 8773 break; 8774 case Intrinsic::ppc_altivec_vcmpgtub_p: 8775 CompareOpc = 518; 8776 isDot = true; 8777 break; 8778 case Intrinsic::ppc_altivec_vcmpgtuh_p: 8779 CompareOpc = 582; 8780 isDot = true; 8781 break; 8782 case Intrinsic::ppc_altivec_vcmpgtuw_p: 8783 CompareOpc = 646; 8784 isDot = true; 8785 break; 8786 case Intrinsic::ppc_altivec_vcmpgtud_p: 8787 if (Subtarget.hasP8Altivec()) { 8788 CompareOpc = 711; 8789 isDot = true; 8790 } else 8791 return false; 8792 break; 8793 8794 // VSX predicate comparisons use the same infrastructure 8795 case Intrinsic::ppc_vsx_xvcmpeqdp_p: 8796 case Intrinsic::ppc_vsx_xvcmpgedp_p: 8797 case Intrinsic::ppc_vsx_xvcmpgtdp_p: 8798 case Intrinsic::ppc_vsx_xvcmpeqsp_p: 8799 case Intrinsic::ppc_vsx_xvcmpgesp_p: 8800 case Intrinsic::ppc_vsx_xvcmpgtsp_p: 8801 if (Subtarget.hasVSX()) { 8802 switch (IntrinsicID) { 8803 case Intrinsic::ppc_vsx_xvcmpeqdp_p: 8804 CompareOpc = 99; 8805 break; 8806 case Intrinsic::ppc_vsx_xvcmpgedp_p: 8807 CompareOpc = 115; 8808 break; 8809 case Intrinsic::ppc_vsx_xvcmpgtdp_p: 8810 CompareOpc = 107; 8811 break; 8812 case Intrinsic::ppc_vsx_xvcmpeqsp_p: 8813 CompareOpc = 67; 8814 break; 8815 case Intrinsic::ppc_vsx_xvcmpgesp_p: 8816 CompareOpc = 83; 8817 break; 8818 case Intrinsic::ppc_vsx_xvcmpgtsp_p: 8819 CompareOpc = 75; 
8820 break; 8821 } 8822 isDot = true; 8823 } else 8824 return false; 8825 break; 8826 8827 // Normal Comparisons. 8828 case Intrinsic::ppc_altivec_vcmpbfp: 8829 CompareOpc = 966; 8830 break; 8831 case Intrinsic::ppc_altivec_vcmpeqfp: 8832 CompareOpc = 198; 8833 break; 8834 case Intrinsic::ppc_altivec_vcmpequb: 8835 CompareOpc = 6; 8836 break; 8837 case Intrinsic::ppc_altivec_vcmpequh: 8838 CompareOpc = 70; 8839 break; 8840 case Intrinsic::ppc_altivec_vcmpequw: 8841 CompareOpc = 134; 8842 break; 8843 case Intrinsic::ppc_altivec_vcmpequd: 8844 if (Subtarget.hasP8Altivec()) 8845 CompareOpc = 199; 8846 else 8847 return false; 8848 break; 8849 case Intrinsic::ppc_altivec_vcmpneb: 8850 case Intrinsic::ppc_altivec_vcmpneh: 8851 case Intrinsic::ppc_altivec_vcmpnew: 8852 case Intrinsic::ppc_altivec_vcmpnezb: 8853 case Intrinsic::ppc_altivec_vcmpnezh: 8854 case Intrinsic::ppc_altivec_vcmpnezw: 8855 if (Subtarget.hasP9Altivec()) 8856 switch (IntrinsicID) { 8857 default: 8858 llvm_unreachable("Unknown comparison intrinsic."); 8859 case Intrinsic::ppc_altivec_vcmpneb: 8860 CompareOpc = 7; 8861 break; 8862 case Intrinsic::ppc_altivec_vcmpneh: 8863 CompareOpc = 71; 8864 break; 8865 case Intrinsic::ppc_altivec_vcmpnew: 8866 CompareOpc = 135; 8867 break; 8868 case Intrinsic::ppc_altivec_vcmpnezb: 8869 CompareOpc = 263; 8870 break; 8871 case Intrinsic::ppc_altivec_vcmpnezh: 8872 CompareOpc = 327; 8873 break; 8874 case Intrinsic::ppc_altivec_vcmpnezw: 8875 CompareOpc = 391; 8876 break; 8877 } 8878 else 8879 return false; 8880 break; 8881 case Intrinsic::ppc_altivec_vcmpgefp: 8882 CompareOpc = 454; 8883 break; 8884 case Intrinsic::ppc_altivec_vcmpgtfp: 8885 CompareOpc = 710; 8886 break; 8887 case Intrinsic::ppc_altivec_vcmpgtsb: 8888 CompareOpc = 774; 8889 break; 8890 case Intrinsic::ppc_altivec_vcmpgtsh: 8891 CompareOpc = 838; 8892 break; 8893 case Intrinsic::ppc_altivec_vcmpgtsw: 8894 CompareOpc = 902; 8895 break; 8896 case Intrinsic::ppc_altivec_vcmpgtsd: 8897 if (Subtarget.hasP8Altivec()) 8898 CompareOpc = 967; 8899 else 8900 return false; 8901 break; 8902 case Intrinsic::ppc_altivec_vcmpgtub: 8903 CompareOpc = 518; 8904 break; 8905 case Intrinsic::ppc_altivec_vcmpgtuh: 8906 CompareOpc = 582; 8907 break; 8908 case Intrinsic::ppc_altivec_vcmpgtuw: 8909 CompareOpc = 646; 8910 break; 8911 case Intrinsic::ppc_altivec_vcmpgtud: 8912 if (Subtarget.hasP8Altivec()) 8913 CompareOpc = 711; 8914 else 8915 return false; 8916 break; 8917 } 8918 return true; 8919 } 8920 8921 /// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom 8922 /// lower, do it, otherwise return null. 8923 SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 8924 SelectionDAG &DAG) const { 8925 unsigned IntrinsicID = 8926 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8927 8928 SDLoc dl(Op); 8929 8930 if (IntrinsicID == Intrinsic::thread_pointer) { 8931 // Reads the thread pointer register, used for __builtin_thread_pointer. 8932 if (Subtarget.isPPC64()) 8933 return DAG.getRegister(PPC::X13, MVT::i64); 8934 return DAG.getRegister(PPC::R2, MVT::i32); 8935 } 8936 8937 // If this is a lowered altivec predicate compare, CompareOpc is set to the 8938 // opcode number of the comparison. 8939 int CompareOpc; 8940 bool isDot; 8941 if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget)) 8942 return SDValue(); // Don't custom lower most intrinsics. 8943 8944 // If this is a non-dot comparison, make the VCMP node and we are done. 
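  // The non-dot form yields the full vector of per-element results, so we
  // just build the VCMP node and bitcast it to the intrinsic's result type.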
8945 if (!isDot) { 8946 SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(), 8947 Op.getOperand(1), Op.getOperand(2), 8948 DAG.getConstant(CompareOpc, dl, MVT::i32)); 8949 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp); 8950 } 8951 8952 // Create the PPCISD altivec 'dot' comparison node. 8953 SDValue Ops[] = { 8954 Op.getOperand(2), // LHS 8955 Op.getOperand(3), // RHS 8956 DAG.getConstant(CompareOpc, dl, MVT::i32) 8957 }; 8958 EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue }; 8959 SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); 8960 8961 // Now that we have the comparison, emit a copy from the CR to a GPR. 8962 // This is flagged to the above dot comparison. 8963 SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32, 8964 DAG.getRegister(PPC::CR6, MVT::i32), 8965 CompNode.getValue(1)); 8966 8967 // Unpack the result based on how the target uses it. 8968 unsigned BitNo; // Bit # of CR6. 8969 bool InvertBit; // Invert result? 8970 switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) { 8971 default: // Can't happen, don't crash on invalid number though. 8972 case 0: // Return the value of the EQ bit of CR6. 8973 BitNo = 0; InvertBit = false; 8974 break; 8975 case 1: // Return the inverted value of the EQ bit of CR6. 8976 BitNo = 0; InvertBit = true; 8977 break; 8978 case 2: // Return the value of the LT bit of CR6. 8979 BitNo = 2; InvertBit = false; 8980 break; 8981 case 3: // Return the inverted value of the LT bit of CR6. 8982 BitNo = 2; InvertBit = true; 8983 break; 8984 } 8985 8986 // Shift the bit into the low position. 8987 Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags, 8988 DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32)); 8989 // Isolate the bit. 8990 Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags, 8991 DAG.getConstant(1, dl, MVT::i32)); 8992 8993 // If we are supposed to, toggle the bit. 8994 if (InvertBit) 8995 Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags, 8996 DAG.getConstant(1, dl, MVT::i32)); 8997 return Flags; 8998 } 8999 9000 SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, 9001 SelectionDAG &DAG) const { 9002 // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to 9003 // the beginning of the argument list. 9004 int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1; 9005 SDLoc DL(Op); 9006 switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) { 9007 case Intrinsic::ppc_cfence: { 9008 assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument."); 9009 assert(Subtarget.isPPC64() && "Only 64-bit is supported for now."); 9010 return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other, 9011 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, 9012 Op.getOperand(ArgStart + 1)), 9013 Op.getOperand(0)), 9014 0); 9015 } 9016 default: 9017 break; 9018 } 9019 return SDValue(); 9020 } 9021 9022 SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const { 9023 // Check for a DIV with the same operands as this REM. 9024 for (auto UI : Op.getOperand(1)->uses()) { 9025 if ((Op.getOpcode() == ISD::SREM && UI->getOpcode() == ISD::SDIV) || 9026 (Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV)) 9027 if (UI->getOperand(0) == Op.getOperand(0) && 9028 UI->getOperand(1) == Op.getOperand(1)) 9029 return SDValue(); 9030 } 9031 return Op; 9032 } 9033 9034 // Lower scalar BSWAP64 to xxbrd. 
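// The sequence is roughly mtvsrdd (splat the GPR into a vector register),
// xxbrd (byte-reverse each doubleword), then an element extract (mfvsrd) to
// move the reversed doubleword back to a GPR, as the step comments below note.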
9035 SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const { 9036 SDLoc dl(Op); 9037 // MTVSRDD 9038 Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0), 9039 Op.getOperand(0)); 9040 // XXBRD 9041 Op = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Op); 9042 // MFVSRD 9043 int VectorIndex = 0; 9044 if (Subtarget.isLittleEndian()) 9045 VectorIndex = 1; 9046 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op, 9047 DAG.getTargetConstant(VectorIndex, dl, MVT::i32)); 9048 return Op; 9049 } 9050 9051 // ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be 9052 // compared to a value that is atomically loaded (atomic loads zero-extend). 9053 SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, 9054 SelectionDAG &DAG) const { 9055 assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP && 9056 "Expecting an atomic compare-and-swap here."); 9057 SDLoc dl(Op); 9058 auto *AtomicNode = cast<AtomicSDNode>(Op.getNode()); 9059 EVT MemVT = AtomicNode->getMemoryVT(); 9060 if (MemVT.getSizeInBits() >= 32) 9061 return Op; 9062 9063 SDValue CmpOp = Op.getOperand(2); 9064 // If this is already correctly zero-extended, leave it alone. 9065 auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits()); 9066 if (DAG.MaskedValueIsZero(CmpOp, HighBits)) 9067 return Op; 9068 9069 // Clear the high bits of the compare operand. 9070 unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1; 9071 SDValue NewCmpOp = 9072 DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp, 9073 DAG.getConstant(MaskVal, dl, MVT::i32)); 9074 9075 // Replace the existing compare operand with the properly zero-extended one. 9076 SmallVector<SDValue, 4> Ops; 9077 for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++) 9078 Ops.push_back(AtomicNode->getOperand(i)); 9079 Ops[2] = NewCmpOp; 9080 MachineMemOperand *MMO = AtomicNode->getMemOperand(); 9081 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other); 9082 auto NodeTy = 9083 (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16; 9084 return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO); 9085 } 9086 9087 SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, 9088 SelectionDAG &DAG) const { 9089 SDLoc dl(Op); 9090 // Create a stack slot that is 16-byte aligned. 9091 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 9092 int FrameIdx = MFI.CreateStackObject(16, 16, false); 9093 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 9094 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 9095 9096 // Store the input value into Value#0 of the stack slot. 9097 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, 9098 MachinePointerInfo()); 9099 // Load it out. 9100 return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo()); 9101 } 9102 9103 SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 9104 SelectionDAG &DAG) const { 9105 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && 9106 "Should only be called for ISD::INSERT_VECTOR_ELT"); 9107 9108 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 9109 // We have legal lowering for constant indices but not for variable ones. 9110 if (!C) 9111 return SDValue(); 9112 9113 EVT VT = Op.getValueType(); 9114 SDLoc dl(Op); 9115 SDValue V1 = Op.getOperand(0); 9116 SDValue V2 = Op.getOperand(1); 9117 // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types. 
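  // The constant element index is converted to a byte offset; on little-endian
  // targets the byte numbering inside the register is reversed, hence the
  // (16 - BytesInEachElement) - InsertAtByte adjustment below.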
9118 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 9119 SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2); 9120 unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8; 9121 unsigned InsertAtElement = C->getZExtValue(); 9122 unsigned InsertAtByte = InsertAtElement * BytesInEachElement; 9123 if (Subtarget.isLittleEndian()) { 9124 InsertAtByte = (16 - BytesInEachElement) - InsertAtByte; 9125 } 9126 return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz, 9127 DAG.getConstant(InsertAtByte, dl, MVT::i32)); 9128 } 9129 return Op; 9130 } 9131 9132 SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 9133 SelectionDAG &DAG) const { 9134 SDLoc dl(Op); 9135 SDNode *N = Op.getNode(); 9136 9137 assert(N->getOperand(0).getValueType() == MVT::v4i1 && 9138 "Unknown extract_vector_elt type"); 9139 9140 SDValue Value = N->getOperand(0); 9141 9142 // The first part of this is like the store lowering except that we don't 9143 // need to track the chain. 9144 9145 // The values are now known to be -1 (false) or 1 (true). To convert this 9146 // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). 9147 // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 9148 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); 9149 9150 // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to 9151 // understand how to form the extending load. 9152 SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); 9153 9154 Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); 9155 9156 // Now convert to an integer and store. 9157 Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, 9158 DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), 9159 Value); 9160 9161 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 9162 int FrameIdx = MFI.CreateStackObject(16, 16, false); 9163 MachinePointerInfo PtrInfo = 9164 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 9165 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 9166 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 9167 9168 SDValue StoreChain = DAG.getEntryNode(); 9169 SDValue Ops[] = {StoreChain, 9170 DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), 9171 Value, FIdx}; 9172 SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); 9173 9174 StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, 9175 dl, VTs, Ops, MVT::v4i32, PtrInfo); 9176 9177 // Extract the value requested. 9178 unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 9179 SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); 9180 Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); 9181 9182 SDValue IntVal = 9183 DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset)); 9184 9185 if (!Subtarget.useCRBits()) 9186 return IntVal; 9187 9188 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal); 9189 } 9190 9191 /// Lowering for QPX v4i1 loads 9192 SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, 9193 SelectionDAG &DAG) const { 9194 SDLoc dl(Op); 9195 LoadSDNode *LN = cast<LoadSDNode>(Op.getNode()); 9196 SDValue LoadChain = LN->getChain(); 9197 SDValue BasePtr = LN->getBasePtr(); 9198 9199 if (Op.getValueType() == MVT::v4f64 || 9200 Op.getValueType() == MVT::v4f32) { 9201 EVT MemVT = LN->getMemoryVT(); 9202 unsigned Alignment = LN->getAlignment(); 9203 9204 // If this load is properly aligned, then it is legal. 
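    // Otherwise, split it into four scalar element loads and reassemble the
    // result with a BUILD_VECTOR, merging the load chains with a TokenFactor.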
9205 if (Alignment >= MemVT.getStoreSize()) 9206 return Op; 9207 9208 EVT ScalarVT = Op.getValueType().getScalarType(), 9209 ScalarMemVT = MemVT.getScalarType(); 9210 unsigned Stride = ScalarMemVT.getStoreSize(); 9211 9212 SDValue Vals[4], LoadChains[4]; 9213 for (unsigned Idx = 0; Idx < 4; ++Idx) { 9214 SDValue Load; 9215 if (ScalarVT != ScalarMemVT) 9216 Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain, 9217 BasePtr, 9218 LN->getPointerInfo().getWithOffset(Idx * Stride), 9219 ScalarMemVT, MinAlign(Alignment, Idx * Stride), 9220 LN->getMemOperand()->getFlags(), LN->getAAInfo()); 9221 else 9222 Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr, 9223 LN->getPointerInfo().getWithOffset(Idx * Stride), 9224 MinAlign(Alignment, Idx * Stride), 9225 LN->getMemOperand()->getFlags(), LN->getAAInfo()); 9226 9227 if (Idx == 0 && LN->isIndexed()) { 9228 assert(LN->getAddressingMode() == ISD::PRE_INC && 9229 "Unknown addressing mode on vector load"); 9230 Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(), 9231 LN->getAddressingMode()); 9232 } 9233 9234 Vals[Idx] = Load; 9235 LoadChains[Idx] = Load.getValue(1); 9236 9237 BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, 9238 DAG.getConstant(Stride, dl, 9239 BasePtr.getValueType())); 9240 } 9241 9242 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); 9243 SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals); 9244 9245 if (LN->isIndexed()) { 9246 SDValue RetOps[] = { Value, Vals[0].getValue(1), TF }; 9247 return DAG.getMergeValues(RetOps, dl); 9248 } 9249 9250 SDValue RetOps[] = { Value, TF }; 9251 return DAG.getMergeValues(RetOps, dl); 9252 } 9253 9254 assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower"); 9255 assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported"); 9256 9257 // To lower v4i1 from a byte array, we load the byte elements of the 9258 // vector and then reuse the BUILD_VECTOR logic. 9259 9260 SDValue VectElmts[4], VectElmtChains[4]; 9261 for (unsigned i = 0; i < 4; ++i) { 9262 SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); 9263 Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); 9264 9265 VectElmts[i] = DAG.getExtLoad( 9266 ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx, 9267 LN->getPointerInfo().getWithOffset(i), MVT::i8, 9268 /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo()); 9269 VectElmtChains[i] = VectElmts[i].getValue(1); 9270 } 9271 9272 LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains); 9273 SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts); 9274 9275 SDValue RVals[] = { Value, LoadChain }; 9276 return DAG.getMergeValues(RVals, dl); 9277 } 9278 9279 /// Lowering for QPX v4i1 stores 9280 SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, 9281 SelectionDAG &DAG) const { 9282 SDLoc dl(Op); 9283 StoreSDNode *SN = cast<StoreSDNode>(Op.getNode()); 9284 SDValue StoreChain = SN->getChain(); 9285 SDValue BasePtr = SN->getBasePtr(); 9286 SDValue Value = SN->getValue(); 9287 9288 if (Value.getValueType() == MVT::v4f64 || 9289 Value.getValueType() == MVT::v4f32) { 9290 EVT MemVT = SN->getMemoryVT(); 9291 unsigned Alignment = SN->getAlignment(); 9292 9293 // If this store is properly aligned, then it is legal. 
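    // Otherwise, mirror the load case above: extract the four elements and
    // emit four scalar stores, joined together with a TokenFactor.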
9294 if (Alignment >= MemVT.getStoreSize()) 9295 return Op; 9296 9297 EVT ScalarVT = Value.getValueType().getScalarType(), 9298 ScalarMemVT = MemVT.getScalarType(); 9299 unsigned Stride = ScalarMemVT.getStoreSize(); 9300 9301 SDValue Stores[4]; 9302 for (unsigned Idx = 0; Idx < 4; ++Idx) { 9303 SDValue Ex = DAG.getNode( 9304 ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, 9305 DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout()))); 9306 SDValue Store; 9307 if (ScalarVT != ScalarMemVT) 9308 Store = 9309 DAG.getTruncStore(StoreChain, dl, Ex, BasePtr, 9310 SN->getPointerInfo().getWithOffset(Idx * Stride), 9311 ScalarMemVT, MinAlign(Alignment, Idx * Stride), 9312 SN->getMemOperand()->getFlags(), SN->getAAInfo()); 9313 else 9314 Store = DAG.getStore(StoreChain, dl, Ex, BasePtr, 9315 SN->getPointerInfo().getWithOffset(Idx * Stride), 9316 MinAlign(Alignment, Idx * Stride), 9317 SN->getMemOperand()->getFlags(), SN->getAAInfo()); 9318 9319 if (Idx == 0 && SN->isIndexed()) { 9320 assert(SN->getAddressingMode() == ISD::PRE_INC && 9321 "Unknown addressing mode on vector store"); 9322 Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(), 9323 SN->getAddressingMode()); 9324 } 9325 9326 BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, 9327 DAG.getConstant(Stride, dl, 9328 BasePtr.getValueType())); 9329 Stores[Idx] = Store; 9330 } 9331 9332 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); 9333 9334 if (SN->isIndexed()) { 9335 SDValue RetOps[] = { TF, Stores[0].getValue(1) }; 9336 return DAG.getMergeValues(RetOps, dl); 9337 } 9338 9339 return TF; 9340 } 9341 9342 assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported"); 9343 assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower"); 9344 9345 // The values are now known to be -1 (false) or 1 (true). To convert this 9346 // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). 9347 // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 9348 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); 9349 9350 // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to 9351 // understand how to form the extending load. 9352 SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); 9353 9354 Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); 9355 9356 // Now convert to an integer and store. 9357 Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, 9358 DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), 9359 Value); 9360 9361 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 9362 int FrameIdx = MFI.CreateStackObject(16, 16, false); 9363 MachinePointerInfo PtrInfo = 9364 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); 9365 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 9366 SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); 9367 9368 SDValue Ops[] = {StoreChain, 9369 DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), 9370 Value, FIdx}; 9371 SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); 9372 9373 StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, 9374 dl, VTs, Ops, MVT::v4i32, PtrInfo); 9375 9376 // Move data into the byte array. 
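  // qvstfiw wrote the four 32-bit lanes to the stack slot above; reload each
  // word and truncate-store its value as a byte to the original destination.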
9377 SDValue Loads[4], LoadChains[4]; 9378 for (unsigned i = 0; i < 4; ++i) { 9379 unsigned Offset = 4*i; 9380 SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); 9381 Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); 9382 9383 Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx, 9384 PtrInfo.getWithOffset(Offset)); 9385 LoadChains[i] = Loads[i].getValue(1); 9386 } 9387 9388 StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); 9389 9390 SDValue Stores[4]; 9391 for (unsigned i = 0; i < 4; ++i) { 9392 SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); 9393 Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); 9394 9395 Stores[i] = DAG.getTruncStore( 9396 StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i), 9397 MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(), 9398 SN->getAAInfo()); 9399 } 9400 9401 StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); 9402 9403 return StoreChain; 9404 } 9405 9406 SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { 9407 SDLoc dl(Op); 9408 if (Op.getValueType() == MVT::v4i32) { 9409 SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); 9410 9411 SDValue Zero = BuildSplatI( 0, 1, MVT::v4i32, DAG, dl); 9412 SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt. 9413 9414 SDValue RHSSwap = // = vrlw RHS, 16 9415 BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl); 9416 9417 // Shrinkify inputs to v8i16. 9418 LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS); 9419 RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS); 9420 RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap); 9421 9422 // Low parts multiplied together, generating 32-bit results (we ignore the 9423 // top parts). 9424 SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh, 9425 LHS, RHS, DAG, dl, MVT::v4i32); 9426 9427 SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm, 9428 LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32); 9429 // Shift the high parts up 16 bits. 9430 HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd, 9431 Neg16, DAG, dl); 9432 return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd); 9433 } else if (Op.getValueType() == MVT::v8i16) { 9434 SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); 9435 9436 SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl); 9437 9438 return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm, 9439 LHS, RHS, Zero, DAG, dl); 9440 } else if (Op.getValueType() == MVT::v16i8) { 9441 SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); 9442 bool isLittleEndian = Subtarget.isLittleEndian(); 9443 9444 // Multiply the even 8-bit parts, producing 16-bit sums. 9445 SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub, 9446 LHS, RHS, DAG, dl, MVT::v8i16); 9447 EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts); 9448 9449 // Multiply the odd 8-bit parts, producing 16-bit sums. 9450 SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub, 9451 LHS, RHS, DAG, dl, MVT::v8i16); 9452 OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts); 9453 9454 // Merge the results together. Because vmuleub and vmuloub are 9455 // instructions with a big-endian bias, we must reverse the 9456 // element numbering and reverse the meaning of "odd" and "even" 9457 // when generating little endian code. 
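    // For example, big endian uses the mask 1,17,3,19,... (the low byte of
    // each 16-bit partial product), while little endian uses 0,16,2,18,...
    // with the odd/even operands swapped.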
9458 int Ops[16]; 9459 for (unsigned i = 0; i != 8; ++i) { 9460 if (isLittleEndian) { 9461 Ops[i*2 ] = 2*i; 9462 Ops[i*2+1] = 2*i+16; 9463 } else { 9464 Ops[i*2 ] = 2*i+1; 9465 Ops[i*2+1] = 2*i+1+16; 9466 } 9467 } 9468 if (isLittleEndian) 9469 return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops); 9470 else 9471 return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops); 9472 } else { 9473 llvm_unreachable("Unknown mul to lower!"); 9474 } 9475 } 9476 9477 SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { 9478 9479 assert(Op.getOpcode() == ISD::ABS && "Should only be called for ISD::ABS"); 9480 9481 EVT VT = Op.getValueType(); 9482 assert(VT.isVector() && 9483 "Only set vector abs as custom, scalar abs shouldn't reach here!"); 9484 assert((VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || 9485 VT == MVT::v16i8) && 9486 "Unexpected vector element type!"); 9487 assert((VT != MVT::v2i64 || Subtarget.hasP8Altivec()) && 9488 "Current subtarget doesn't support smax v2i64!"); 9489 9490 // For vector abs, it can be lowered to: 9491 // abs x 9492 // ==> 9493 // y = -x 9494 // smax(x, y) 9495 9496 SDLoc dl(Op); 9497 SDValue X = Op.getOperand(0); 9498 SDValue Zero = DAG.getConstant(0, dl, VT); 9499 SDValue Y = DAG.getNode(ISD::SUB, dl, VT, Zero, X); 9500 9501 // SMAX patch https://reviews.llvm.org/D47332 9502 // hasn't landed yet, so use intrinsic first here. 9503 // TODO: Should use SMAX directly once SMAX patch landed 9504 Intrinsic::ID BifID = Intrinsic::ppc_altivec_vmaxsw; 9505 if (VT == MVT::v2i64) 9506 BifID = Intrinsic::ppc_altivec_vmaxsd; 9507 else if (VT == MVT::v8i16) 9508 BifID = Intrinsic::ppc_altivec_vmaxsh; 9509 else if (VT == MVT::v16i8) 9510 BifID = Intrinsic::ppc_altivec_vmaxsb; 9511 9512 return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT); 9513 } 9514 9515 /// LowerOperation - Provide custom lowering hooks for some operations. 9516 /// 9517 SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 9518 switch (Op.getOpcode()) { 9519 default: llvm_unreachable("Wasn't expecting to be able to lower this!"); 9520 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 9521 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 9522 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 9523 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 9524 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 9525 case ISD::SETCC: return LowerSETCC(Op, DAG); 9526 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); 9527 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); 9528 9529 // Variable argument lowering. 9530 case ISD::VASTART: return LowerVASTART(Op, DAG); 9531 case ISD::VAARG: return LowerVAARG(Op, DAG); 9532 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 9533 9534 case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG); 9535 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 9536 case ISD::GET_DYNAMIC_AREA_OFFSET: 9537 return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG); 9538 9539 // Exception handling lowering. 
9540 case ISD::EH_DWARF_CFA: return LowerEH_DWARF_CFA(Op, DAG); 9541 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); 9542 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); 9543 9544 case ISD::LOAD: return LowerLOAD(Op, DAG); 9545 case ISD::STORE: return LowerSTORE(Op, DAG); 9546 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); 9547 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 9548 case ISD::FP_TO_UINT: 9549 case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, SDLoc(Op)); 9550 case ISD::UINT_TO_FP: 9551 case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 9552 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 9553 9554 // Lower 64-bit shifts. 9555 case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG); 9556 case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG); 9557 case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG); 9558 9559 // Vector-related lowering. 9560 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 9561 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 9562 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 9563 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 9564 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 9565 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 9566 case ISD::MUL: return LowerMUL(Op, DAG); 9567 case ISD::ABS: return LowerABS(Op, DAG); 9568 9569 // For counter-based loop handling. 9570 case ISD::INTRINSIC_W_CHAIN: return SDValue(); 9571 9572 case ISD::BITCAST: return LowerBITCAST(Op, DAG); 9573 9574 // Frame & Return address. 9575 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 9576 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 9577 9578 case ISD::INTRINSIC_VOID: 9579 return LowerINTRINSIC_VOID(Op, DAG); 9580 case ISD::SREM: 9581 case ISD::UREM: 9582 return LowerREM(Op, DAG); 9583 case ISD::BSWAP: 9584 return LowerBSWAP(Op, DAG); 9585 case ISD::ATOMIC_CMP_SWAP: 9586 return LowerATOMIC_CMP_SWAP(Op, DAG); 9587 } 9588 } 9589 9590 void PPCTargetLowering::ReplaceNodeResults(SDNode *N, 9591 SmallVectorImpl<SDValue>&Results, 9592 SelectionDAG &DAG) const { 9593 SDLoc dl(N); 9594 switch (N->getOpcode()) { 9595 default: 9596 llvm_unreachable("Do not know how to custom type legalize this operation!"); 9597 case ISD::READCYCLECOUNTER: { 9598 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 9599 SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0)); 9600 9601 Results.push_back(RTB); 9602 Results.push_back(RTB.getValue(1)); 9603 Results.push_back(RTB.getValue(2)); 9604 break; 9605 } 9606 case ISD::INTRINSIC_W_CHAIN: { 9607 if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 9608 Intrinsic::ppc_is_decremented_ctr_nonzero) 9609 break; 9610 9611 assert(N->getValueType(0) == MVT::i1 && 9612 "Unexpected result type for CTR decrement intrinsic"); 9613 EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), 9614 N->getValueType(0)); 9615 SDVTList VTs = DAG.getVTList(SVT, MVT::Other); 9616 SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0), 9617 N->getOperand(1)); 9618 9619 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewInt)); 9620 Results.push_back(NewInt.getValue(1)); 9621 break; 9622 } 9623 case ISD::VAARG: { 9624 if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64()) 9625 return; 9626 9627 EVT VT = N->getValueType(0); 9628 9629 if (VT == MVT::i64) { 9630 SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG); 9631 9632 
Results.push_back(NewNode); 9633 Results.push_back(NewNode.getValue(1)); 9634 } 9635 return; 9636 } 9637 case ISD::FP_TO_SINT: 9638 case ISD::FP_TO_UINT: 9639 // LowerFP_TO_INT() can only handle f32 and f64. 9640 if (N->getOperand(0).getValueType() == MVT::ppcf128) 9641 return; 9642 Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl)); 9643 return; 9644 case ISD::BITCAST: 9645 // Don't handle bitcast here. 9646 return; 9647 } 9648 } 9649 9650 //===----------------------------------------------------------------------===// 9651 // Other Lowering Code 9652 //===----------------------------------------------------------------------===// 9653 9654 static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) { 9655 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 9656 Function *Func = Intrinsic::getDeclaration(M, Id); 9657 return Builder.CreateCall(Func, {}); 9658 } 9659 9660 // The mappings for emitLeading/TrailingFence is taken from 9661 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html 9662 Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, 9663 Instruction *Inst, 9664 AtomicOrdering Ord) const { 9665 if (Ord == AtomicOrdering::SequentiallyConsistent) 9666 return callIntrinsic(Builder, Intrinsic::ppc_sync); 9667 if (isReleaseOrStronger(Ord)) 9668 return callIntrinsic(Builder, Intrinsic::ppc_lwsync); 9669 return nullptr; 9670 } 9671 9672 Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, 9673 Instruction *Inst, 9674 AtomicOrdering Ord) const { 9675 if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) { 9676 // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and 9677 // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html 9678 // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. 9679 if (isa<LoadInst>(Inst) && Subtarget.isPPC64()) 9680 return Builder.CreateCall( 9681 Intrinsic::getDeclaration( 9682 Builder.GetInsertBlock()->getParent()->getParent(), 9683 Intrinsic::ppc_cfence, {Inst->getType()}), 9684 {Inst}); 9685 // FIXME: Can use isync for rmw operation. 9686 return callIntrinsic(Builder, Intrinsic::ppc_lwsync); 9687 } 9688 return nullptr; 9689 } 9690 9691 MachineBasicBlock * 9692 PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, 9693 unsigned AtomicSize, 9694 unsigned BinOpcode, 9695 unsigned CmpOpcode, 9696 unsigned CmpPred) const { 9697 // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. 
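  // The expansion is a load-reserve/store-conditional retry loop: l[bhwd]arx,
  // the binary op (if any), st[bhwd]cx., and a bne- back to the top. For the
  // min/max forms, an extra compare-and-branch skips the store when the loaded
  // value already satisfies the predicate.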
9698 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 9699 9700 auto LoadMnemonic = PPC::LDARX; 9701 auto StoreMnemonic = PPC::STDCX; 9702 switch (AtomicSize) { 9703 default: 9704 llvm_unreachable("Unexpected size of atomic entity"); 9705 case 1: 9706 LoadMnemonic = PPC::LBARX; 9707 StoreMnemonic = PPC::STBCX; 9708 assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4"); 9709 break; 9710 case 2: 9711 LoadMnemonic = PPC::LHARX; 9712 StoreMnemonic = PPC::STHCX; 9713 assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4"); 9714 break; 9715 case 4: 9716 LoadMnemonic = PPC::LWARX; 9717 StoreMnemonic = PPC::STWCX; 9718 break; 9719 case 8: 9720 LoadMnemonic = PPC::LDARX; 9721 StoreMnemonic = PPC::STDCX; 9722 break; 9723 } 9724 9725 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9726 MachineFunction *F = BB->getParent(); 9727 MachineFunction::iterator It = ++BB->getIterator(); 9728 9729 unsigned dest = MI.getOperand(0).getReg(); 9730 unsigned ptrA = MI.getOperand(1).getReg(); 9731 unsigned ptrB = MI.getOperand(2).getReg(); 9732 unsigned incr = MI.getOperand(3).getReg(); 9733 DebugLoc dl = MI.getDebugLoc(); 9734 9735 MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); 9736 MachineBasicBlock *loop2MBB = 9737 CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr; 9738 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); 9739 F->insert(It, loopMBB); 9740 if (CmpOpcode) 9741 F->insert(It, loop2MBB); 9742 F->insert(It, exitMBB); 9743 exitMBB->splice(exitMBB->begin(), BB, 9744 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9745 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 9746 9747 MachineRegisterInfo &RegInfo = F->getRegInfo(); 9748 unsigned TmpReg = (!BinOpcode) ? incr : 9749 RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass 9750 : &PPC::GPRCRegClass); 9751 9752 // thisMBB: 9753 // ... 9754 // fallthrough --> loopMBB 9755 BB->addSuccessor(loopMBB); 9756 9757 // loopMBB: 9758 // l[wd]arx dest, ptr 9759 // add r0, dest, incr 9760 // st[wd]cx. r0, ptr 9761 // bne- loopMBB 9762 // fallthrough --> exitMBB 9763 9764 // For max/min... 9765 // loopMBB: 9766 // l[wd]arx dest, ptr 9767 // cmpl?[wd] incr, dest 9768 // bgt exitMBB 9769 // loop2MBB: 9770 // st[wd]cx. dest, ptr 9771 // bne- loopMBB 9772 // fallthrough --> exitMBB 9773 9774 BB = loopMBB; 9775 BuildMI(BB, dl, TII->get(LoadMnemonic), dest) 9776 .addReg(ptrA).addReg(ptrB); 9777 if (BinOpcode) 9778 BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest); 9779 if (CmpOpcode) { 9780 // Signed comparisons of byte or halfword values must be sign-extended. 9781 if (CmpOpcode == PPC::CMPW && AtomicSize < 4) { 9782 unsigned ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); 9783 BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH), 9784 ExtReg).addReg(dest); 9785 BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) 9786 .addReg(incr).addReg(ExtReg); 9787 } else 9788 BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) 9789 .addReg(incr).addReg(dest); 9790 9791 BuildMI(BB, dl, TII->get(PPC::BCC)) 9792 .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB); 9793 BB->addSuccessor(loop2MBB); 9794 BB->addSuccessor(exitMBB); 9795 BB = loop2MBB; 9796 } 9797 BuildMI(BB, dl, TII->get(StoreMnemonic)) 9798 .addReg(TmpReg).addReg(ptrA).addReg(ptrB); 9799 BuildMI(BB, dl, TII->get(PPC::BCC)) 9800 .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); 9801 BB->addSuccessor(loopMBB); 9802 BB->addSuccessor(exitMBB); 9803 9804 // exitMBB: 9805 // ... 
9806 BB = exitMBB; 9807 return BB; 9808 } 9809 9810 MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( 9811 MachineInstr &MI, MachineBasicBlock *BB, 9812 bool is8bit, // operation 9813 unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const { 9814 // If we support part-word atomic mnemonics, just use them 9815 if (Subtarget.hasPartwordAtomics()) 9816 return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, CmpOpcode, 9817 CmpPred); 9818 9819 // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. 9820 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 9821 // In 64 bit mode we have to use 64 bits for addresses, even though the 9822 // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address 9823 // registers without caring whether they're 32 or 64, but here we're 9824 // doing actual arithmetic on the addresses. 9825 bool is64bit = Subtarget.isPPC64(); 9826 bool isLittleEndian = Subtarget.isLittleEndian(); 9827 unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; 9828 9829 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9830 MachineFunction *F = BB->getParent(); 9831 MachineFunction::iterator It = ++BB->getIterator(); 9832 9833 unsigned dest = MI.getOperand(0).getReg(); 9834 unsigned ptrA = MI.getOperand(1).getReg(); 9835 unsigned ptrB = MI.getOperand(2).getReg(); 9836 unsigned incr = MI.getOperand(3).getReg(); 9837 DebugLoc dl = MI.getDebugLoc(); 9838 9839 MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); 9840 MachineBasicBlock *loop2MBB = 9841 CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr; 9842 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); 9843 F->insert(It, loopMBB); 9844 if (CmpOpcode) 9845 F->insert(It, loop2MBB); 9846 F->insert(It, exitMBB); 9847 exitMBB->splice(exitMBB->begin(), BB, 9848 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 9849 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 9850 9851 MachineRegisterInfo &RegInfo = F->getRegInfo(); 9852 const TargetRegisterClass *RC = 9853 is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; 9854 const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; 9855 9856 unsigned PtrReg = RegInfo.createVirtualRegister(RC); 9857 unsigned Shift1Reg = RegInfo.createVirtualRegister(GPRC); 9858 unsigned ShiftReg = 9859 isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC); 9860 unsigned Incr2Reg = RegInfo.createVirtualRegister(GPRC); 9861 unsigned MaskReg = RegInfo.createVirtualRegister(GPRC); 9862 unsigned Mask2Reg = RegInfo.createVirtualRegister(GPRC); 9863 unsigned Mask3Reg = RegInfo.createVirtualRegister(GPRC); 9864 unsigned Tmp2Reg = RegInfo.createVirtualRegister(GPRC); 9865 unsigned Tmp3Reg = RegInfo.createVirtualRegister(GPRC); 9866 unsigned Tmp4Reg = RegInfo.createVirtualRegister(GPRC); 9867 unsigned TmpDestReg = RegInfo.createVirtualRegister(GPRC); 9868 unsigned Ptr1Reg; 9869 unsigned TmpReg = 9870 (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(GPRC); 9871 9872 // thisMBB: 9873 // ... 9874 // fallthrough --> loopMBB 9875 BB->addSuccessor(loopMBB); 9876 9877 // The 4-byte load must be aligned, while a char or short may be 9878 // anywhere in the word. Hence all this nasty bookkeeping code. 
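  // In short: round the address down to the containing word, compute the shift
  // that puts the byte/halfword in its lane, build a mask for that lane, and
  // run the usual lwarx/stwcx. loop on the whole word, updating only the lane
  // of interest.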
9879 // add ptr1, ptrA, ptrB [copy if ptrA==0] 9880 // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] 9881 // xori shift, shift1, 24 [16] 9882 // rlwinm ptr, ptr1, 0, 0, 29 9883 // slw incr2, incr, shift 9884 // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] 9885 // slw mask, mask2, shift 9886 // loopMBB: 9887 // lwarx tmpDest, ptr 9888 // add tmp, tmpDest, incr2 9889 // andc tmp2, tmpDest, mask 9890 // and tmp3, tmp, mask 9891 // or tmp4, tmp3, tmp2 9892 // stwcx. tmp4, ptr 9893 // bne- loopMBB 9894 // fallthrough --> exitMBB 9895 // srw dest, tmpDest, shift 9896 if (ptrA != ZeroReg) { 9897 Ptr1Reg = RegInfo.createVirtualRegister(RC); 9898 BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) 9899 .addReg(ptrA) 9900 .addReg(ptrB); 9901 } else { 9902 Ptr1Reg = ptrB; 9903 } 9904 // We need use 32-bit subregister to avoid mismatch register class in 64-bit 9905 // mode. 9906 BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg) 9907 .addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0) 9908 .addImm(3) 9909 .addImm(27) 9910 .addImm(is8bit ? 28 : 27); 9911 if (!isLittleEndian) 9912 BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg) 9913 .addReg(Shift1Reg) 9914 .addImm(is8bit ? 24 : 16); 9915 if (is64bit) 9916 BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) 9917 .addReg(Ptr1Reg) 9918 .addImm(0) 9919 .addImm(61); 9920 else 9921 BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) 9922 .addReg(Ptr1Reg) 9923 .addImm(0) 9924 .addImm(0) 9925 .addImm(29); 9926 BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg).addReg(incr).addReg(ShiftReg); 9927 if (is8bit) 9928 BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); 9929 else { 9930 BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); 9931 BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg) 9932 .addReg(Mask3Reg) 9933 .addImm(65535); 9934 } 9935 BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) 9936 .addReg(Mask2Reg) 9937 .addReg(ShiftReg); 9938 9939 BB = loopMBB; 9940 BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) 9941 .addReg(ZeroReg) 9942 .addReg(PtrReg); 9943 if (BinOpcode) 9944 BuildMI(BB, dl, TII->get(BinOpcode), TmpReg) 9945 .addReg(Incr2Reg) 9946 .addReg(TmpDestReg); 9947 BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg) 9948 .addReg(TmpDestReg) 9949 .addReg(MaskReg); 9950 BuildMI(BB, dl, TII->get(PPC::AND), Tmp3Reg).addReg(TmpReg).addReg(MaskReg); 9951 if (CmpOpcode) { 9952 // For unsigned comparisons, we can directly compare the shifted values. 9953 // For signed comparisons we shift and sign extend. 9954 unsigned SReg = RegInfo.createVirtualRegister(GPRC); 9955 BuildMI(BB, dl, TII->get(PPC::AND), SReg) 9956 .addReg(TmpDestReg) 9957 .addReg(MaskReg); 9958 unsigned ValueReg = SReg; 9959 unsigned CmpReg = Incr2Reg; 9960 if (CmpOpcode == PPC::CMPW) { 9961 ValueReg = RegInfo.createVirtualRegister(GPRC); 9962 BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg) 9963 .addReg(SReg) 9964 .addReg(ShiftReg); 9965 unsigned ValueSReg = RegInfo.createVirtualRegister(GPRC); 9966 BuildMI(BB, dl, TII->get(is8bit ? 
PPC::EXTSB : PPC::EXTSH), ValueSReg) 9967 .addReg(ValueReg); 9968 ValueReg = ValueSReg; 9969 CmpReg = incr; 9970 } 9971 BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) 9972 .addReg(CmpReg) 9973 .addReg(ValueReg); 9974 BuildMI(BB, dl, TII->get(PPC::BCC)) 9975 .addImm(CmpPred) 9976 .addReg(PPC::CR0) 9977 .addMBB(exitMBB); 9978 BB->addSuccessor(loop2MBB); 9979 BB->addSuccessor(exitMBB); 9980 BB = loop2MBB; 9981 } 9982 BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg).addReg(Tmp3Reg).addReg(Tmp2Reg); 9983 BuildMI(BB, dl, TII->get(PPC::STWCX)) 9984 .addReg(Tmp4Reg) 9985 .addReg(ZeroReg) 9986 .addReg(PtrReg); 9987 BuildMI(BB, dl, TII->get(PPC::BCC)) 9988 .addImm(PPC::PRED_NE) 9989 .addReg(PPC::CR0) 9990 .addMBB(loopMBB); 9991 BB->addSuccessor(loopMBB); 9992 BB->addSuccessor(exitMBB); 9993 9994 // exitMBB: 9995 // ... 9996 BB = exitMBB; 9997 BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest) 9998 .addReg(TmpDestReg) 9999 .addReg(ShiftReg); 10000 return BB; 10001 } 10002 10003 llvm::MachineBasicBlock * 10004 PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, 10005 MachineBasicBlock *MBB) const { 10006 DebugLoc DL = MI.getDebugLoc(); 10007 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 10008 const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); 10009 10010 MachineFunction *MF = MBB->getParent(); 10011 MachineRegisterInfo &MRI = MF->getRegInfo(); 10012 10013 const BasicBlock *BB = MBB->getBasicBlock(); 10014 MachineFunction::iterator I = ++MBB->getIterator(); 10015 10016 unsigned DstReg = MI.getOperand(0).getReg(); 10017 const TargetRegisterClass *RC = MRI.getRegClass(DstReg); 10018 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); 10019 unsigned mainDstReg = MRI.createVirtualRegister(RC); 10020 unsigned restoreDstReg = MRI.createVirtualRegister(RC); 10021 10022 MVT PVT = getPointerTy(MF->getDataLayout()); 10023 assert((PVT == MVT::i64 || PVT == MVT::i32) && 10024 "Invalid Pointer Size!"); 10025 // For v = setjmp(buf), we generate 10026 // 10027 // thisMBB: 10028 // SjLjSetup mainMBB 10029 // bl mainMBB 10030 // v_restore = 1 10031 // b sinkMBB 10032 // 10033 // mainMBB: 10034 // buf[LabelOffset] = LR 10035 // v_main = 0 10036 // 10037 // sinkMBB: 10038 // v = phi(main, restore) 10039 // 10040 10041 MachineBasicBlock *thisMBB = MBB; 10042 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); 10043 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); 10044 MF->insert(I, mainMBB); 10045 MF->insert(I, sinkMBB); 10046 10047 MachineInstrBuilder MIB; 10048 10049 // Transfer the remainder of BB and its successor edges to sinkMBB. 10050 sinkMBB->splice(sinkMBB->begin(), MBB, 10051 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 10052 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); 10053 10054 // Note that the structure of the jmp_buf used here is not compatible 10055 // with that used by libc, and is not designed to be. Specifically, it 10056 // stores only those 'reserved' registers that LLVM does not otherwise 10057 // understand how to spill. Also, by convention, by the time this 10058 // intrinsic is called, Clang has already stored the frame address in the 10059 // first slot of the buffer and stack address in the third. Following the 10060 // X86 target code, we'll store the jump address in the second slot. We also 10061 // need to save the TOC pointer (R2) to handle jumps between shared 10062 // libraries, and that will be stored in the fourth slot. The thread 10063 // identifier (R13) is not affected. 
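  // In slot terms (one pointer-sized slot each): buf[0] = frame address,
  // buf[1] = resume IP, buf[2] = stack pointer, buf[3] = TOC (R2),
  // buf[4] = base pointer.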
10064 10065 // thisMBB: 10066 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 10067 const int64_t TOCOffset = 3 * PVT.getStoreSize(); 10068 const int64_t BPOffset = 4 * PVT.getStoreSize(); 10069 10070 // Prepare IP either in reg. 10071 const TargetRegisterClass *PtrRC = getRegClassFor(PVT); 10072 unsigned LabelReg = MRI.createVirtualRegister(PtrRC); 10073 unsigned BufReg = MI.getOperand(1).getReg(); 10074 10075 if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) { 10076 setUsesTOCBasePtr(*MBB->getParent()); 10077 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) 10078 .addReg(PPC::X2) 10079 .addImm(TOCOffset) 10080 .addReg(BufReg) 10081 .cloneMemRefs(MI); 10082 } 10083 10084 // Naked functions never have a base pointer, and so we use r1. For all 10085 // other functions, this decision must be delayed until during PEI. 10086 unsigned BaseReg; 10087 if (MF->getFunction().hasFnAttribute(Attribute::Naked)) 10088 BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1; 10089 else 10090 BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP; 10091 10092 MIB = BuildMI(*thisMBB, MI, DL, 10093 TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW)) 10094 .addReg(BaseReg) 10095 .addImm(BPOffset) 10096 .addReg(BufReg) 10097 .cloneMemRefs(MI); 10098 10099 // Setup 10100 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); 10101 MIB.addRegMask(TRI->getNoPreservedMask()); 10102 10103 BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); 10104 10105 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup)) 10106 .addMBB(mainMBB); 10107 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB); 10108 10109 thisMBB->addSuccessor(mainMBB, BranchProbability::getZero()); 10110 thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne()); 10111 10112 // mainMBB: 10113 // mainDstReg = 0 10114 MIB = 10115 BuildMI(mainMBB, DL, 10116 TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); 10117 10118 // Store IP 10119 if (Subtarget.isPPC64()) { 10120 MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD)) 10121 .addReg(LabelReg) 10122 .addImm(LabelOffset) 10123 .addReg(BufReg); 10124 } else { 10125 MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW)) 10126 .addReg(LabelReg) 10127 .addImm(LabelOffset) 10128 .addReg(BufReg); 10129 } 10130 MIB.cloneMemRefs(MI); 10131 10132 BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0); 10133 mainMBB->addSuccessor(sinkMBB); 10134 10135 // sinkMBB: 10136 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 10137 TII->get(PPC::PHI), DstReg) 10138 .addReg(mainDstReg).addMBB(mainMBB) 10139 .addReg(restoreDstReg).addMBB(thisMBB); 10140 10141 MI.eraseFromParent(); 10142 return sinkMBB; 10143 } 10144 10145 MachineBasicBlock * 10146 PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, 10147 MachineBasicBlock *MBB) const { 10148 DebugLoc DL = MI.getDebugLoc(); 10149 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 10150 10151 MachineFunction *MF = MBB->getParent(); 10152 MachineRegisterInfo &MRI = MF->getRegInfo(); 10153 10154 MVT PVT = getPointerTy(MF->getDataLayout()); 10155 assert((PVT == MVT::i64 || PVT == MVT::i32) && 10156 "Invalid Pointer Size!"); 10157 10158 const TargetRegisterClass *RC = 10159 (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; 10160 unsigned Tmp = MRI.createVirtualRegister(RC); 10161 // Since FP is only updated here but NOT referenced, it's treated as GPR. 10162 unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; 10163 unsigned SP = (PVT == MVT::i64) ? 
PPC::X1 : PPC::R1; 10164 unsigned BP = 10165 (PVT == MVT::i64) 10166 ? PPC::X30 10167 : (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29 10168 : PPC::R30); 10169 10170 MachineInstrBuilder MIB; 10171 10172 const int64_t LabelOffset = 1 * PVT.getStoreSize(); 10173 const int64_t SPOffset = 2 * PVT.getStoreSize(); 10174 const int64_t TOCOffset = 3 * PVT.getStoreSize(); 10175 const int64_t BPOffset = 4 * PVT.getStoreSize(); 10176 10177 unsigned BufReg = MI.getOperand(0).getReg(); 10178 10179 // Reload FP (the jumped-to function may not have had a 10180 // frame pointer, and if so, then its r31 will be restored 10181 // as necessary). 10182 if (PVT == MVT::i64) { 10183 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP) 10184 .addImm(0) 10185 .addReg(BufReg); 10186 } else { 10187 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP) 10188 .addImm(0) 10189 .addReg(BufReg); 10190 } 10191 MIB.cloneMemRefs(MI); 10192 10193 // Reload IP 10194 if (PVT == MVT::i64) { 10195 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp) 10196 .addImm(LabelOffset) 10197 .addReg(BufReg); 10198 } else { 10199 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp) 10200 .addImm(LabelOffset) 10201 .addReg(BufReg); 10202 } 10203 MIB.cloneMemRefs(MI); 10204 10205 // Reload SP 10206 if (PVT == MVT::i64) { 10207 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP) 10208 .addImm(SPOffset) 10209 .addReg(BufReg); 10210 } else { 10211 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP) 10212 .addImm(SPOffset) 10213 .addReg(BufReg); 10214 } 10215 MIB.cloneMemRefs(MI); 10216 10217 // Reload BP 10218 if (PVT == MVT::i64) { 10219 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP) 10220 .addImm(BPOffset) 10221 .addReg(BufReg); 10222 } else { 10223 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP) 10224 .addImm(BPOffset) 10225 .addReg(BufReg); 10226 } 10227 MIB.cloneMemRefs(MI); 10228 10229 // Reload TOC 10230 if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) { 10231 setUsesTOCBasePtr(*MBB->getParent()); 10232 MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2) 10233 .addImm(TOCOffset) 10234 .addReg(BufReg) 10235 .cloneMemRefs(MI); 10236 } 10237 10238 // Jump 10239 BuildMI(*MBB, MI, DL, 10240 TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp); 10241 BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR)); 10242 10243 MI.eraseFromParent(); 10244 return MBB; 10245 } 10246 10247 MachineBasicBlock * 10248 PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 10249 MachineBasicBlock *BB) const { 10250 if (MI.getOpcode() == TargetOpcode::STACKMAP || 10251 MI.getOpcode() == TargetOpcode::PATCHPOINT) { 10252 if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() && 10253 MI.getOpcode() == TargetOpcode::PATCHPOINT) { 10254 // Call lowering should have added an r2 operand to indicate a dependence 10255 // on the TOC base pointer value. It can't however, because there is no 10256 // way to mark the dependence as implicit there, and so the stackmap code 10257 // will confuse it with a regular operand. Instead, add the dependence 10258 // here. 
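    // The register is added as an implicit use of X2 (isDef = false,
    // isImp = true) so the stackmap/patchpoint lowering does not treat it as
    // a regular operand.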
10259 MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true)); 10260 } 10261 10262 return emitPatchPoint(MI, BB); 10263 } 10264 10265 if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 || 10266 MI.getOpcode() == PPC::EH_SjLj_SetJmp64) { 10267 return emitEHSjLjSetJmp(MI, BB); 10268 } else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 || 10269 MI.getOpcode() == PPC::EH_SjLj_LongJmp64) { 10270 return emitEHSjLjLongJmp(MI, BB); 10271 } 10272 10273 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 10274 10275 // To "insert" these instructions we actually have to insert their 10276 // control-flow patterns. 10277 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10278 MachineFunction::iterator It = ++BB->getIterator(); 10279 10280 MachineFunction *F = BB->getParent(); 10281 10282 if (MI.getOpcode() == PPC::SELECT_CC_I4 || 10283 MI.getOpcode() == PPC::SELECT_CC_I8 || MI.getOpcode() == PPC::SELECT_I4 || 10284 MI.getOpcode() == PPC::SELECT_I8) { 10285 SmallVector<MachineOperand, 2> Cond; 10286 if (MI.getOpcode() == PPC::SELECT_CC_I4 || 10287 MI.getOpcode() == PPC::SELECT_CC_I8) 10288 Cond.push_back(MI.getOperand(4)); 10289 else 10290 Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET)); 10291 Cond.push_back(MI.getOperand(1)); 10292 10293 DebugLoc dl = MI.getDebugLoc(); 10294 TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond, 10295 MI.getOperand(2).getReg(), MI.getOperand(3).getReg()); 10296 } else if (MI.getOpcode() == PPC::SELECT_CC_I4 || 10297 MI.getOpcode() == PPC::SELECT_CC_I8 || 10298 MI.getOpcode() == PPC::SELECT_CC_F4 || 10299 MI.getOpcode() == PPC::SELECT_CC_F8 || 10300 MI.getOpcode() == PPC::SELECT_CC_F16 || 10301 MI.getOpcode() == PPC::SELECT_CC_QFRC || 10302 MI.getOpcode() == PPC::SELECT_CC_QSRC || 10303 MI.getOpcode() == PPC::SELECT_CC_QBRC || 10304 MI.getOpcode() == PPC::SELECT_CC_VRRC || 10305 MI.getOpcode() == PPC::SELECT_CC_VSFRC || 10306 MI.getOpcode() == PPC::SELECT_CC_VSSRC || 10307 MI.getOpcode() == PPC::SELECT_CC_VSRC || 10308 MI.getOpcode() == PPC::SELECT_CC_SPE4 || 10309 MI.getOpcode() == PPC::SELECT_CC_SPE || 10310 MI.getOpcode() == PPC::SELECT_I4 || 10311 MI.getOpcode() == PPC::SELECT_I8 || 10312 MI.getOpcode() == PPC::SELECT_F4 || 10313 MI.getOpcode() == PPC::SELECT_F8 || 10314 MI.getOpcode() == PPC::SELECT_F16 || 10315 MI.getOpcode() == PPC::SELECT_QFRC || 10316 MI.getOpcode() == PPC::SELECT_QSRC || 10317 MI.getOpcode() == PPC::SELECT_QBRC || 10318 MI.getOpcode() == PPC::SELECT_SPE || 10319 MI.getOpcode() == PPC::SELECT_SPE4 || 10320 MI.getOpcode() == PPC::SELECT_VRRC || 10321 MI.getOpcode() == PPC::SELECT_VSFRC || 10322 MI.getOpcode() == PPC::SELECT_VSSRC || 10323 MI.getOpcode() == PPC::SELECT_VSRC) { 10324 // The incoming instruction knows the destination vreg to set, the 10325 // condition code register to branch on, the true/false values to 10326 // select between, and a branch opcode to use. 10327 10328 // thisMBB: 10329 // ... 10330 // TrueVal = ... 10331 // cmpTY ccX, r1, r2 10332 // bCC copy1MBB 10333 // fallthrough --> copy0MBB 10334 MachineBasicBlock *thisMBB = BB; 10335 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 10336 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 10337 DebugLoc dl = MI.getDebugLoc(); 10338 F->insert(It, copy0MBB); 10339 F->insert(It, sinkMBB); 10340 10341 // Transfer the remainder of BB and its successor edges to sinkMBB. 
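    // (Everything after the select pseudo is moved into sinkMBB, so the PHI
    // built below is the first instruction of sinkMBB and merges the true
    // value coming from thisMBB with the false value coming from copy0MBB.)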
10342 sinkMBB->splice(sinkMBB->begin(), BB, 10343 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10344 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 10345 10346 // Next, add the true and fallthrough blocks as its successors. 10347 BB->addSuccessor(copy0MBB); 10348 BB->addSuccessor(sinkMBB); 10349 10350 if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 || 10351 MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || 10352 MI.getOpcode() == PPC::SELECT_F16 || 10353 MI.getOpcode() == PPC::SELECT_SPE4 || 10354 MI.getOpcode() == PPC::SELECT_SPE || 10355 MI.getOpcode() == PPC::SELECT_QFRC || 10356 MI.getOpcode() == PPC::SELECT_QSRC || 10357 MI.getOpcode() == PPC::SELECT_QBRC || 10358 MI.getOpcode() == PPC::SELECT_VRRC || 10359 MI.getOpcode() == PPC::SELECT_VSFRC || 10360 MI.getOpcode() == PPC::SELECT_VSSRC || 10361 MI.getOpcode() == PPC::SELECT_VSRC) { 10362 BuildMI(BB, dl, TII->get(PPC::BC)) 10363 .addReg(MI.getOperand(1).getReg()) 10364 .addMBB(sinkMBB); 10365 } else { 10366 unsigned SelectPred = MI.getOperand(4).getImm(); 10367 BuildMI(BB, dl, TII->get(PPC::BCC)) 10368 .addImm(SelectPred) 10369 .addReg(MI.getOperand(1).getReg()) 10370 .addMBB(sinkMBB); 10371 } 10372 10373 // copy0MBB: 10374 // %FalseValue = ... 10375 // # fallthrough to sinkMBB 10376 BB = copy0MBB; 10377 10378 // Update machine-CFG edges 10379 BB->addSuccessor(sinkMBB); 10380 10381 // sinkMBB: 10382 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 10383 // ... 10384 BB = sinkMBB; 10385 BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg()) 10386 .addReg(MI.getOperand(3).getReg()) 10387 .addMBB(copy0MBB) 10388 .addReg(MI.getOperand(2).getReg()) 10389 .addMBB(thisMBB); 10390 } else if (MI.getOpcode() == PPC::ReadTB) { 10391 // To read the 64-bit time-base register on a 32-bit target, we read the 10392 // two halves. Should the counter have wrapped while it was being read, we 10393 // need to try again. 10394 // ... 10395 // readLoop: 10396 // mfspr Rx,TBU # load from TBU 10397 // mfspr Ry,TB # load from TB 10398 // mfspr Rz,TBU # load from TBU 10399 // cmpw crX,Rx,Rz # check if 'old'='new' 10400 // bne readLoop # branch if they're not equal 10401 // ... 10402 10403 MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB); 10404 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 10405 DebugLoc dl = MI.getDebugLoc(); 10406 F->insert(It, readMBB); 10407 F->insert(It, sinkMBB); 10408 10409 // Transfer the remainder of BB and its successor edges to sinkMBB. 
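    // (SPR 269 is TBU and SPR 268 is TB; these are the immediates given to
    // the MFSPR instructions built below.)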
10410 sinkMBB->splice(sinkMBB->begin(), BB, 10411 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10412 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 10413 10414 BB->addSuccessor(readMBB); 10415 BB = readMBB; 10416 10417 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10418 unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); 10419 unsigned LoReg = MI.getOperand(0).getReg(); 10420 unsigned HiReg = MI.getOperand(1).getReg(); 10421 10422 BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269); 10423 BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268); 10424 BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269); 10425 10426 unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); 10427 10428 BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg) 10429 .addReg(HiReg) 10430 .addReg(ReadAgainReg); 10431 BuildMI(BB, dl, TII->get(PPC::BCC)) 10432 .addImm(PPC::PRED_NE) 10433 .addReg(CmpReg) 10434 .addMBB(readMBB); 10435 10436 BB->addSuccessor(readMBB); 10437 BB->addSuccessor(sinkMBB); 10438 } else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8) 10439 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4); 10440 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16) 10441 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4); 10442 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32) 10443 BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4); 10444 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64) 10445 BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8); 10446 10447 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8) 10448 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND); 10449 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16) 10450 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND); 10451 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32) 10452 BB = EmitAtomicBinary(MI, BB, 4, PPC::AND); 10453 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64) 10454 BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8); 10455 10456 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8) 10457 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR); 10458 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16) 10459 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR); 10460 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32) 10461 BB = EmitAtomicBinary(MI, BB, 4, PPC::OR); 10462 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64) 10463 BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8); 10464 10465 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8) 10466 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR); 10467 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16) 10468 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR); 10469 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32) 10470 BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR); 10471 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64) 10472 BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8); 10473 10474 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8) 10475 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND); 10476 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16) 10477 BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND); 10478 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32) 10479 BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND); 10480 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64) 10481 BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8); 10482 10483 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8) 10484 BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF); 10485 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16) 10486 
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF); 10487 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32) 10488 BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF); 10489 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64) 10490 BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8); 10491 10492 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8) 10493 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GE); 10494 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16) 10495 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GE); 10496 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32) 10497 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GE); 10498 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64) 10499 BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GE); 10500 10501 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8) 10502 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LE); 10503 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16) 10504 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LE); 10505 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32) 10506 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LE); 10507 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64) 10508 BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LE); 10509 10510 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8) 10511 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GE); 10512 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16) 10513 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GE); 10514 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32) 10515 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GE); 10516 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64) 10517 BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GE); 10518 10519 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8) 10520 BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LE); 10521 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16) 10522 BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LE); 10523 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32) 10524 BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LE); 10525 else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64) 10526 BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LE); 10527 10528 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8) 10529 BB = EmitPartwordAtomicBinary(MI, BB, true, 0); 10530 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16) 10531 BB = EmitPartwordAtomicBinary(MI, BB, false, 0); 10532 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32) 10533 BB = EmitAtomicBinary(MI, BB, 4, 0); 10534 else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64) 10535 BB = EmitAtomicBinary(MI, BB, 8, 0); 10536 else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 || 10537 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 || 10538 (Subtarget.hasPartwordAtomics() && 10539 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) || 10540 (Subtarget.hasPartwordAtomics() && 10541 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) { 10542 bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64; 10543 10544 auto LoadMnemonic = PPC::LDARX; 10545 auto StoreMnemonic = PPC::STDCX; 10546 switch (MI.getOpcode()) { 10547 default: 10548 llvm_unreachable("Compare and swap of unknown size"); 10549 case PPC::ATOMIC_CMP_SWAP_I8: 10550 LoadMnemonic = PPC::LBARX; 10551 StoreMnemonic = PPC::STBCX; 10552 
assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); 10553 break; 10554 case PPC::ATOMIC_CMP_SWAP_I16: 10555 LoadMnemonic = PPC::LHARX; 10556 StoreMnemonic = PPC::STHCX; 10557 assert(Subtarget.hasPartwordAtomics() && "No support partword atomics."); 10558 break; 10559 case PPC::ATOMIC_CMP_SWAP_I32: 10560 LoadMnemonic = PPC::LWARX; 10561 StoreMnemonic = PPC::STWCX; 10562 break; 10563 case PPC::ATOMIC_CMP_SWAP_I64: 10564 LoadMnemonic = PPC::LDARX; 10565 StoreMnemonic = PPC::STDCX; 10566 break; 10567 } 10568 unsigned dest = MI.getOperand(0).getReg(); 10569 unsigned ptrA = MI.getOperand(1).getReg(); 10570 unsigned ptrB = MI.getOperand(2).getReg(); 10571 unsigned oldval = MI.getOperand(3).getReg(); 10572 unsigned newval = MI.getOperand(4).getReg(); 10573 DebugLoc dl = MI.getDebugLoc(); 10574 10575 MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); 10576 MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); 10577 MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); 10578 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); 10579 F->insert(It, loop1MBB); 10580 F->insert(It, loop2MBB); 10581 F->insert(It, midMBB); 10582 F->insert(It, exitMBB); 10583 exitMBB->splice(exitMBB->begin(), BB, 10584 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10585 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 10586 10587 // thisMBB: 10588 // ... 10589 // fallthrough --> loopMBB 10590 BB->addSuccessor(loop1MBB); 10591 10592 // loop1MBB: 10593 // l[bhwd]arx dest, ptr 10594 // cmp[wd] dest, oldval 10595 // bne- midMBB 10596 // loop2MBB: 10597 // st[bhwd]cx. newval, ptr 10598 // bne- loopMBB 10599 // b exitBB 10600 // midMBB: 10601 // st[bhwd]cx. dest, ptr 10602 // exitBB: 10603 BB = loop1MBB; 10604 BuildMI(BB, dl, TII->get(LoadMnemonic), dest).addReg(ptrA).addReg(ptrB); 10605 BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0) 10606 .addReg(oldval) 10607 .addReg(dest); 10608 BuildMI(BB, dl, TII->get(PPC::BCC)) 10609 .addImm(PPC::PRED_NE) 10610 .addReg(PPC::CR0) 10611 .addMBB(midMBB); 10612 BB->addSuccessor(loop2MBB); 10613 BB->addSuccessor(midMBB); 10614 10615 BB = loop2MBB; 10616 BuildMI(BB, dl, TII->get(StoreMnemonic)) 10617 .addReg(newval) 10618 .addReg(ptrA) 10619 .addReg(ptrB); 10620 BuildMI(BB, dl, TII->get(PPC::BCC)) 10621 .addImm(PPC::PRED_NE) 10622 .addReg(PPC::CR0) 10623 .addMBB(loop1MBB); 10624 BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); 10625 BB->addSuccessor(loop1MBB); 10626 BB->addSuccessor(exitMBB); 10627 10628 BB = midMBB; 10629 BuildMI(BB, dl, TII->get(StoreMnemonic)) 10630 .addReg(dest) 10631 .addReg(ptrA) 10632 .addReg(ptrB); 10633 BB->addSuccessor(exitMBB); 10634 10635 // exitMBB: 10636 // ... 10637 BB = exitMBB; 10638 } else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 || 10639 MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) { 10640 // We must use 64-bit registers for addresses when targeting 64-bit, 10641 // since we're actually doing arithmetic on them. Other registers 10642 // can be 32-bit. 
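    // (Concretely: the word-aligned address PtrReg computed below lives in a
    // 64-bit register class on 64-bit targets, while the shift amounts,
    // masks, and shifted old/new values stay in 32-bit GPRs.)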
10643 bool is64bit = Subtarget.isPPC64(); 10644 bool isLittleEndian = Subtarget.isLittleEndian(); 10645 bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; 10646 10647 unsigned dest = MI.getOperand(0).getReg(); 10648 unsigned ptrA = MI.getOperand(1).getReg(); 10649 unsigned ptrB = MI.getOperand(2).getReg(); 10650 unsigned oldval = MI.getOperand(3).getReg(); 10651 unsigned newval = MI.getOperand(4).getReg(); 10652 DebugLoc dl = MI.getDebugLoc(); 10653 10654 MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); 10655 MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); 10656 MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); 10657 MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); 10658 F->insert(It, loop1MBB); 10659 F->insert(It, loop2MBB); 10660 F->insert(It, midMBB); 10661 F->insert(It, exitMBB); 10662 exitMBB->splice(exitMBB->begin(), BB, 10663 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10664 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 10665 10666 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10667 const TargetRegisterClass *RC = 10668 is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; 10669 const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; 10670 10671 unsigned PtrReg = RegInfo.createVirtualRegister(RC); 10672 unsigned Shift1Reg = RegInfo.createVirtualRegister(GPRC); 10673 unsigned ShiftReg = 10674 isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC); 10675 unsigned NewVal2Reg = RegInfo.createVirtualRegister(GPRC); 10676 unsigned NewVal3Reg = RegInfo.createVirtualRegister(GPRC); 10677 unsigned OldVal2Reg = RegInfo.createVirtualRegister(GPRC); 10678 unsigned OldVal3Reg = RegInfo.createVirtualRegister(GPRC); 10679 unsigned MaskReg = RegInfo.createVirtualRegister(GPRC); 10680 unsigned Mask2Reg = RegInfo.createVirtualRegister(GPRC); 10681 unsigned Mask3Reg = RegInfo.createVirtualRegister(GPRC); 10682 unsigned Tmp2Reg = RegInfo.createVirtualRegister(GPRC); 10683 unsigned Tmp4Reg = RegInfo.createVirtualRegister(GPRC); 10684 unsigned TmpDestReg = RegInfo.createVirtualRegister(GPRC); 10685 unsigned Ptr1Reg; 10686 unsigned TmpReg = RegInfo.createVirtualRegister(GPRC); 10687 unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; 10688 // thisMBB: 10689 // ... 10690 // fallthrough --> loopMBB 10691 BB->addSuccessor(loop1MBB); 10692 10693 // The 4-byte load must be aligned, while a char or short may be 10694 // anywhere in the word. Hence all this nasty bookkeeping code. 10695 // add ptr1, ptrA, ptrB [copy if ptrA==0] 10696 // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] 10697 // xori shift, shift1, 24 [16] 10698 // rlwinm ptr, ptr1, 0, 0, 29 10699 // slw newval2, newval, shift 10700 // slw oldval2, oldval,shift 10701 // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] 10702 // slw mask, mask2, shift 10703 // and newval3, newval2, mask 10704 // and oldval3, oldval2, mask 10705 // loop1MBB: 10706 // lwarx tmpDest, ptr 10707 // and tmp, tmpDest, mask 10708 // cmpw tmp, oldval3 10709 // bne- midMBB 10710 // loop2MBB: 10711 // andc tmp2, tmpDest, mask 10712 // or tmp4, tmp2, newval3 10713 // stwcx. tmp4, ptr 10714 // bne- loop1MBB 10715 // b exitBB 10716 // midMBB: 10717 // stwcx. tmpDest, ptr 10718 // exitBB: 10719 // srw dest, tmpDest, shift 10720 if (ptrA != ZeroReg) { 10721 Ptr1Reg = RegInfo.createVirtualRegister(RC); 10722 BuildMI(BB, dl, TII->get(is64bit ? 
PPC::ADD8 : PPC::ADD4), Ptr1Reg) 10723 .addReg(ptrA) 10724 .addReg(ptrB); 10725 } else { 10726 Ptr1Reg = ptrB; 10727 } 10728 10729 // We need use 32-bit subregister to avoid mismatch register class in 64-bit 10730 // mode. 10731 BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg) 10732 .addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0) 10733 .addImm(3) 10734 .addImm(27) 10735 .addImm(is8bit ? 28 : 27); 10736 if (!isLittleEndian) 10737 BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg) 10738 .addReg(Shift1Reg) 10739 .addImm(is8bit ? 24 : 16); 10740 if (is64bit) 10741 BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) 10742 .addReg(Ptr1Reg) 10743 .addImm(0) 10744 .addImm(61); 10745 else 10746 BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) 10747 .addReg(Ptr1Reg) 10748 .addImm(0) 10749 .addImm(0) 10750 .addImm(29); 10751 BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg) 10752 .addReg(newval) 10753 .addReg(ShiftReg); 10754 BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg) 10755 .addReg(oldval) 10756 .addReg(ShiftReg); 10757 if (is8bit) 10758 BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); 10759 else { 10760 BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); 10761 BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg) 10762 .addReg(Mask3Reg) 10763 .addImm(65535); 10764 } 10765 BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) 10766 .addReg(Mask2Reg) 10767 .addReg(ShiftReg); 10768 BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg) 10769 .addReg(NewVal2Reg) 10770 .addReg(MaskReg); 10771 BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg) 10772 .addReg(OldVal2Reg) 10773 .addReg(MaskReg); 10774 10775 BB = loop1MBB; 10776 BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) 10777 .addReg(ZeroReg) 10778 .addReg(PtrReg); 10779 BuildMI(BB, dl, TII->get(PPC::AND), TmpReg) 10780 .addReg(TmpDestReg) 10781 .addReg(MaskReg); 10782 BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0) 10783 .addReg(TmpReg) 10784 .addReg(OldVal3Reg); 10785 BuildMI(BB, dl, TII->get(PPC::BCC)) 10786 .addImm(PPC::PRED_NE) 10787 .addReg(PPC::CR0) 10788 .addMBB(midMBB); 10789 BB->addSuccessor(loop2MBB); 10790 BB->addSuccessor(midMBB); 10791 10792 BB = loop2MBB; 10793 BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg) 10794 .addReg(TmpDestReg) 10795 .addReg(MaskReg); 10796 BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg) 10797 .addReg(Tmp2Reg) 10798 .addReg(NewVal3Reg); 10799 BuildMI(BB, dl, TII->get(PPC::STWCX)) 10800 .addReg(Tmp4Reg) 10801 .addReg(ZeroReg) 10802 .addReg(PtrReg); 10803 BuildMI(BB, dl, TII->get(PPC::BCC)) 10804 .addImm(PPC::PRED_NE) 10805 .addReg(PPC::CR0) 10806 .addMBB(loop1MBB); 10807 BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); 10808 BB->addSuccessor(loop1MBB); 10809 BB->addSuccessor(exitMBB); 10810 10811 BB = midMBB; 10812 BuildMI(BB, dl, TII->get(PPC::STWCX)) 10813 .addReg(TmpDestReg) 10814 .addReg(ZeroReg) 10815 .addReg(PtrReg); 10816 BB->addSuccessor(exitMBB); 10817 10818 // exitMBB: 10819 // ... 10820 BB = exitMBB; 10821 BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest) 10822 .addReg(TmpReg) 10823 .addReg(ShiftReg); 10824 } else if (MI.getOpcode() == PPC::FADDrtz) { 10825 // This pseudo performs an FADD with rounding mode temporarily forced 10826 // to round-to-zero. We emit this via custom inserter since the FPSCR 10827 // is not modeled at the SelectionDAG level. 
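    // (The sequence below saves the FPSCR with MFFS, forces the RN field,
    // FPSCR bits 30:31, to 0b01 (round toward zero) via MTFSB1 31 and
    // MTFSB0 30, performs the FADD, and finally writes the saved value back
    // to restore the original rounding mode.)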
10828 unsigned Dest = MI.getOperand(0).getReg(); 10829 unsigned Src1 = MI.getOperand(1).getReg(); 10830 unsigned Src2 = MI.getOperand(2).getReg(); 10831 DebugLoc dl = MI.getDebugLoc(); 10832 10833 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10834 unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); 10835 10836 // Save FPSCR value. 10837 BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg); 10838 10839 // Set rounding mode to round-to-zero. 10840 BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31); 10841 BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30); 10842 10843 // Perform addition. 10844 BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2); 10845 10846 // Restore FPSCR value. 10847 BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg); 10848 } else if (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || 10849 MI.getOpcode() == PPC::ANDIo_1_GT_BIT || 10850 MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || 10851 MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) { 10852 unsigned Opcode = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || 10853 MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) 10854 ? PPC::ANDIo8 10855 : PPC::ANDIo; 10856 bool isEQ = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || 10857 MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8); 10858 10859 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10860 unsigned Dest = RegInfo.createVirtualRegister( 10861 Opcode == PPC::ANDIo ? &PPC::GPRCRegClass : &PPC::G8RCRegClass); 10862 10863 DebugLoc dl = MI.getDebugLoc(); 10864 BuildMI(*BB, MI, dl, TII->get(Opcode), Dest) 10865 .addReg(MI.getOperand(1).getReg()) 10866 .addImm(1); 10867 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), 10868 MI.getOperand(0).getReg()) 10869 .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT); 10870 } else if (MI.getOpcode() == PPC::TCHECK_RET) { 10871 DebugLoc Dl = MI.getDebugLoc(); 10872 MachineRegisterInfo &RegInfo = F->getRegInfo(); 10873 unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); 10874 BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg); 10875 return BB; 10876 } else { 10877 llvm_unreachable("Unexpected instr type to insert"); 10878 } 10879 10880 MI.eraseFromParent(); // The pseudo instruction is gone now. 10881 return BB; 10882 } 10883 10884 //===----------------------------------------------------------------------===// 10885 // Target Optimization Hooks 10886 //===----------------------------------------------------------------------===// 10887 10888 static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) { 10889 // For the estimates, convergence is quadratic, so we essentially double the 10890 // number of digits correct after every iteration. For both FRE and FRSQRTE, 10891 // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(), 10892 // this is 2^-14. IEEE float has 23 digits and double has 52 digits. 10893 int RefinementSteps = Subtarget.hasRecipPrec() ? 
1 : 3;
  if (VT.getScalarType() == MVT::f64)
    RefinementSteps++;
  return RefinementSteps;
}

SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                           int Enabled, int &RefinementSteps,
                                           bool &UseOneConstNR,
                                           bool Reciprocal) const {
  EVT VT = Operand.getValueType();
  if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
      (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);

    UseOneConstNR = true;
    return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
  }
  return SDValue();
}

SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
                                            int Enabled,
                                            int &RefinementSteps) const {
  EVT VT = Operand.getValueType();
  if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
      (VT == MVT::f64 && Subtarget.hasFRE()) ||
      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
    return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
  }
  return SDValue();
}

unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
  // Note: This functionality is used only when unsafe-fp-math is enabled, and
  // on cores with reciprocal estimates (which are used when unsafe-fp-math is
  // enabled for division), this functionality is redundant with the default
  // combiner logic (once the division -> reciprocal/multiply transformation
  // has taken place). As a result, this matters more for older cores than for
  // newer ones.

  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
  // reciprocal if there are two or more FDIVs (for embedded cores with only
  // one FP pipeline) or three or more FDIVs (for generic OOO cores).
  switch (Subtarget.getDarwinDirective()) {
  default:
    return 3;
  case PPC::DIR_440:
  case PPC::DIR_A2:
  case PPC::DIR_E500:
  case PPC::DIR_E500mc:
  case PPC::DIR_E5500:
    return 2;
  }
}

// isConsecutiveLSLoc needs to work even if all adds have not yet been
// collapsed, and so we need to look through chains of them.
static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
                                      int64_t& Offset, SelectionDAG &DAG) {
  if (DAG.isBaseWithConstantOffset(Loc)) {
    Base = Loc.getOperand(0);
    Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();

    // The base might itself be a base plus an offset, and if so, accumulate
    // that as well.
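    // For example, for (add (add X, 16), 8) the recursion leaves Base == X
    // and Offset == 24.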
10969 getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG); 10970 } 10971 } 10972 10973 static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base, 10974 unsigned Bytes, int Dist, 10975 SelectionDAG &DAG) { 10976 if (VT.getSizeInBits() / 8 != Bytes) 10977 return false; 10978 10979 SDValue BaseLoc = Base->getBasePtr(); 10980 if (Loc.getOpcode() == ISD::FrameIndex) { 10981 if (BaseLoc.getOpcode() != ISD::FrameIndex) 10982 return false; 10983 const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 10984 int FI = cast<FrameIndexSDNode>(Loc)->getIndex(); 10985 int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex(); 10986 int FS = MFI.getObjectSize(FI); 10987 int BFS = MFI.getObjectSize(BFI); 10988 if (FS != BFS || FS != (int)Bytes) return false; 10989 return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes); 10990 } 10991 10992 SDValue Base1 = Loc, Base2 = BaseLoc; 10993 int64_t Offset1 = 0, Offset2 = 0; 10994 getBaseWithConstantOffset(Loc, Base1, Offset1, DAG); 10995 getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG); 10996 if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes)) 10997 return true; 10998 10999 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11000 const GlobalValue *GV1 = nullptr; 11001 const GlobalValue *GV2 = nullptr; 11002 Offset1 = 0; 11003 Offset2 = 0; 11004 bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1); 11005 bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); 11006 if (isGA1 && isGA2 && GV1 == GV2) 11007 return Offset1 == (Offset2 + Dist*Bytes); 11008 return false; 11009 } 11010 11011 // Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does 11012 // not enforce equality of the chain operands. 11013 static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, 11014 unsigned Bytes, int Dist, 11015 SelectionDAG &DAG) { 11016 if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) { 11017 EVT VT = LS->getMemoryVT(); 11018 SDValue Loc = LS->getBasePtr(); 11019 return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG); 11020 } 11021 11022 if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { 11023 EVT VT; 11024 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 11025 default: return false; 11026 case Intrinsic::ppc_qpx_qvlfd: 11027 case Intrinsic::ppc_qpx_qvlfda: 11028 VT = MVT::v4f64; 11029 break; 11030 case Intrinsic::ppc_qpx_qvlfs: 11031 case Intrinsic::ppc_qpx_qvlfsa: 11032 VT = MVT::v4f32; 11033 break; 11034 case Intrinsic::ppc_qpx_qvlfcd: 11035 case Intrinsic::ppc_qpx_qvlfcda: 11036 VT = MVT::v2f64; 11037 break; 11038 case Intrinsic::ppc_qpx_qvlfcs: 11039 case Intrinsic::ppc_qpx_qvlfcsa: 11040 VT = MVT::v2f32; 11041 break; 11042 case Intrinsic::ppc_qpx_qvlfiwa: 11043 case Intrinsic::ppc_qpx_qvlfiwz: 11044 case Intrinsic::ppc_altivec_lvx: 11045 case Intrinsic::ppc_altivec_lvxl: 11046 case Intrinsic::ppc_vsx_lxvw4x: 11047 case Intrinsic::ppc_vsx_lxvw4x_be: 11048 VT = MVT::v4i32; 11049 break; 11050 case Intrinsic::ppc_vsx_lxvd2x: 11051 case Intrinsic::ppc_vsx_lxvd2x_be: 11052 VT = MVT::v2f64; 11053 break; 11054 case Intrinsic::ppc_altivec_lvebx: 11055 VT = MVT::i8; 11056 break; 11057 case Intrinsic::ppc_altivec_lvehx: 11058 VT = MVT::i16; 11059 break; 11060 case Intrinsic::ppc_altivec_lvewx: 11061 VT = MVT::i32; 11062 break; 11063 } 11064 11065 return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG); 11066 } 11067 11068 if (N->getOpcode() == ISD::INTRINSIC_VOID) { 11069 EVT VT; 11070 switch 
(cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    default: return false;
    case Intrinsic::ppc_qpx_qvstfd:
    case Intrinsic::ppc_qpx_qvstfda:
      VT = MVT::v4f64;
      break;
    case Intrinsic::ppc_qpx_qvstfs:
    case Intrinsic::ppc_qpx_qvstfsa:
      VT = MVT::v4f32;
      break;
    case Intrinsic::ppc_qpx_qvstfcd:
    case Intrinsic::ppc_qpx_qvstfcda:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvstfcs:
    case Intrinsic::ppc_qpx_qvstfcsa:
      VT = MVT::v2f32;
      break;
    case Intrinsic::ppc_qpx_qvstfiw:
    case Intrinsic::ppc_qpx_qvstfiwa:
    case Intrinsic::ppc_altivec_stvx:
    case Intrinsic::ppc_altivec_stvxl:
    case Intrinsic::ppc_vsx_stxvw4x:
      VT = MVT::v4i32;
      break;
    case Intrinsic::ppc_vsx_stxvd2x:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_vsx_stxvw4x_be:
      VT = MVT::v4i32;
      break;
    case Intrinsic::ppc_vsx_stxvd2x_be:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_altivec_stvebx:
      VT = MVT::i8;
      break;
    case Intrinsic::ppc_altivec_stvehx:
      VT = MVT::i16;
      break;
    case Intrinsic::ppc_altivec_stvewx:
      VT = MVT::i32;
      break;
    }

    return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);
  }

  return false;
}

// Return true if there is a nearby consecutive load to the one provided
// (regardless of alignment). We search up and down the chain, looking through
// token factors and other loads (but nothing else). As a result, a true result
// indicates that it is safe to create a new consecutive load adjacent to the
// load provided.
static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
  SDValue Chain = LD->getChain();
  EVT VT = LD->getMemoryVT();

  SmallSet<SDNode *, 16> LoadRoots;
  SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
  SmallSet<SDNode *, 16> Visited;

  // First, search up the chain, branching to follow all token-factor operands.
  // If we find a consecutive load, then we're done; otherwise, record all
  // nodes just above the top-level loads and token factors.
  while (!Queue.empty()) {
    SDNode *ChainNext = Queue.pop_back_val();
    if (!Visited.insert(ChainNext).second)
      continue;

    if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
      if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
        return true;

      if (!Visited.count(ChainLD->getChain().getNode()))
        Queue.push_back(ChainLD->getChain().getNode());
    } else if (ChainNext->getOpcode() == ISD::TokenFactor) {
      for (const SDUse &O : ChainNext->ops())
        if (!Visited.count(O.getNode()))
          Queue.push_back(O.getNode());
    } else
      LoadRoots.insert(ChainNext);
  }

  // Second, search down the chain, starting from the top-level nodes recorded
  // in the first phase. These top-level nodes are the nodes just above all
  // loads and token factors. Starting with their uses, recursively look through
  // all loads (just the chain uses) and token factors to find a consecutive
  // load.
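  // (Put differently, the upward walk stops at the first node on each path
  // that is neither a memory operation nor a TokenFactor; those nodes are the
  // "load roots" whose chain users the downward walk below then explores.)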
11161 Visited.clear(); 11162 Queue.clear(); 11163 11164 for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(), 11165 IE = LoadRoots.end(); I != IE; ++I) { 11166 Queue.push_back(*I); 11167 11168 while (!Queue.empty()) { 11169 SDNode *LoadRoot = Queue.pop_back_val(); 11170 if (!Visited.insert(LoadRoot).second) 11171 continue; 11172 11173 if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot)) 11174 if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) 11175 return true; 11176 11177 for (SDNode::use_iterator UI = LoadRoot->use_begin(), 11178 UE = LoadRoot->use_end(); UI != UE; ++UI) 11179 if (((isa<MemSDNode>(*UI) && 11180 cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) || 11181 UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI)) 11182 Queue.push_back(*UI); 11183 } 11184 } 11185 11186 return false; 11187 } 11188 11189 /// This function is called when we have proved that a SETCC node can be replaced 11190 /// by subtraction (and other supporting instructions) so that the result of 11191 /// comparison is kept in a GPR instead of CR. This function is purely for 11192 /// codegen purposes and has some flags to guide the codegen process. 11193 static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement, 11194 bool Swap, SDLoc &DL, SelectionDAG &DAG) { 11195 assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); 11196 11197 // Zero extend the operands to the largest legal integer. Originally, they 11198 // must be of a strictly smaller size. 11199 auto Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0), 11200 DAG.getConstant(Size, DL, MVT::i32)); 11201 auto Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1), 11202 DAG.getConstant(Size, DL, MVT::i32)); 11203 11204 // Swap if needed. Depends on the condition code. 11205 if (Swap) 11206 std::swap(Op0, Op1); 11207 11208 // Subtract extended integers. 11209 auto SubNode = DAG.getNode(ISD::SUB, DL, MVT::i64, Op0, Op1); 11210 11211 // Move the sign bit to the least significant position and zero out the rest. 11212 // Now the least significant bit carries the result of original comparison. 11213 auto Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, SubNode, 11214 DAG.getConstant(Size - 1, DL, MVT::i32)); 11215 auto Final = Shifted; 11216 11217 // Complement the result if needed. Based on the condition code. 11218 if (Complement) 11219 Final = DAG.getNode(ISD::XOR, DL, MVT::i64, Shifted, 11220 DAG.getConstant(1, DL, MVT::i64)); 11221 11222 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final); 11223 } 11224 11225 SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N, 11226 DAGCombinerInfo &DCI) const { 11227 assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected."); 11228 11229 SelectionDAG &DAG = DCI.DAG; 11230 SDLoc DL(N); 11231 11232 // Size of integers being compared has a critical role in the following 11233 // analysis, so we prefer to do this when all types are legal. 
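  // As a rough illustration (not the exact node sequence produced), a SETULT
  // of two i32 values A and B with i64 as the largest legal type becomes:
  //   t0 = zero_extend A to i64
  //   t1 = zero_extend B to i64
  //   t2 = sub t0, t1       ; negative exactly when A < B (unsigned)
  //   t3 = srl t2, 63       ; move the sign bit down to bit 0
  //   r  = truncate t3 to i1
  // SETULE and SETUGT swap the operands first, and SETULE and SETUGE
  // complement the resulting bit, matching the Swap/Complement flags passed
  // to generateEquivalentSub.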
11234 if (!DCI.isAfterLegalizeDAG()) 11235 return SDValue(); 11236 11237 // If all users of SETCC extend its value to a legal integer type 11238 // then we replace SETCC with a subtraction 11239 for (SDNode::use_iterator UI = N->use_begin(), 11240 UE = N->use_end(); UI != UE; ++UI) { 11241 if (UI->getOpcode() != ISD::ZERO_EXTEND) 11242 return SDValue(); 11243 } 11244 11245 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 11246 auto OpSize = N->getOperand(0).getValueSizeInBits(); 11247 11248 unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits(); 11249 11250 if (OpSize < Size) { 11251 switch (CC) { 11252 default: break; 11253 case ISD::SETULT: 11254 return generateEquivalentSub(N, Size, false, false, DL, DAG); 11255 case ISD::SETULE: 11256 return generateEquivalentSub(N, Size, true, true, DL, DAG); 11257 case ISD::SETUGT: 11258 return generateEquivalentSub(N, Size, false, true, DL, DAG); 11259 case ISD::SETUGE: 11260 return generateEquivalentSub(N, Size, true, false, DL, DAG); 11261 } 11262 } 11263 11264 return SDValue(); 11265 } 11266 11267 SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, 11268 DAGCombinerInfo &DCI) const { 11269 SelectionDAG &DAG = DCI.DAG; 11270 SDLoc dl(N); 11271 11272 assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits"); 11273 // If we're tracking CR bits, we need to be careful that we don't have: 11274 // trunc(binary-ops(zext(x), zext(y))) 11275 // or 11276 // trunc(binary-ops(binary-ops(zext(x), zext(y)), ...) 11277 // such that we're unnecessarily moving things into GPRs when it would be 11278 // better to keep them in CR bits. 11279 11280 // Note that trunc here can be an actual i1 trunc, or can be the effective 11281 // truncation that comes from a setcc or select_cc. 11282 if (N->getOpcode() == ISD::TRUNCATE && 11283 N->getValueType(0) != MVT::i1) 11284 return SDValue(); 11285 11286 if (N->getOperand(0).getValueType() != MVT::i32 && 11287 N->getOperand(0).getValueType() != MVT::i64) 11288 return SDValue(); 11289 11290 if (N->getOpcode() == ISD::SETCC || 11291 N->getOpcode() == ISD::SELECT_CC) { 11292 // If we're looking at a comparison, then we need to make sure that the 11293 // high bits (all except for the first) don't matter the result. 11294 ISD::CondCode CC = 11295 cast<CondCodeSDNode>(N->getOperand( 11296 N->getOpcode() == ISD::SETCC ? 2 : 4))->get(); 11297 unsigned OpBits = N->getOperand(0).getValueSizeInBits(); 11298 11299 if (ISD::isSignedIntSetCC(CC)) { 11300 if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits || 11301 DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits) 11302 return SDValue(); 11303 } else if (ISD::isUnsignedIntSetCC(CC)) { 11304 if (!DAG.MaskedValueIsZero(N->getOperand(0), 11305 APInt::getHighBitsSet(OpBits, OpBits-1)) || 11306 !DAG.MaskedValueIsZero(N->getOperand(1), 11307 APInt::getHighBitsSet(OpBits, OpBits-1))) 11308 return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI) 11309 : SDValue()); 11310 } else { 11311 // This is neither a signed nor an unsigned comparison, just make sure 11312 // that the high bits are equal. 11313 KnownBits Op1Known = DAG.computeKnownBits(N->getOperand(0)); 11314 KnownBits Op2Known = DAG.computeKnownBits(N->getOperand(1)); 11315 11316 // We don't really care about what is known about the first bit (if 11317 // anything), so clear it in all masks prior to comparing them. 
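      // (In practice only the equality predicates SETEQ/SETNE reach this
      // branch, since the signed and unsigned orderings were handled above.)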
11318 Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0); 11319 Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0); 11320 11321 if (Op1Known.Zero != Op2Known.Zero || Op1Known.One != Op2Known.One) 11322 return SDValue(); 11323 } 11324 } 11325 11326 // We now know that the higher-order bits are irrelevant, we just need to 11327 // make sure that all of the intermediate operations are bit operations, and 11328 // all inputs are extensions. 11329 if (N->getOperand(0).getOpcode() != ISD::AND && 11330 N->getOperand(0).getOpcode() != ISD::OR && 11331 N->getOperand(0).getOpcode() != ISD::XOR && 11332 N->getOperand(0).getOpcode() != ISD::SELECT && 11333 N->getOperand(0).getOpcode() != ISD::SELECT_CC && 11334 N->getOperand(0).getOpcode() != ISD::TRUNCATE && 11335 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND && 11336 N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && 11337 N->getOperand(0).getOpcode() != ISD::ANY_EXTEND) 11338 return SDValue(); 11339 11340 if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) && 11341 N->getOperand(1).getOpcode() != ISD::AND && 11342 N->getOperand(1).getOpcode() != ISD::OR && 11343 N->getOperand(1).getOpcode() != ISD::XOR && 11344 N->getOperand(1).getOpcode() != ISD::SELECT && 11345 N->getOperand(1).getOpcode() != ISD::SELECT_CC && 11346 N->getOperand(1).getOpcode() != ISD::TRUNCATE && 11347 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND && 11348 N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND && 11349 N->getOperand(1).getOpcode() != ISD::ANY_EXTEND) 11350 return SDValue(); 11351 11352 SmallVector<SDValue, 4> Inputs; 11353 SmallVector<SDValue, 8> BinOps, PromOps; 11354 SmallPtrSet<SDNode *, 16> Visited; 11355 11356 for (unsigned i = 0; i < 2; ++i) { 11357 if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND || 11358 N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND || 11359 N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) && 11360 N->getOperand(i).getOperand(0).getValueType() == MVT::i1) || 11361 isa<ConstantSDNode>(N->getOperand(i))) 11362 Inputs.push_back(N->getOperand(i)); 11363 else 11364 BinOps.push_back(N->getOperand(i)); 11365 11366 if (N->getOpcode() == ISD::TRUNCATE) 11367 break; 11368 } 11369 11370 // Visit all inputs, collect all binary operations (and, or, xor and 11371 // select) that are all fed by extensions. 11372 while (!BinOps.empty()) { 11373 SDValue BinOp = BinOps.back(); 11374 BinOps.pop_back(); 11375 11376 if (!Visited.insert(BinOp.getNode()).second) 11377 continue; 11378 11379 PromOps.push_back(BinOp); 11380 11381 for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { 11382 // The condition of the select is not promoted. 
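      // (For SELECT that is operand 0; for SELECT_CC only the true/false
      // values in operands 2 and 3 are promoted, so the compared operands and
      // the condition code are skipped as well.)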
11383 if (BinOp.getOpcode() == ISD::SELECT && i == 0) 11384 continue; 11385 if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) 11386 continue; 11387 11388 if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || 11389 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || 11390 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) && 11391 BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) || 11392 isa<ConstantSDNode>(BinOp.getOperand(i))) { 11393 Inputs.push_back(BinOp.getOperand(i)); 11394 } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || 11395 BinOp.getOperand(i).getOpcode() == ISD::OR || 11396 BinOp.getOperand(i).getOpcode() == ISD::XOR || 11397 BinOp.getOperand(i).getOpcode() == ISD::SELECT || 11398 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC || 11399 BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || 11400 BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || 11401 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || 11402 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) { 11403 BinOps.push_back(BinOp.getOperand(i)); 11404 } else { 11405 // We have an input that is not an extension or another binary 11406 // operation; we'll abort this transformation. 11407 return SDValue(); 11408 } 11409 } 11410 } 11411 11412 // Make sure that this is a self-contained cluster of operations (which 11413 // is not quite the same thing as saying that everything has only one 11414 // use). 11415 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 11416 if (isa<ConstantSDNode>(Inputs[i])) 11417 continue; 11418 11419 for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), 11420 UE = Inputs[i].getNode()->use_end(); 11421 UI != UE; ++UI) { 11422 SDNode *User = *UI; 11423 if (User != N && !Visited.count(User)) 11424 return SDValue(); 11425 11426 // Make sure that we're not going to promote the non-output-value 11427 // operand(s) or SELECT or SELECT_CC. 11428 // FIXME: Although we could sometimes handle this, and it does occur in 11429 // practice that one of the condition inputs to the select is also one of 11430 // the outputs, we currently can't deal with this. 11431 if (User->getOpcode() == ISD::SELECT) { 11432 if (User->getOperand(0) == Inputs[i]) 11433 return SDValue(); 11434 } else if (User->getOpcode() == ISD::SELECT_CC) { 11435 if (User->getOperand(0) == Inputs[i] || 11436 User->getOperand(1) == Inputs[i]) 11437 return SDValue(); 11438 } 11439 } 11440 } 11441 11442 for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { 11443 for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), 11444 UE = PromOps[i].getNode()->use_end(); 11445 UI != UE; ++UI) { 11446 SDNode *User = *UI; 11447 if (User != N && !Visited.count(User)) 11448 return SDValue(); 11449 11450 // Make sure that we're not going to promote the non-output-value 11451 // operand(s) or SELECT or SELECT_CC. 11452 // FIXME: Although we could sometimes handle this, and it does occur in 11453 // practice that one of the condition inputs to the select is also one of 11454 // the outputs, we currently can't deal with this. 11455 if (User->getOpcode() == ISD::SELECT) { 11456 if (User->getOperand(0) == PromOps[i]) 11457 return SDValue(); 11458 } else if (User->getOpcode() == ISD::SELECT_CC) { 11459 if (User->getOperand(0) == PromOps[i] || 11460 User->getOperand(1) == PromOps[i]) 11461 return SDValue(); 11462 } 11463 } 11464 } 11465 11466 // Replace all inputs with the extension operand. 
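  // (Each sign/zero/any-extend of an i1 value is RAUW'd with the underlying
  // i1 value itself; constant inputs are skipped here and are truncated to i1
  // later, when the operations that use them are rebuilt.)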
11467 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 11468 // Constants may have users outside the cluster of to-be-promoted nodes, 11469 // and so we need to replace those as we do the promotions. 11470 if (isa<ConstantSDNode>(Inputs[i])) 11471 continue; 11472 else 11473 DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); 11474 } 11475 11476 std::list<HandleSDNode> PromOpHandles; 11477 for (auto &PromOp : PromOps) 11478 PromOpHandles.emplace_back(PromOp); 11479 11480 // Replace all operations (these are all the same, but have a different 11481 // (i1) return type). DAG.getNode will validate that the types of 11482 // a binary operator match, so go through the list in reverse so that 11483 // we've likely promoted both operands first. Any intermediate truncations or 11484 // extensions disappear. 11485 while (!PromOpHandles.empty()) { 11486 SDValue PromOp = PromOpHandles.back().getValue(); 11487 PromOpHandles.pop_back(); 11488 11489 if (PromOp.getOpcode() == ISD::TRUNCATE || 11490 PromOp.getOpcode() == ISD::SIGN_EXTEND || 11491 PromOp.getOpcode() == ISD::ZERO_EXTEND || 11492 PromOp.getOpcode() == ISD::ANY_EXTEND) { 11493 if (!isa<ConstantSDNode>(PromOp.getOperand(0)) && 11494 PromOp.getOperand(0).getValueType() != MVT::i1) { 11495 // The operand is not yet ready (see comment below). 11496 PromOpHandles.emplace_front(PromOp); 11497 continue; 11498 } 11499 11500 SDValue RepValue = PromOp.getOperand(0); 11501 if (isa<ConstantSDNode>(RepValue)) 11502 RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue); 11503 11504 DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue); 11505 continue; 11506 } 11507 11508 unsigned C; 11509 switch (PromOp.getOpcode()) { 11510 default: C = 0; break; 11511 case ISD::SELECT: C = 1; break; 11512 case ISD::SELECT_CC: C = 2; break; 11513 } 11514 11515 if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) && 11516 PromOp.getOperand(C).getValueType() != MVT::i1) || 11517 (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) && 11518 PromOp.getOperand(C+1).getValueType() != MVT::i1)) { 11519 // The to-be-promoted operands of this node have not yet been 11520 // promoted (this should be rare because we're going through the 11521 // list backward, but if one of the operands has several users in 11522 // this cluster of to-be-promoted nodes, it is possible). 11523 PromOpHandles.emplace_front(PromOp); 11524 continue; 11525 } 11526 11527 SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(), 11528 PromOp.getNode()->op_end()); 11529 11530 // If there are any constant inputs, make sure they're replaced now. 11531 for (unsigned i = 0; i < 2; ++i) 11532 if (isa<ConstantSDNode>(Ops[C+i])) 11533 Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]); 11534 11535 DAG.ReplaceAllUsesOfValueWith(PromOp, 11536 DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops)); 11537 } 11538 11539 // Now we're left with the initial truncation itself. 11540 if (N->getOpcode() == ISD::TRUNCATE) 11541 return N->getOperand(0); 11542 11543 // Otherwise, this is a comparison. The operands to be compared have just 11544 // changed type (to i1), but everything else is the same. 11545 return SDValue(N, 0); 11546 } 11547 11548 SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, 11549 DAGCombinerInfo &DCI) const { 11550 SelectionDAG &DAG = DCI.DAG; 11551 SDLoc dl(N); 11552 11553 // If we're tracking CR bits, we need to be careful that we don't have: 11554 // zext(binary-ops(trunc(x), trunc(y))) 11555 // or 11556 // zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) 
11557 // such that we're unnecessarily moving things into CR bits that can more 11558 // efficiently stay in GPRs. Note that if we're not certain that the high 11559 // bits are set as required by the final extension, we still may need to do 11560 // some masking to get the proper behavior. 11561 11562 // This same functionality is important on PPC64 when dealing with 11563 // 32-to-64-bit extensions; these occur often when 32-bit values are used as 11564 // the return values of functions. Because it is so similar, it is handled 11565 // here as well. 11566 11567 if (N->getValueType(0) != MVT::i32 && 11568 N->getValueType(0) != MVT::i64) 11569 return SDValue(); 11570 11571 if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) || 11572 (N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64()))) 11573 return SDValue(); 11574 11575 if (N->getOperand(0).getOpcode() != ISD::AND && 11576 N->getOperand(0).getOpcode() != ISD::OR && 11577 N->getOperand(0).getOpcode() != ISD::XOR && 11578 N->getOperand(0).getOpcode() != ISD::SELECT && 11579 N->getOperand(0).getOpcode() != ISD::SELECT_CC) 11580 return SDValue(); 11581 11582 SmallVector<SDValue, 4> Inputs; 11583 SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps; 11584 SmallPtrSet<SDNode *, 16> Visited; 11585 11586 // Visit all inputs, collect all binary operations (and, or, xor and 11587 // select) that are all fed by truncations. 11588 while (!BinOps.empty()) { 11589 SDValue BinOp = BinOps.back(); 11590 BinOps.pop_back(); 11591 11592 if (!Visited.insert(BinOp.getNode()).second) 11593 continue; 11594 11595 PromOps.push_back(BinOp); 11596 11597 for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { 11598 // The condition of the select is not promoted. 11599 if (BinOp.getOpcode() == ISD::SELECT && i == 0) 11600 continue; 11601 if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) 11602 continue; 11603 11604 if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || 11605 isa<ConstantSDNode>(BinOp.getOperand(i))) { 11606 Inputs.push_back(BinOp.getOperand(i)); 11607 } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || 11608 BinOp.getOperand(i).getOpcode() == ISD::OR || 11609 BinOp.getOperand(i).getOpcode() == ISD::XOR || 11610 BinOp.getOperand(i).getOpcode() == ISD::SELECT || 11611 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) { 11612 BinOps.push_back(BinOp.getOperand(i)); 11613 } else { 11614 // We have an input that is not a truncation or another binary 11615 // operation; we'll abort this transformation. 11616 return SDValue(); 11617 } 11618 } 11619 } 11620 11621 // The operands of a select that must be truncated when the select is 11622 // promoted because the operand is actually part of the to-be-promoted set. 11623 DenseMap<SDNode *, EVT> SelectTruncOp[2]; 11624 11625 // Make sure that this is a self-contained cluster of operations (which 11626 // is not quite the same thing as saying that everything has only one 11627 // use). 11628 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 11629 if (isa<ConstantSDNode>(Inputs[i])) 11630 continue; 11631 11632 for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), 11633 UE = Inputs[i].getNode()->use_end(); 11634 UI != UE; ++UI) { 11635 SDNode *User = *UI; 11636 if (User != N && !Visited.count(User)) 11637 return SDValue(); 11638 11639 // If we're going to promote the non-output-value operand(s) or SELECT or 11640 // SELECT_CC, record them for truncation. 
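      // (Unlike DAGCombineTruncBoolExt above, which gives up in this
      // situation, here the affected select/compare operands can keep their
      // original type: they are noted in SelectTruncOp and truncated back
      // when the node is rebuilt.)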
11641 if (User->getOpcode() == ISD::SELECT) { 11642 if (User->getOperand(0) == Inputs[i]) 11643 SelectTruncOp[0].insert(std::make_pair(User, 11644 User->getOperand(0).getValueType())); 11645 } else if (User->getOpcode() == ISD::SELECT_CC) { 11646 if (User->getOperand(0) == Inputs[i]) 11647 SelectTruncOp[0].insert(std::make_pair(User, 11648 User->getOperand(0).getValueType())); 11649 if (User->getOperand(1) == Inputs[i]) 11650 SelectTruncOp[1].insert(std::make_pair(User, 11651 User->getOperand(1).getValueType())); 11652 } 11653 } 11654 } 11655 11656 for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { 11657 for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), 11658 UE = PromOps[i].getNode()->use_end(); 11659 UI != UE; ++UI) { 11660 SDNode *User = *UI; 11661 if (User != N && !Visited.count(User)) 11662 return SDValue(); 11663 11664 // If we're going to promote the non-output-value operand(s) or SELECT or 11665 // SELECT_CC, record them for truncation. 11666 if (User->getOpcode() == ISD::SELECT) { 11667 if (User->getOperand(0) == PromOps[i]) 11668 SelectTruncOp[0].insert(std::make_pair(User, 11669 User->getOperand(0).getValueType())); 11670 } else if (User->getOpcode() == ISD::SELECT_CC) { 11671 if (User->getOperand(0) == PromOps[i]) 11672 SelectTruncOp[0].insert(std::make_pair(User, 11673 User->getOperand(0).getValueType())); 11674 if (User->getOperand(1) == PromOps[i]) 11675 SelectTruncOp[1].insert(std::make_pair(User, 11676 User->getOperand(1).getValueType())); 11677 } 11678 } 11679 } 11680 11681 unsigned PromBits = N->getOperand(0).getValueSizeInBits(); 11682 bool ReallyNeedsExt = false; 11683 if (N->getOpcode() != ISD::ANY_EXTEND) { 11684 // If all of the inputs are not already sign/zero extended, then 11685 // we'll still need to do that at the end. 11686 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 11687 if (isa<ConstantSDNode>(Inputs[i])) 11688 continue; 11689 11690 unsigned OpBits = 11691 Inputs[i].getOperand(0).getValueSizeInBits(); 11692 assert(PromBits < OpBits && "Truncation not to a smaller bit count?"); 11693 11694 if ((N->getOpcode() == ISD::ZERO_EXTEND && 11695 !DAG.MaskedValueIsZero(Inputs[i].getOperand(0), 11696 APInt::getHighBitsSet(OpBits, 11697 OpBits-PromBits))) || 11698 (N->getOpcode() == ISD::SIGN_EXTEND && 11699 DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) < 11700 (OpBits-(PromBits-1)))) { 11701 ReallyNeedsExt = true; 11702 break; 11703 } 11704 } 11705 } 11706 11707 // Replace all inputs, either with the truncation operand, or a 11708 // truncation or extension to the final output type. 11709 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { 11710 // Constant inputs need to be replaced with the to-be-promoted nodes that 11711 // use them because they might have users outside of the cluster of 11712 // promoted nodes. 
11713 if (isa<ConstantSDNode>(Inputs[i])) 11714 continue; 11715 11716 SDValue InSrc = Inputs[i].getOperand(0); 11717 if (Inputs[i].getValueType() == N->getValueType(0)) 11718 DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc); 11719 else if (N->getOpcode() == ISD::SIGN_EXTEND) 11720 DAG.ReplaceAllUsesOfValueWith(Inputs[i], 11721 DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0))); 11722 else if (N->getOpcode() == ISD::ZERO_EXTEND) 11723 DAG.ReplaceAllUsesOfValueWith(Inputs[i], 11724 DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0))); 11725 else 11726 DAG.ReplaceAllUsesOfValueWith(Inputs[i], 11727 DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0))); 11728 } 11729 11730 std::list<HandleSDNode> PromOpHandles; 11731 for (auto &PromOp : PromOps) 11732 PromOpHandles.emplace_back(PromOp); 11733 11734 // Replace all operations (these are all the same, but have a different 11735 // (promoted) return type). DAG.getNode will validate that the types of 11736 // a binary operator match, so go through the list in reverse so that 11737 // we've likely promoted both operands first. 11738 while (!PromOpHandles.empty()) { 11739 SDValue PromOp = PromOpHandles.back().getValue(); 11740 PromOpHandles.pop_back(); 11741 11742 unsigned C; 11743 switch (PromOp.getOpcode()) { 11744 default: C = 0; break; 11745 case ISD::SELECT: C = 1; break; 11746 case ISD::SELECT_CC: C = 2; break; 11747 } 11748 11749 if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) && 11750 PromOp.getOperand(C).getValueType() != N->getValueType(0)) || 11751 (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) && 11752 PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) { 11753 // The to-be-promoted operands of this node have not yet been 11754 // promoted (this should be rare because we're going through the 11755 // list backward, but if one of the operands has several users in 11756 // this cluster of to-be-promoted nodes, it is possible). 11757 PromOpHandles.emplace_front(PromOp); 11758 continue; 11759 } 11760 11761 // For SELECT and SELECT_CC nodes, we do a similar check for any 11762 // to-be-promoted comparison inputs. 11763 if (PromOp.getOpcode() == ISD::SELECT || 11764 PromOp.getOpcode() == ISD::SELECT_CC) { 11765 if ((SelectTruncOp[0].count(PromOp.getNode()) && 11766 PromOp.getOperand(0).getValueType() != N->getValueType(0)) || 11767 (SelectTruncOp[1].count(PromOp.getNode()) && 11768 PromOp.getOperand(1).getValueType() != N->getValueType(0))) { 11769 PromOpHandles.emplace_front(PromOp); 11770 continue; 11771 } 11772 } 11773 11774 SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(), 11775 PromOp.getNode()->op_end()); 11776 11777 // If this node has constant inputs, then they'll need to be promoted here. 11778 for (unsigned i = 0; i < 2; ++i) { 11779 if (!isa<ConstantSDNode>(Ops[C+i])) 11780 continue; 11781 if (Ops[C+i].getValueType() == N->getValueType(0)) 11782 continue; 11783 11784 if (N->getOpcode() == ISD::SIGN_EXTEND) 11785 Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); 11786 else if (N->getOpcode() == ISD::ZERO_EXTEND) 11787 Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); 11788 else 11789 Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); 11790 } 11791 11792 // If we've promoted the comparison inputs of a SELECT or SELECT_CC, 11793 // truncate them again to the original value type. 
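  // For instance, a SELECT_CC whose comparison operands were swept into the
  // promoted cluster still has to compare at the width it was originally
  // written with, so explicit TRUNCATE nodes are inserted on operands 0 and 1
  // below, using the types recorded in SelectTruncOp. (Illustrative case.)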
11794 if (PromOp.getOpcode() == ISD::SELECT || 11795 PromOp.getOpcode() == ISD::SELECT_CC) { 11796 auto SI0 = SelectTruncOp[0].find(PromOp.getNode()); 11797 if (SI0 != SelectTruncOp[0].end()) 11798 Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]); 11799 auto SI1 = SelectTruncOp[1].find(PromOp.getNode()); 11800 if (SI1 != SelectTruncOp[1].end()) 11801 Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]); 11802 } 11803 11804 DAG.ReplaceAllUsesOfValueWith(PromOp, 11805 DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops)); 11806 } 11807 11808 // Now we're left with the initial extension itself. 11809 if (!ReallyNeedsExt) 11810 return N->getOperand(0); 11811 11812 // To zero extend, just mask off everything except for the first bit (in the 11813 // i1 case). 11814 if (N->getOpcode() == ISD::ZERO_EXTEND) 11815 return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0), 11816 DAG.getConstant(APInt::getLowBitsSet( 11817 N->getValueSizeInBits(0), PromBits), 11818 dl, N->getValueType(0))); 11819 11820 assert(N->getOpcode() == ISD::SIGN_EXTEND && 11821 "Invalid extension type"); 11822 EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout()); 11823 SDValue ShiftCst = 11824 DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy); 11825 return DAG.getNode( 11826 ISD::SRA, dl, N->getValueType(0), 11827 DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst), 11828 ShiftCst); 11829 } 11830 11831 SDValue PPCTargetLowering::combineSetCC(SDNode *N, 11832 DAGCombinerInfo &DCI) const { 11833 assert(N->getOpcode() == ISD::SETCC && 11834 "Should be called with a SETCC node"); 11835 11836 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 11837 if (CC == ISD::SETNE || CC == ISD::SETEQ) { 11838 SDValue LHS = N->getOperand(0); 11839 SDValue RHS = N->getOperand(1); 11840 11841 // If there is a '0 - y' pattern, canonicalize the pattern to the RHS. 11842 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) && 11843 LHS.hasOneUse()) 11844 std::swap(LHS, RHS); 11845 11846 // x == 0-y --> x+y == 0 11847 // x != 0-y --> x+y != 0 11848 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && 11849 RHS.hasOneUse()) { 11850 SDLoc DL(N); 11851 SelectionDAG &DAG = DCI.DAG; 11852 EVT VT = N->getValueType(0); 11853 EVT OpVT = LHS.getValueType(); 11854 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1)); 11855 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); 11856 } 11857 } 11858 11859 return DAGCombineTruncBoolExt(N, DCI); 11860 } 11861 11862 // Is this an extending load from an f32 to an f64? 11863 static bool isFPExtLoad(SDValue Op) { 11864 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode())) 11865 return LD->getExtensionType() == ISD::EXTLOAD && 11866 Op.getValueType() == MVT::f64; 11867 return false; 11868 } 11869 11870 /// Reduces the number of fp-to-int conversion when building a vector. 11871 /// 11872 /// If this vector is built out of floating to integer conversions, 11873 /// transform it to a vector built out of floating point values followed by a 11874 /// single floating to integer conversion of the vector. 11875 /// Namely (build_vector (fptosi $A), (fptosi $B), ...) 
11876 /// becomes (fptosi (build_vector ($A, $B, ...))) 11877 SDValue PPCTargetLowering:: 11878 combineElementTruncationToVectorTruncation(SDNode *N, 11879 DAGCombinerInfo &DCI) const { 11880 assert(N->getOpcode() == ISD::BUILD_VECTOR && 11881 "Should be called with a BUILD_VECTOR node"); 11882 11883 SelectionDAG &DAG = DCI.DAG; 11884 SDLoc dl(N); 11885 11886 SDValue FirstInput = N->getOperand(0); 11887 assert(FirstInput.getOpcode() == PPCISD::MFVSR && 11888 "The input operand must be an fp-to-int conversion."); 11889 11890 // This combine happens after legalization so the fp_to_[su]i nodes are 11891 // already converted to PPCSISD nodes. 11892 unsigned FirstConversion = FirstInput.getOperand(0).getOpcode(); 11893 if (FirstConversion == PPCISD::FCTIDZ || 11894 FirstConversion == PPCISD::FCTIDUZ || 11895 FirstConversion == PPCISD::FCTIWZ || 11896 FirstConversion == PPCISD::FCTIWUZ) { 11897 bool IsSplat = true; 11898 bool Is32Bit = FirstConversion == PPCISD::FCTIWZ || 11899 FirstConversion == PPCISD::FCTIWUZ; 11900 EVT SrcVT = FirstInput.getOperand(0).getValueType(); 11901 SmallVector<SDValue, 4> Ops; 11902 EVT TargetVT = N->getValueType(0); 11903 for (int i = 0, e = N->getNumOperands(); i < e; ++i) { 11904 SDValue NextOp = N->getOperand(i); 11905 if (NextOp.getOpcode() != PPCISD::MFVSR) 11906 return SDValue(); 11907 unsigned NextConversion = NextOp.getOperand(0).getOpcode(); 11908 if (NextConversion != FirstConversion) 11909 return SDValue(); 11910 // If we are converting to 32-bit integers, we need to add an FP_ROUND. 11911 // This is not valid if the input was originally double precision. It is 11912 // also not profitable to do unless this is an extending load in which 11913 // case doing this combine will allow us to combine consecutive loads. 11914 if (Is32Bit && !isFPExtLoad(NextOp.getOperand(0).getOperand(0))) 11915 return SDValue(); 11916 if (N->getOperand(i) != FirstInput) 11917 IsSplat = false; 11918 } 11919 11920 // If this is a splat, we leave it as-is since there will be only a single 11921 // fp-to-int conversion followed by a splat of the integer. This is better 11922 // for 32-bit and smaller ints and neutral for 64-bit ints. 11923 if (IsSplat) 11924 return SDValue(); 11925 11926 // Now that we know we have the right type of node, get its operands 11927 for (int i = 0, e = N->getNumOperands(); i < e; ++i) { 11928 SDValue In = N->getOperand(i).getOperand(0); 11929 if (Is32Bit) { 11930 // For 32-bit values, we need to add an FP_ROUND node (if we made it 11931 // here, we know that all inputs are extending loads so this is safe). 11932 if (In.isUndef()) 11933 Ops.push_back(DAG.getUNDEF(SrcVT)); 11934 else { 11935 SDValue Trunc = DAG.getNode(ISD::FP_ROUND, dl, 11936 MVT::f32, In.getOperand(0), 11937 DAG.getIntPtrConstant(1, dl)); 11938 Ops.push_back(Trunc); 11939 } 11940 } else 11941 Ops.push_back(In.isUndef() ? DAG.getUNDEF(SrcVT) : In.getOperand(0)); 11942 } 11943 11944 unsigned Opcode; 11945 if (FirstConversion == PPCISD::FCTIDZ || 11946 FirstConversion == PPCISD::FCTIWZ) 11947 Opcode = ISD::FP_TO_SINT; 11948 else 11949 Opcode = ISD::FP_TO_UINT; 11950 11951 EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32; 11952 SDValue BV = DAG.getBuildVector(NewVT, dl, Ops); 11953 return DAG.getNode(Opcode, dl, TargetVT, BV); 11954 } 11955 return SDValue(); 11956 } 11957 11958 /// Reduce the number of loads when building a vector. 11959 /// 11960 /// Building a vector out of multiple loads can be converted to a load 11961 /// of the vector type if the loads are consecutive. 
If the loads are 11962 /// consecutive but in descending order, a shuffle is added at the end 11963 /// to reorder the vector. 11964 static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) { 11965 assert(N->getOpcode() == ISD::BUILD_VECTOR && 11966 "Should be called with a BUILD_VECTOR node"); 11967 11968 SDLoc dl(N); 11969 bool InputsAreConsecutiveLoads = true; 11970 bool InputsAreReverseConsecutive = true; 11971 unsigned ElemSize = N->getValueType(0).getScalarSizeInBits() / 8; 11972 SDValue FirstInput = N->getOperand(0); 11973 bool IsRoundOfExtLoad = false; 11974 11975 if (FirstInput.getOpcode() == ISD::FP_ROUND && 11976 FirstInput.getOperand(0).getOpcode() == ISD::LOAD) { 11977 LoadSDNode *LD = dyn_cast<LoadSDNode>(FirstInput.getOperand(0)); 11978 IsRoundOfExtLoad = LD->getExtensionType() == ISD::EXTLOAD; 11979 } 11980 // Not a build vector of (possibly fp_rounded) loads. 11981 if ((!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) || 11982 N->getNumOperands() == 1) 11983 return SDValue(); 11984 11985 for (int i = 1, e = N->getNumOperands(); i < e; ++i) { 11986 // If any inputs are fp_round(extload), they all must be. 11987 if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND) 11988 return SDValue(); 11989 11990 SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(i).getOperand(0) : 11991 N->getOperand(i); 11992 if (NextInput.getOpcode() != ISD::LOAD) 11993 return SDValue(); 11994 11995 SDValue PreviousInput = 11996 IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1); 11997 LoadSDNode *LD1 = dyn_cast<LoadSDNode>(PreviousInput); 11998 LoadSDNode *LD2 = dyn_cast<LoadSDNode>(NextInput); 11999 12000 // If any inputs are fp_round(extload), they all must be. 12001 if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD) 12002 return SDValue(); 12003 12004 if (!isConsecutiveLS(LD2, LD1, ElemSize, 1, DAG)) 12005 InputsAreConsecutiveLoads = false; 12006 if (!isConsecutiveLS(LD1, LD2, ElemSize, 1, DAG)) 12007 InputsAreReverseConsecutive = false; 12008 12009 // Exit early if the loads are neither consecutive nor reverse consecutive. 12010 if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive) 12011 return SDValue(); 12012 } 12013 12014 assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) && 12015 "The loads cannot be both consecutive and reverse consecutive."); 12016 12017 SDValue FirstLoadOp = 12018 IsRoundOfExtLoad ? FirstInput.getOperand(0) : FirstInput; 12019 SDValue LastLoadOp = 12020 IsRoundOfExtLoad ? 
N->getOperand(N->getNumOperands()-1).getOperand(0) : 12021 N->getOperand(N->getNumOperands()-1); 12022 12023 LoadSDNode *LD1 = dyn_cast<LoadSDNode>(FirstLoadOp); 12024 LoadSDNode *LDL = dyn_cast<LoadSDNode>(LastLoadOp); 12025 if (InputsAreConsecutiveLoads) { 12026 assert(LD1 && "Input needs to be a LoadSDNode."); 12027 return DAG.getLoad(N->getValueType(0), dl, LD1->getChain(), 12028 LD1->getBasePtr(), LD1->getPointerInfo(), 12029 LD1->getAlignment()); 12030 } 12031 if (InputsAreReverseConsecutive) { 12032 assert(LDL && "Input needs to be a LoadSDNode."); 12033 SDValue Load = DAG.getLoad(N->getValueType(0), dl, LDL->getChain(), 12034 LDL->getBasePtr(), LDL->getPointerInfo(), 12035 LDL->getAlignment()); 12036 SmallVector<int, 16> Ops; 12037 for (int i = N->getNumOperands() - 1; i >= 0; i--) 12038 Ops.push_back(i); 12039 12040 return DAG.getVectorShuffle(N->getValueType(0), dl, Load, 12041 DAG.getUNDEF(N->getValueType(0)), Ops); 12042 } 12043 return SDValue(); 12044 } 12045 12046 // This function adds the required vector_shuffle needed to get 12047 // the elements of the vector extract in the correct position 12048 // as specified by the CorrectElems encoding. 12049 static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG, 12050 SDValue Input, uint64_t Elems, 12051 uint64_t CorrectElems) { 12052 SDLoc dl(N); 12053 12054 unsigned NumElems = Input.getValueType().getVectorNumElements(); 12055 SmallVector<int, 16> ShuffleMask(NumElems, -1); 12056 12057 // Knowing the element indices being extracted from the original 12058 // vector and the order in which they're being inserted, just put 12059 // them at element indices required for the instruction. 12060 for (unsigned i = 0; i < N->getNumOperands(); i++) { 12061 if (DAG.getDataLayout().isLittleEndian()) 12062 ShuffleMask[CorrectElems & 0xF] = Elems & 0xF; 12063 else 12064 ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4; 12065 CorrectElems = CorrectElems >> 8; 12066 Elems = Elems >> 8; 12067 } 12068 12069 SDValue Shuffle = 12070 DAG.getVectorShuffle(Input.getValueType(), dl, Input, 12071 DAG.getUNDEF(Input.getValueType()), ShuffleMask); 12072 12073 EVT Ty = N->getValueType(0); 12074 SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle); 12075 return BV; 12076 } 12077 12078 // Look for build vector patterns where input operands come from sign 12079 // extended vector_extract elements of specific indices. If the correct indices 12080 // aren't used, add a vector shuffle to fix up the indices and create a new 12081 // PPCISD::SExtVElems node which selects the vector sign extend instructions 12082 // during instruction selection. 12083 static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) { 12084 // This array encodes the indices that the vector sign extend instructions 12085 // extract from when extending from one type to another for both BE and LE. 12086 // The right nibble of each byte corresponds to the LE indices, 12087 // and the left nibble of each byte corresponds to the BE indices.
12088 // For example: 0x3074B8FC byte->word 12089 // For LE: the allowed indices are: 0x0,0x4,0x8,0xC 12090 // For BE: the allowed indices are: 0x3,0x7,0xB,0xF 12091 // For example: 0x000070F8 byte->double word 12092 // For LE: the allowed indices are: 0x0,0x8 12093 // For BE: the allowed indices are: 0x7,0xF 12094 uint64_t TargetElems[] = { 12095 0x3074B8FC, // b->w 12096 0x000070F8, // b->d 12097 0x10325476, // h->w 12098 0x00003074, // h->d 12099 0x00001032, // w->d 12100 }; 12101 12102 uint64_t Elems = 0; 12103 int Index; 12104 SDValue Input; 12105 12106 auto isSExtOfVecExtract = [&](SDValue Op) -> bool { 12107 if (!Op) 12108 return false; 12109 if (Op.getOpcode() != ISD::SIGN_EXTEND && 12110 Op.getOpcode() != ISD::SIGN_EXTEND_INREG) 12111 return false; 12112 12113 // A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value 12114 // of the right width. 12115 SDValue Extract = Op.getOperand(0); 12116 if (Extract.getOpcode() == ISD::ANY_EXTEND) 12117 Extract = Extract.getOperand(0); 12118 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 12119 return false; 12120 12121 ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1)); 12122 if (!ExtOp) 12123 return false; 12124 12125 Index = ExtOp->getZExtValue(); 12126 if (Input && Input != Extract.getOperand(0)) 12127 return false; 12128 12129 if (!Input) 12130 Input = Extract.getOperand(0); 12131 12132 Elems = Elems << 8; 12133 Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4; 12134 Elems |= Index; 12135 12136 return true; 12137 }; 12138 12139 // If the build vector operands aren't sign extended vector extracts, 12140 // of the same input vector, then return. 12141 for (unsigned i = 0; i < N->getNumOperands(); i++) { 12142 if (!isSExtOfVecExtract(N->getOperand(i))) { 12143 return SDValue(); 12144 } 12145 } 12146 12147 // If the vector extract indicies are not correct, add the appropriate 12148 // vector_shuffle. 12149 int TgtElemArrayIdx; 12150 int InputSize = Input.getValueType().getScalarSizeInBits(); 12151 int OutputSize = N->getValueType(0).getScalarSizeInBits(); 12152 if (InputSize + OutputSize == 40) 12153 TgtElemArrayIdx = 0; 12154 else if (InputSize + OutputSize == 72) 12155 TgtElemArrayIdx = 1; 12156 else if (InputSize + OutputSize == 48) 12157 TgtElemArrayIdx = 2; 12158 else if (InputSize + OutputSize == 80) 12159 TgtElemArrayIdx = 3; 12160 else if (InputSize + OutputSize == 96) 12161 TgtElemArrayIdx = 4; 12162 else 12163 return SDValue(); 12164 12165 uint64_t CorrectElems = TargetElems[TgtElemArrayIdx]; 12166 CorrectElems = DAG.getDataLayout().isLittleEndian() 12167 ? CorrectElems & 0x0F0F0F0F0F0F0F0F 12168 : CorrectElems & 0xF0F0F0F0F0F0F0F0; 12169 if (Elems != CorrectElems) { 12170 return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems); 12171 } 12172 12173 // Regular lowering will catch cases where a shuffle is not needed. 12174 return SDValue(); 12175 } 12176 12177 SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, 12178 DAGCombinerInfo &DCI) const { 12179 assert(N->getOpcode() == ISD::BUILD_VECTOR && 12180 "Should be called with a BUILD_VECTOR node"); 12181 12182 SelectionDAG &DAG = DCI.DAG; 12183 SDLoc dl(N); 12184 12185 if (!Subtarget.hasVSX()) 12186 return SDValue(); 12187 12188 // The target independent DAG combiner will leave a build_vector of 12189 // float-to-int conversions intact. We can generate MUCH better code for 12190 // a float-to-int conversion of a vector of floats. 
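  // Roughly, a pattern such as
  //   (build_vector (mfvsr (fctiwz $A)), (mfvsr (fctiwz $B)), ...)
  // is rewritten below into a single vector conversion,
  //   (fp_to_sint (build_vector $A, $B, ...)),
  // with the unsigned variants handled analogously.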
12191 SDValue FirstInput = N->getOperand(0); 12192 if (FirstInput.getOpcode() == PPCISD::MFVSR) { 12193 SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI); 12194 if (Reduced) 12195 return Reduced; 12196 } 12197 12198 // If we're building a vector out of consecutive loads, just load that 12199 // vector type. 12200 SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG); 12201 if (Reduced) 12202 return Reduced; 12203 12204 // If we're building a vector out of extended elements from another vector 12205 // we have P9 vector integer extend instructions. The code assumes legal 12206 // input types (i.e. it can't handle things like v4i16) so do not run before 12207 // legalization. 12208 if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize()) { 12209 Reduced = combineBVOfVecSExt(N, DAG); 12210 if (Reduced) 12211 return Reduced; 12212 } 12213 12214 12215 if (N->getValueType(0) != MVT::v2f64) 12216 return SDValue(); 12217 12218 // Looking for: 12219 // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1)) 12220 if (FirstInput.getOpcode() != ISD::SINT_TO_FP && 12221 FirstInput.getOpcode() != ISD::UINT_TO_FP) 12222 return SDValue(); 12223 if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP && 12224 N->getOperand(1).getOpcode() != ISD::UINT_TO_FP) 12225 return SDValue(); 12226 if (FirstInput.getOpcode() != N->getOperand(1).getOpcode()) 12227 return SDValue(); 12228 12229 SDValue Ext1 = FirstInput.getOperand(0); 12230 SDValue Ext2 = N->getOperand(1).getOperand(0); 12231 if(Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 12232 Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 12233 return SDValue(); 12234 12235 ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1)); 12236 ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1)); 12237 if (!Ext1Op || !Ext2Op) 12238 return SDValue(); 12239 if (Ext1.getValueType() != MVT::i32 || 12240 Ext2.getValueType() != MVT::i32) 12241 if (Ext1.getOperand(0) != Ext2.getOperand(0)) 12242 return SDValue(); 12243 12244 int FirstElem = Ext1Op->getZExtValue(); 12245 int SecondElem = Ext2Op->getZExtValue(); 12246 int SubvecIdx; 12247 if (FirstElem == 0 && SecondElem == 1) 12248 SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0; 12249 else if (FirstElem == 2 && SecondElem == 3) 12250 SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1; 12251 else 12252 return SDValue(); 12253 12254 SDValue SrcVec = Ext1.getOperand(0); 12255 auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ? 12256 PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP; 12257 return DAG.getNode(NodeType, dl, MVT::v2f64, 12258 SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl)); 12259 } 12260 12261 SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, 12262 DAGCombinerInfo &DCI) const { 12263 assert((N->getOpcode() == ISD::SINT_TO_FP || 12264 N->getOpcode() == ISD::UINT_TO_FP) && 12265 "Need an int -> FP conversion node here"); 12266 12267 if (useSoftFloat() || !Subtarget.has64BitSupport()) 12268 return SDValue(); 12269 12270 SelectionDAG &DAG = DCI.DAG; 12271 SDLoc dl(N); 12272 SDValue Op(N, 0); 12273 12274 // Don't handle ppc_fp128 here or conversions that are out-of-range capable 12275 // from the hardware. 
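  // Concretely, the checks below only admit f32/f64 results whose integer
  // source is wider than i1 and no wider than i64; anything else (including
  // ppc_fp128) is left to the generic lowering.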
12276 if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) 12277 return SDValue(); 12278 if (Op.getOperand(0).getValueType().getSimpleVT() <= MVT(MVT::i1) || 12279 Op.getOperand(0).getValueType().getSimpleVT() > MVT(MVT::i64)) 12280 return SDValue(); 12281 12282 SDValue FirstOperand(Op.getOperand(0)); 12283 bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD && 12284 (FirstOperand.getValueType() == MVT::i8 || 12285 FirstOperand.getValueType() == MVT::i16); 12286 if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) { 12287 bool Signed = N->getOpcode() == ISD::SINT_TO_FP; 12288 bool DstDouble = Op.getValueType() == MVT::f64; 12289 unsigned ConvOp = Signed ? 12290 (DstDouble ? PPCISD::FCFID : PPCISD::FCFIDS) : 12291 (DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS); 12292 SDValue WidthConst = 12293 DAG.getIntPtrConstant(FirstOperand.getValueType() == MVT::i8 ? 1 : 2, 12294 dl, false); 12295 LoadSDNode *LDN = cast<LoadSDNode>(FirstOperand.getNode()); 12296 SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst }; 12297 SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl, 12298 DAG.getVTList(MVT::f64, MVT::Other), 12299 Ops, MVT::i8, LDN->getMemOperand()); 12300 12301 // For signed conversion, we need to sign-extend the value in the VSR 12302 if (Signed) { 12303 SDValue ExtOps[] = { Ld, WidthConst }; 12304 SDValue Ext = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps); 12305 return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ext); 12306 } else 12307 return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld); 12308 } 12309 12310 12311 // For i32 intermediate values, unfortunately, the conversion functions 12312 // leave the upper 32 bits of the value are undefined. Within the set of 12313 // scalar instructions, we have no method for zero- or sign-extending the 12314 // value. Thus, we cannot handle i32 intermediate values here. 12315 if (Op.getOperand(0).getValueType() == MVT::i32) 12316 return SDValue(); 12317 12318 assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && 12319 "UINT_TO_FP is supported only with FPCVT"); 12320 12321 // If we have FCFIDS, then use it when converting to single-precision. 12322 // Otherwise, convert to double-precision and then round. 12323 unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 12324 ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS 12325 : PPCISD::FCFIDS) 12326 : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU 12327 : PPCISD::FCFID); 12328 MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) 12329 ? MVT::f32 12330 : MVT::f64; 12331 12332 // If we're converting from a float, to an int, and back to a float again, 12333 // then we don't need the store/load pair at all. 12334 if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT && 12335 Subtarget.hasFPCVT()) || 12336 (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) { 12337 SDValue Src = Op.getOperand(0).getOperand(0); 12338 if (Src.getValueType() == MVT::f32) { 12339 Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); 12340 DCI.AddToWorklist(Src.getNode()); 12341 } else if (Src.getValueType() != MVT::f64) { 12342 // Make sure that we don't pick up a ppc_fp128 source value. 12343 return SDValue(); 12344 } 12345 12346 unsigned FCTOp = 12347 Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? 
PPCISD::FCTIDZ : 12348 PPCISD::FCTIDUZ; 12349 12350 SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src); 12351 SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp); 12352 12353 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) { 12354 FP = DAG.getNode(ISD::FP_ROUND, dl, 12355 MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); 12356 DCI.AddToWorklist(FP.getNode()); 12357 } 12358 12359 return FP; 12360 } 12361 12362 return SDValue(); 12363 } 12364 12365 // expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for 12366 // builtins) into loads with swaps. 12367 SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N, 12368 DAGCombinerInfo &DCI) const { 12369 SelectionDAG &DAG = DCI.DAG; 12370 SDLoc dl(N); 12371 SDValue Chain; 12372 SDValue Base; 12373 MachineMemOperand *MMO; 12374 12375 switch (N->getOpcode()) { 12376 default: 12377 llvm_unreachable("Unexpected opcode for little endian VSX load"); 12378 case ISD::LOAD: { 12379 LoadSDNode *LD = cast<LoadSDNode>(N); 12380 Chain = LD->getChain(); 12381 Base = LD->getBasePtr(); 12382 MMO = LD->getMemOperand(); 12383 // If the MMO suggests this isn't a load of a full vector, leave 12384 // things alone. For a built-in, we have to make the change for 12385 // correctness, so if there is a size problem that will be a bug. 12386 if (MMO->getSize() < 16) 12387 return SDValue(); 12388 break; 12389 } 12390 case ISD::INTRINSIC_W_CHAIN: { 12391 MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N); 12392 Chain = Intrin->getChain(); 12393 // Similarly to the store case below, Intrin->getBasePtr() doesn't get 12394 // us what we want. Get operand 2 instead. 12395 Base = Intrin->getOperand(2); 12396 MMO = Intrin->getMemOperand(); 12397 break; 12398 } 12399 } 12400 12401 MVT VecTy = N->getValueType(0).getSimpleVT(); 12402 12403 // Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is 12404 // aligned and the type is a vector with elements up to 4 bytes 12405 if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16) 12406 && VecTy.getScalarSizeInBits() <= 32 ) { 12407 return SDValue(); 12408 } 12409 12410 SDValue LoadOps[] = { Chain, Base }; 12411 SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl, 12412 DAG.getVTList(MVT::v2f64, MVT::Other), 12413 LoadOps, MVT::v2f64, MMO); 12414 12415 DCI.AddToWorklist(Load.getNode()); 12416 Chain = Load.getValue(1); 12417 SDValue Swap = DAG.getNode( 12418 PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load); 12419 DCI.AddToWorklist(Swap.getNode()); 12420 12421 // Add a bitcast if the resulting load type doesn't match v2f64. 12422 if (VecTy != MVT::v2f64) { 12423 SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap); 12424 DCI.AddToWorklist(N.getNode()); 12425 // Package {bitcast value, swap's chain} to match Load's shape. 12426 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other), 12427 N, Swap.getValue(1)); 12428 } 12429 12430 return Swap; 12431 } 12432 12433 // expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for 12434 // builtins) into stores with swaps. 
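// For example, on a little-endian subtarget a plain (store v2f64:$src, $ptr)
// becomes roughly (stxvd2x (xxswapd $src), $ptr), except for the 16-byte
// aligned, word-or-smaller-element case that is rejected below.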
12435 SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N, 12436 DAGCombinerInfo &DCI) const { 12437 SelectionDAG &DAG = DCI.DAG; 12438 SDLoc dl(N); 12439 SDValue Chain; 12440 SDValue Base; 12441 unsigned SrcOpnd; 12442 MachineMemOperand *MMO; 12443 12444 switch (N->getOpcode()) { 12445 default: 12446 llvm_unreachable("Unexpected opcode for little endian VSX store"); 12447 case ISD::STORE: { 12448 StoreSDNode *ST = cast<StoreSDNode>(N); 12449 Chain = ST->getChain(); 12450 Base = ST->getBasePtr(); 12451 MMO = ST->getMemOperand(); 12452 SrcOpnd = 1; 12453 // If the MMO suggests this isn't a store of a full vector, leave 12454 // things alone. For a built-in, we have to make the change for 12455 // correctness, so if there is a size problem that will be a bug. 12456 if (MMO->getSize() < 16) 12457 return SDValue(); 12458 break; 12459 } 12460 case ISD::INTRINSIC_VOID: { 12461 MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N); 12462 Chain = Intrin->getChain(); 12463 // Intrin->getBasePtr() oddly does not get what we want. 12464 Base = Intrin->getOperand(3); 12465 MMO = Intrin->getMemOperand(); 12466 SrcOpnd = 2; 12467 break; 12468 } 12469 } 12470 12471 SDValue Src = N->getOperand(SrcOpnd); 12472 MVT VecTy = Src.getValueType().getSimpleVT(); 12473 12474 // Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the store is 12475 // aligned and the type is a vector with elements up to 4 bytes. 12476 if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16) 12477 && VecTy.getScalarSizeInBits() <= 32 ) { 12478 return SDValue(); 12479 } 12480 12481 // All stores are done as v2f64 and possible bit cast. 12482 if (VecTy != MVT::v2f64) { 12483 Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src); 12484 DCI.AddToWorklist(Src.getNode()); 12485 } 12486 12487 SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl, 12488 DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src); 12489 DCI.AddToWorklist(Swap.getNode()); 12490 Chain = Swap.getValue(1); 12491 SDValue StoreOps[] = { Chain, Swap, Base }; 12492 SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl, 12493 DAG.getVTList(MVT::Other), 12494 StoreOps, VecTy, MMO); 12495 DCI.AddToWorklist(Store.getNode()); 12496 return Store; 12497 } 12498 12499 // Handle DAG combine for STORE (FP_TO_INT F). 12500 SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N, 12501 DAGCombinerInfo &DCI) const { 12502 12503 SelectionDAG &DAG = DCI.DAG; 12504 SDLoc dl(N); 12505 unsigned Opcode = N->getOperand(1).getOpcode(); 12506 12507 assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT) 12508 && "Not a FP_TO_INT Instruction!"); 12509 12510 SDValue Val = N->getOperand(1).getOperand(0); 12511 EVT Op1VT = N->getOperand(1).getValueType(); 12512 EVT ResVT = Val.getValueType(); 12513 12514 // Floating point types smaller than 32 bits are not legal on Power. 12515 if (ResVT.getScalarSizeInBits() < 32) 12516 return SDValue(); 12517 12518 // Only perform combine for conversion to i64/i32 or power9 i16/i8.
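  // In the supported cases the converted value stays in a VSR and is stored
  // directly through PPCISD::ST_VSR_SCAL_INT below, so it never round-trips
  // through a GPR. (Sketch of intent; the exact store instruction is chosen
  // at selection time.)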
12519 bool ValidTypeForStoreFltAsInt = 12520 (Op1VT == MVT::i32 || Op1VT == MVT::i64 || 12521 (Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8))); 12522 12523 if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Altivec() || 12524 cast<StoreSDNode>(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt) 12525 return SDValue(); 12526 12527 // Extend f32 values to f64 12528 if (ResVT.getScalarSizeInBits() == 32) { 12529 Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val); 12530 DCI.AddToWorklist(Val.getNode()); 12531 } 12532 12533 // Set signed or unsigned conversion opcode. 12534 unsigned ConvOpcode = (Opcode == ISD::FP_TO_SINT) ? 12535 PPCISD::FP_TO_SINT_IN_VSR : 12536 PPCISD::FP_TO_UINT_IN_VSR; 12537 12538 Val = DAG.getNode(ConvOpcode, 12539 dl, ResVT == MVT::f128 ? MVT::f128 : MVT::f64, Val); 12540 DCI.AddToWorklist(Val.getNode()); 12541 12542 // Set number of bytes being converted. 12543 unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8; 12544 SDValue Ops[] = { N->getOperand(0), Val, N->getOperand(2), 12545 DAG.getIntPtrConstant(ByteSize, dl, false), 12546 DAG.getValueType(Op1VT) }; 12547 12548 Val = DAG.getMemIntrinsicNode(PPCISD::ST_VSR_SCAL_INT, dl, 12549 DAG.getVTList(MVT::Other), Ops, 12550 cast<StoreSDNode>(N)->getMemoryVT(), 12551 cast<StoreSDNode>(N)->getMemOperand()); 12552 12553 DCI.AddToWorklist(Val.getNode()); 12554 return Val; 12555 } 12556 12557 SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, 12558 DAGCombinerInfo &DCI) const { 12559 SelectionDAG &DAG = DCI.DAG; 12560 SDLoc dl(N); 12561 switch (N->getOpcode()) { 12562 default: break; 12563 case ISD::ADD: 12564 return combineADD(N, DCI); 12565 case ISD::SHL: 12566 return combineSHL(N, DCI); 12567 case ISD::SRA: 12568 return combineSRA(N, DCI); 12569 case ISD::SRL: 12570 return combineSRL(N, DCI); 12571 case PPCISD::SHL: 12572 if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. 12573 return N->getOperand(0); 12574 break; 12575 case PPCISD::SRL: 12576 if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0. 12577 return N->getOperand(0); 12578 break; 12579 case PPCISD::SRA: 12580 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 12581 if (C->isNullValue() || // 0 >>s V -> 0. 12582 C->isAllOnesValue()) // -1 >>s V -> -1. 12583 return N->getOperand(0); 12584 } 12585 break; 12586 case ISD::SIGN_EXTEND: 12587 case ISD::ZERO_EXTEND: 12588 case ISD::ANY_EXTEND: 12589 return DAGCombineExtBoolTrunc(N, DCI); 12590 case ISD::TRUNCATE: 12591 return combineTRUNCATE(N, DCI); 12592 case ISD::SETCC: 12593 if (SDValue CSCC = combineSetCC(N, DCI)) 12594 return CSCC; 12595 LLVM_FALLTHROUGH; 12596 case ISD::SELECT_CC: 12597 return DAGCombineTruncBoolExt(N, DCI); 12598 case ISD::SINT_TO_FP: 12599 case ISD::UINT_TO_FP: 12600 return combineFPToIntToFP(N, DCI); 12601 case ISD::STORE: { 12602 12603 EVT Op1VT = N->getOperand(1).getValueType(); 12604 unsigned Opcode = N->getOperand(1).getOpcode(); 12605 12606 if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT) { 12607 SDValue Val= combineStoreFPToInt(N, DCI); 12608 if (Val) 12609 return Val; 12610 } 12611 12612 // Turn STORE (BSWAP) -> sthbrx/stwbrx. 12613 if (cast<StoreSDNode>(N)->isUnindexed() && Opcode == ISD::BSWAP && 12614 N->getOperand(1).getNode()->hasOneUse() && 12615 (Op1VT == MVT::i32 || Op1VT == MVT::i16 || 12616 (Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) { 12617 12618 // STBRX can only handle simple types and it makes no sense to store less 12619 // two bytes in byte-reversed order. 
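  // A typical case: (store (bswap i32:$x), $ptr) becomes a single
  // byte-reversed store, roughly (stwbrx $x, $ptr), with the value shifted
  // and/or truncated first when the bswap is wider than the memory VT.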
12620 EVT mVT = cast<StoreSDNode>(N)->getMemoryVT(); 12621 if (mVT.isExtended() || mVT.getSizeInBits() < 16) 12622 break; 12623 12624 SDValue BSwapOp = N->getOperand(1).getOperand(0); 12625 // Do an any-extend to 32 bits if this is a half-word input. 12626 if (BSwapOp.getValueType() == MVT::i16) 12627 BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp); 12628 12629 // If the type of the BSWAP operand is wider than the stored memory width, 12630 // it needs to be shifted right before the STBRX. 12631 if (Op1VT.bitsGT(mVT)) { 12632 int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits(); 12633 BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp, 12634 DAG.getConstant(Shift, dl, MVT::i32)); 12635 // Need to truncate if this is a bswap of i64 stored as i32/i16. 12636 if (Op1VT == MVT::i64) 12637 BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp); 12638 } 12639 12640 SDValue Ops[] = { 12641 N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT) 12642 }; 12643 return 12644 DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other), 12645 Ops, cast<StoreSDNode>(N)->getMemoryVT(), 12646 cast<StoreSDNode>(N)->getMemOperand()); 12647 } 12648 12649 // STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0> 12650 // So it can increase the chance of CSE constant construction. 12651 if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() && 12652 isa<ConstantSDNode>(N->getOperand(1)) && Op1VT == MVT::i32) { 12653 // Need to sign-extend to 64 bits to handle negative values. 12654 EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT(); 12655 uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1), 12656 MemVT.getSizeInBits()); 12657 SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64); 12658 12659 // DAG.getTruncStore() can't be used here because it doesn't accept 12660 // the general (base + offset) addressing mode. 12661 // So we use UpdateNodeOperands and setTruncatingStore instead. 12662 DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2), 12663 N->getOperand(3)); 12664 cast<StoreSDNode>(N)->setTruncatingStore(true); 12665 return SDValue(N, 0); 12666 } 12667 12668 // For little endian, VSX stores require generating xxswapd/stxvd2x. 12669 // Not needed on ISA 3.0 based CPUs since we have a non-permuting store. 12670 if (Op1VT.isSimple()) { 12671 MVT StoreVT = Op1VT.getSimpleVT(); 12672 if (Subtarget.needsSwapsForVSXMemOps() && 12673 (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 || 12674 StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32)) 12675 return expandVSXStoreForLE(N, DCI); 12676 } 12677 break; 12678 } 12679 case ISD::LOAD: { 12680 LoadSDNode *LD = cast<LoadSDNode>(N); 12681 EVT VT = LD->getValueType(0); 12682 12683 // For little endian, VSX loads require generating lxvd2x/xxswapd. 12684 // Not needed on ISA 3.0 based CPUs since we have a non-permuting load. 12685 if (VT.isSimple()) { 12686 MVT LoadVT = VT.getSimpleVT(); 12687 if (Subtarget.needsSwapsForVSXMemOps() && 12688 (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 || 12689 LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32)) 12690 return expandVSXLoadForLE(N, DCI); 12691 } 12692 12693 // We sometimes end up with a 64-bit integer load, from which we extract 12694 // two single-precision floating-point numbers. This happens with 12695 // std::complex<float>, and other similar structures, because of the way we 12696 // canonicalize structure copies.
However, if we lack direct moves, 12697 // then the final bitcasts from the extracted integer values to the 12698 // floating-point numbers turn into store/load pairs. Even with direct moves, 12699 // just loading the two floating-point numbers is likely better. 12700 auto ReplaceTwoFloatLoad = [&]() { 12701 if (VT != MVT::i64) 12702 return false; 12703 12704 if (LD->getExtensionType() != ISD::NON_EXTLOAD || 12705 LD->isVolatile()) 12706 return false; 12707 12708 // We're looking for a sequence like this: 12709 // t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64 12710 // t16: i64 = srl t13, Constant:i32<32> 12711 // t17: i32 = truncate t16 12712 // t18: f32 = bitcast t17 12713 // t19: i32 = truncate t13 12714 // t20: f32 = bitcast t19 12715 12716 if (!LD->hasNUsesOfValue(2, 0)) 12717 return false; 12718 12719 auto UI = LD->use_begin(); 12720 while (UI.getUse().getResNo() != 0) ++UI; 12721 SDNode *Trunc = *UI++; 12722 while (UI.getUse().getResNo() != 0) ++UI; 12723 SDNode *RightShift = *UI; 12724 if (Trunc->getOpcode() != ISD::TRUNCATE) 12725 std::swap(Trunc, RightShift); 12726 12727 if (Trunc->getOpcode() != ISD::TRUNCATE || 12728 Trunc->getValueType(0) != MVT::i32 || 12729 !Trunc->hasOneUse()) 12730 return false; 12731 if (RightShift->getOpcode() != ISD::SRL || 12732 !isa<ConstantSDNode>(RightShift->getOperand(1)) || 12733 RightShift->getConstantOperandVal(1) != 32 || 12734 !RightShift->hasOneUse()) 12735 return false; 12736 12737 SDNode *Trunc2 = *RightShift->use_begin(); 12738 if (Trunc2->getOpcode() != ISD::TRUNCATE || 12739 Trunc2->getValueType(0) != MVT::i32 || 12740 !Trunc2->hasOneUse()) 12741 return false; 12742 12743 SDNode *Bitcast = *Trunc->use_begin(); 12744 SDNode *Bitcast2 = *Trunc2->use_begin(); 12745 12746 if (Bitcast->getOpcode() != ISD::BITCAST || 12747 Bitcast->getValueType(0) != MVT::f32) 12748 return false; 12749 if (Bitcast2->getOpcode() != ISD::BITCAST || 12750 Bitcast2->getValueType(0) != MVT::f32) 12751 return false; 12752 12753 if (Subtarget.isLittleEndian()) 12754 std::swap(Bitcast, Bitcast2); 12755 12756 // Bitcast has the second float (in memory-layout order) and Bitcast2 12757 // has the first one. 12758 12759 SDValue BasePtr = LD->getBasePtr(); 12760 if (LD->isIndexed()) { 12761 assert(LD->getAddressingMode() == ISD::PRE_INC && 12762 "Non-pre-inc AM on PPC?"); 12763 BasePtr = 12764 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, 12765 LD->getOffset()); 12766 } 12767 12768 auto MMOFlags = 12769 LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile; 12770 SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr, 12771 LD->getPointerInfo(), LD->getAlignment(), 12772 MMOFlags, LD->getAAInfo()); 12773 SDValue AddPtr = 12774 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), 12775 BasePtr, DAG.getIntPtrConstant(4, dl)); 12776 SDValue FloatLoad2 = DAG.getLoad( 12777 MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr, 12778 LD->getPointerInfo().getWithOffset(4), 12779 MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo()); 12780 12781 if (LD->isIndexed()) { 12782 // Note that DAGCombine should re-form any pre-increment load(s) from 12783 // what is produced here if that makes sense. 12784 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr); 12785 } 12786 12787 DCI.CombineTo(Bitcast2, FloatLoad); 12788 DCI.CombineTo(Bitcast, FloatLoad2); 12789 12790 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 
2 : 1), 12791 SDValue(FloatLoad2.getNode(), 1)); 12792 return true; 12793 }; 12794 12795 if (ReplaceTwoFloatLoad()) 12796 return SDValue(N, 0); 12797 12798 EVT MemVT = LD->getMemoryVT(); 12799 Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); 12800 unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); 12801 Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext()); 12802 unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy); 12803 if (LD->isUnindexed() && VT.isVector() && 12804 ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) && 12805 // P8 and later hardware should just use LOAD. 12806 !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 || 12807 VT == MVT::v4i32 || VT == MVT::v4f32)) || 12808 (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) && 12809 LD->getAlignment() >= ScalarABIAlignment)) && 12810 LD->getAlignment() < ABIAlignment) { 12811 // This is a type-legal unaligned Altivec or QPX load. 12812 SDValue Chain = LD->getChain(); 12813 SDValue Ptr = LD->getBasePtr(); 12814 bool isLittleEndian = Subtarget.isLittleEndian(); 12815 12816 // This implements the loading of unaligned vectors as described in 12817 // the venerable Apple Velocity Engine overview. Specifically: 12818 // https://developer.apple.com/hardwaredrivers/ve/alignment.html 12819 // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html 12820 // 12821 // The general idea is to expand a sequence of one or more unaligned 12822 // loads into an alignment-based permutation-control instruction (lvsl 12823 // or lvsr), a series of regular vector loads (which always truncate 12824 // their input address to an aligned address), and a series of 12825 // permutations. The results of these permutations are the requested 12826 // loaded values. The trick is that the last "extra" load is not taken 12827 // from the address you might suspect (sizeof(vector) bytes after the 12828 // last requested load), but rather sizeof(vector) - 1 bytes after the 12829 // last requested vector. The point of this is to avoid a page fault if 12830 // the base address happened to be aligned. This works because if the 12831 // base address is aligned, then adding less than a full vector length 12832 // will cause the last vector in the sequence to be (re)loaded. 12833 // Otherwise, the next vector will be fetched as you might suspect was 12834 // necessary. 12835 12836 // We might be able to reuse the permutation generation from 12837 // a different base address offset from this one by an aligned amount. 12838 // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this 12839 // optimization later. 12840 Intrinsic::ID Intr, IntrLD, IntrPerm; 12841 MVT PermCntlTy, PermTy, LDTy; 12842 if (Subtarget.hasAltivec()) { 12843 Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr : 12844 Intrinsic::ppc_altivec_lvsl; 12845 IntrLD = Intrinsic::ppc_altivec_lvx; 12846 IntrPerm = Intrinsic::ppc_altivec_vperm; 12847 PermCntlTy = MVT::v16i8; 12848 PermTy = MVT::v4i32; 12849 LDTy = MVT::v4i32; 12850 } else { 12851 Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld : 12852 Intrinsic::ppc_qpx_qvlpcls; 12853 IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd : 12854 Intrinsic::ppc_qpx_qvlfs; 12855 IntrPerm = Intrinsic::ppc_qpx_qvfperm; 12856 PermCntlTy = MVT::v4f64; 12857 PermTy = MVT::v4f64; 12858 LDTy = MemVT.getSimpleVT(); 12859 } 12860 12861 SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy); 12862 12863 // Create the new MMO for the new base load. 
It is like the original MMO, 12864 // but represents an area in memory almost twice the vector size centered 12865 // on the original address. If the address is unaligned, we might start 12866 // reading up to (sizeof(vector)-1) bytes below the address of the 12867 // original unaligned load. 12868 MachineFunction &MF = DAG.getMachineFunction(); 12869 MachineMemOperand *BaseMMO = 12870 MF.getMachineMemOperand(LD->getMemOperand(), 12871 -(long)MemVT.getStoreSize()+1, 12872 2*MemVT.getStoreSize()-1); 12873 12874 // Create the new base load. 12875 SDValue LDXIntID = 12876 DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout())); 12877 SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr }; 12878 SDValue BaseLoad = 12879 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, 12880 DAG.getVTList(PermTy, MVT::Other), 12881 BaseLoadOps, LDTy, BaseMMO); 12882 12883 // Note that the value of IncOffset (which is provided to the next 12884 // load's pointer info offset value, and thus used to calculate the 12885 // alignment), and the value of IncValue (which is actually used to 12886 // increment the pointer value) are different! This is because we 12887 // require the next load to appear to be aligned, even though it 12888 // is actually offset from the base pointer by a lesser amount. 12889 int IncOffset = VT.getSizeInBits() / 8; 12890 int IncValue = IncOffset; 12891 12892 // Walk (both up and down) the chain looking for another load at the real 12893 // (aligned) offset (the alignment of the other load does not matter in 12894 // this case). If found, then do not use the offset reduction trick, as 12895 // that will prevent the loads from being later combined (as they would 12896 // otherwise be duplicates). 12897 if (!findConsecutiveLoad(LD, DAG)) 12898 --IncValue; 12899 12900 SDValue Increment = 12901 DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout())); 12902 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); 12903 12904 MachineMemOperand *ExtraMMO = 12905 MF.getMachineMemOperand(LD->getMemOperand(), 12906 1, 2*MemVT.getStoreSize()-1); 12907 SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr }; 12908 SDValue ExtraLoad = 12909 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, 12910 DAG.getVTList(PermTy, MVT::Other), 12911 ExtraLoadOps, LDTy, ExtraMMO); 12912 12913 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 12914 BaseLoad.getValue(1), ExtraLoad.getValue(1)); 12915 12916 // Because vperm has a big-endian bias, we must reverse the order 12917 // of the input vectors and complement the permute control vector 12918 // when generating little endian code. We have already handled the 12919 // latter by using lvsr instead of lvsl, so just reverse BaseLoad 12920 // and ExtraLoad here. 12921 SDValue Perm; 12922 if (isLittleEndian) 12923 Perm = BuildIntrinsicOp(IntrPerm, 12924 ExtraLoad, BaseLoad, PermCntl, DAG, dl); 12925 else 12926 Perm = BuildIntrinsicOp(IntrPerm, 12927 BaseLoad, ExtraLoad, PermCntl, DAG, dl); 12928 12929 if (VT != PermTy) 12930 Perm = Subtarget.hasAltivec() ? 12931 DAG.getNode(ISD::BITCAST, dl, VT, Perm) : 12932 DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX 12933 DAG.getTargetConstant(1, dl, MVT::i64)); 12934 // second argument is 1 because this rounding 12935 // is always exact. 12936 12937 // The output of the permutation is our loaded result, the TokenFactor is 12938 // our new chain. 
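  // In instruction terms the Altivec path is roughly:
  //   lvsl/lvsr  vPerm, 0, rPtr     ; permute control from low address bits
  //   lvx        v1, 0, rPtr        ; first aligned chunk
  //   lvx        v2, rPtr, rOffset  ; next chunk (offset chosen above)
  //   vperm      vResult, v1, v2, vPerm
  // with the vperm operand order swapped for little endian, as noted above.
  // (Illustrative only; the QPX path uses the qvlpcl*/qvlf*/qvfperm
  // equivalents.)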
12939 DCI.CombineTo(N, Perm, TF); 12940 return SDValue(N, 0); 12941 } 12942 } 12943 break; 12944 case ISD::INTRINSIC_WO_CHAIN: { 12945 bool isLittleEndian = Subtarget.isLittleEndian(); 12946 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 12947 Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr 12948 : Intrinsic::ppc_altivec_lvsl); 12949 if ((IID == Intr || 12950 IID == Intrinsic::ppc_qpx_qvlpcld || 12951 IID == Intrinsic::ppc_qpx_qvlpcls) && 12952 N->getOperand(1)->getOpcode() == ISD::ADD) { 12953 SDValue Add = N->getOperand(1); 12954 12955 int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ? 12956 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */; 12957 12958 if (DAG.MaskedValueIsZero(Add->getOperand(1), 12959 APInt::getAllOnesValue(Bits /* alignment */) 12960 .zext(Add.getScalarValueSizeInBits()))) { 12961 SDNode *BasePtr = Add->getOperand(0).getNode(); 12962 for (SDNode::use_iterator UI = BasePtr->use_begin(), 12963 UE = BasePtr->use_end(); 12964 UI != UE; ++UI) { 12965 if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && 12966 cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) { 12967 // We've found another LVSL/LVSR, and this address is an aligned 12968 // multiple of that one. The results will be the same, so use the 12969 // one we've just found instead. 12970 12971 return SDValue(*UI, 0); 12972 } 12973 } 12974 } 12975 12976 if (isa<ConstantSDNode>(Add->getOperand(1))) { 12977 SDNode *BasePtr = Add->getOperand(0).getNode(); 12978 for (SDNode::use_iterator UI = BasePtr->use_begin(), 12979 UE = BasePtr->use_end(); UI != UE; ++UI) { 12980 if (UI->getOpcode() == ISD::ADD && 12981 isa<ConstantSDNode>(UI->getOperand(1)) && 12982 (cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() - 12983 cast<ConstantSDNode>(UI->getOperand(1))->getZExtValue()) % 12984 (1ULL << Bits) == 0) { 12985 SDNode *OtherAdd = *UI; 12986 for (SDNode::use_iterator VI = OtherAdd->use_begin(), 12987 VE = OtherAdd->use_end(); VI != VE; ++VI) { 12988 if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && 12989 cast<ConstantSDNode>(VI->getOperand(0))->getZExtValue() == IID) { 12990 return SDValue(*VI, 0); 12991 } 12992 } 12993 } 12994 } 12995 } 12996 } 12997 12998 // Combine vmaxsw/h/b(a, a's negation) to abs(a) 12999 // Expose the vabsduw/h/b opportunity for down stream 13000 if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() && 13001 (IID == Intrinsic::ppc_altivec_vmaxsw || 13002 IID == Intrinsic::ppc_altivec_vmaxsh || 13003 IID == Intrinsic::ppc_altivec_vmaxsb)) { 13004 SDValue V1 = N->getOperand(1); 13005 SDValue V2 = N->getOperand(2); 13006 if ((V1.getSimpleValueType() == MVT::v4i32 || 13007 V1.getSimpleValueType() == MVT::v8i16 || 13008 V1.getSimpleValueType() == MVT::v16i8) && 13009 V1.getSimpleValueType() == V2.getSimpleValueType()) { 13010 // (0-a, a) 13011 if (V1.getOpcode() == ISD::SUB && 13012 ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) && 13013 V1.getOperand(1) == V2) { 13014 return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2); 13015 } 13016 // (a, 0-a) 13017 if (V2.getOpcode() == ISD::SUB && 13018 ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) && 13019 V2.getOperand(1) == V1) { 13020 return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1); 13021 } 13022 // (x-y, y-x) 13023 if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB && 13024 V1.getOperand(0) == V2.getOperand(1) && 13025 V1.getOperand(1) == V2.getOperand(0)) { 13026 return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1); 13027 } 13028 } 13029 } 13030 } 13031 13032 
break; 13033 case ISD::INTRINSIC_W_CHAIN: 13034 // For little endian, VSX loads require generating lxvd2x/xxswapd. 13035 // Not needed on ISA 3.0 based CPUs since we have a non-permuting load. 13036 if (Subtarget.needsSwapsForVSXMemOps()) { 13037 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 13038 default: 13039 break; 13040 case Intrinsic::ppc_vsx_lxvw4x: 13041 case Intrinsic::ppc_vsx_lxvd2x: 13042 return expandVSXLoadForLE(N, DCI); 13043 } 13044 } 13045 break; 13046 case ISD::INTRINSIC_VOID: 13047 // For little endian, VSX stores require generating xxswapd/stxvd2x. 13048 // Not needed on ISA 3.0 based CPUs since we have a non-permuting store. 13049 if (Subtarget.needsSwapsForVSXMemOps()) { 13050 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 13051 default: 13052 break; 13053 case Intrinsic::ppc_vsx_stxvw4x: 13054 case Intrinsic::ppc_vsx_stxvd2x: 13055 return expandVSXStoreForLE(N, DCI); 13056 } 13057 } 13058 break; 13059 case ISD::BSWAP: 13060 // Turn BSWAP (LOAD) -> lhbrx/lwbrx. 13061 if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) && 13062 N->getOperand(0).hasOneUse() && 13063 (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 || 13064 (Subtarget.hasLDBRX() && Subtarget.isPPC64() && 13065 N->getValueType(0) == MVT::i64))) { 13066 SDValue Load = N->getOperand(0); 13067 LoadSDNode *LD = cast<LoadSDNode>(Load); 13068 // Create the byte-swapping load. 13069 SDValue Ops[] = { 13070 LD->getChain(), // Chain 13071 LD->getBasePtr(), // Ptr 13072 DAG.getValueType(N->getValueType(0)) // VT 13073 }; 13074 SDValue BSLoad = 13075 DAG.getMemIntrinsicNode(PPCISD::LBRX, dl, 13076 DAG.getVTList(N->getValueType(0) == MVT::i64 ? 13077 MVT::i64 : MVT::i32, MVT::Other), 13078 Ops, LD->getMemoryVT(), LD->getMemOperand()); 13079 13080 // If this is an i16 load, insert the truncate. 13081 SDValue ResVal = BSLoad; 13082 if (N->getValueType(0) == MVT::i16) 13083 ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad); 13084 13085 // First, combine the bswap away. This makes the value produced by the 13086 // load dead. 13087 DCI.CombineTo(N, ResVal); 13088 13089 // Next, combine the load away, we give it a bogus result value but a real 13090 // chain result. The result value is dead because the bswap is dead. 13091 DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1)); 13092 13093 // Return N so it doesn't get rechecked! 13094 return SDValue(N, 0); 13095 } 13096 break; 13097 case PPCISD::VCMP: 13098 // If a VCMPo node already exists with exactly the same operands as this 13099 // node, use its result instead of this node (VCMPo computes both a CR6 and 13100 // a normal output). 13101 // 13102 if (!N->getOperand(0).hasOneUse() && 13103 !N->getOperand(1).hasOneUse() && 13104 !N->getOperand(2).hasOneUse()) { 13105 13106 // Scan all of the users of the LHS, looking for VCMPo's that match. 13107 SDNode *VCMPoNode = nullptr; 13108 13109 SDNode *LHSN = N->getOperand(0).getNode(); 13110 for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end(); 13111 UI != E; ++UI) 13112 if (UI->getOpcode() == PPCISD::VCMPo && 13113 UI->getOperand(1) == N->getOperand(1) && 13114 UI->getOperand(2) == N->getOperand(2) && 13115 UI->getOperand(0) == N->getOperand(0)) { 13116 VCMPoNode = *UI; 13117 break; 13118 } 13119 13120 // If there is no VCMPo node, or if the flag value has a single use, don't 13121 // transform this. 13122 if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1)) 13123 break; 13124 13125 // Look at the (necessarily single) use of the flag value. 
If it has a 13126 // chain, this transformation is more complex. Note that multiple things 13127 // could use the value result, which we should ignore. 13128 SDNode *FlagUser = nullptr; 13129 for (SDNode::use_iterator UI = VCMPoNode->use_begin(); 13130 FlagUser == nullptr; ++UI) { 13131 assert(UI != VCMPoNode->use_end() && "Didn't find user!"); 13132 SDNode *User = *UI; 13133 for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) { 13134 if (User->getOperand(i) == SDValue(VCMPoNode, 1)) { 13135 FlagUser = User; 13136 break; 13137 } 13138 } 13139 } 13140 13141 // If the user is a MFOCRF instruction, we know this is safe. 13142 // Otherwise we give up for right now. 13143 if (FlagUser->getOpcode() == PPCISD::MFOCRF) 13144 return SDValue(VCMPoNode, 0); 13145 } 13146 break; 13147 case ISD::BRCOND: { 13148 SDValue Cond = N->getOperand(1); 13149 SDValue Target = N->getOperand(2); 13150 13151 if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN && 13152 cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() == 13153 Intrinsic::ppc_is_decremented_ctr_nonzero) { 13154 13155 // We now need to make the intrinsic dead (it cannot be instruction 13156 // selected). 13157 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0)); 13158 assert(Cond.getNode()->hasOneUse() && 13159 "Counter decrement has more than one use"); 13160 13161 return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other, 13162 N->getOperand(0), Target); 13163 } 13164 } 13165 break; 13166 case ISD::BR_CC: { 13167 // If this is a branch on an altivec predicate comparison, lower this so 13168 // that we don't have to do a MFOCRF: instead, branch directly on CR6. This 13169 // lowering is done pre-legalize, because the legalizer lowers the predicate 13170 // compare down to code that is difficult to reassemble. 13171 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); 13172 SDValue LHS = N->getOperand(2), RHS = N->getOperand(3); 13173 13174 // Sometimes the promoted value of the intrinsic is ANDed by some non-zero 13175 // value. If so, pass-through the AND to get to the intrinsic. 13176 if (LHS.getOpcode() == ISD::AND && 13177 LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN && 13178 cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() == 13179 Intrinsic::ppc_is_decremented_ctr_nonzero && 13180 isa<ConstantSDNode>(LHS.getOperand(1)) && 13181 !isNullConstant(LHS.getOperand(1))) 13182 LHS = LHS.getOperand(0); 13183 13184 if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && 13185 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() == 13186 Intrinsic::ppc_is_decremented_ctr_nonzero && 13187 isa<ConstantSDNode>(RHS)) { 13188 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && 13189 "Counter decrement comparison is not EQ or NE"); 13190 13191 unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue(); 13192 bool isBDNZ = (CC == ISD::SETEQ && Val) || 13193 (CC == ISD::SETNE && !Val); 13194 13195 // We now need to make the intrinsic dead (it cannot be instruction 13196 // selected). 13197 DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0)); 13198 assert(LHS.getNode()->hasOneUse() && 13199 "Counter decrement has more than one use"); 13200 13201 return DAG.getNode(isBDNZ ? 
PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other, 13202 N->getOperand(0), N->getOperand(4)); 13203 } 13204 13205 int CompareOpc; 13206 bool isDot; 13207 13208 if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN && 13209 isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) && 13210 getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { 13211 assert(isDot && "Can't compare against a vector result!"); 13212 13213 // If this is a comparison against something other than 0/1, then we know 13214 // that the condition is never/always true. 13215 unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue(); 13216 if (Val != 0 && Val != 1) { 13217 if (CC == ISD::SETEQ) // Cond never true, remove branch. 13218 return N->getOperand(0); 13219 // Always !=, turn it into an unconditional branch. 13220 return DAG.getNode(ISD::BR, dl, MVT::Other, 13221 N->getOperand(0), N->getOperand(4)); 13222 } 13223 13224 bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0); 13225 13226 // Create the PPCISD altivec 'dot' comparison node. 13227 SDValue Ops[] = { 13228 LHS.getOperand(2), // LHS of compare 13229 LHS.getOperand(3), // RHS of compare 13230 DAG.getConstant(CompareOpc, dl, MVT::i32) 13231 }; 13232 EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue }; 13233 SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); 13234 13235 // Unpack the result based on how the target uses it. 13236 PPC::Predicate CompOpc; 13237 switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) { 13238 default: // Can't happen, don't crash on invalid number though. 13239 case 0: // Branch on the value of the EQ bit of CR6. 13240 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE; 13241 break; 13242 case 1: // Branch on the inverted value of the EQ bit of CR6. 13243 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ; 13244 break; 13245 case 2: // Branch on the value of the LT bit of CR6. 13246 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE; 13247 break; 13248 case 3: // Branch on the inverted value of the LT bit of CR6. 13249 CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT; 13250 break; 13251 } 13252 13253 return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0), 13254 DAG.getConstant(CompOpc, dl, MVT::i32), 13255 DAG.getRegister(PPC::CR6, MVT::i32), 13256 N->getOperand(4), CompNode.getValue(1)); 13257 } 13258 break; 13259 } 13260 case ISD::BUILD_VECTOR: 13261 return DAGCombineBuildVector(N, DCI); 13262 case ISD::ABS: 13263 return combineABS(N, DCI); 13264 case ISD::VSELECT: 13265 return combineVSelect(N, DCI); 13266 } 13267 13268 return SDValue(); 13269 } 13270 13271 SDValue 13272 PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 13273 SelectionDAG &DAG, 13274 SmallVectorImpl<SDNode *> &Created) const { 13275 // fold (sdiv X, pow2) 13276 EVT VT = N->getValueType(0); 13277 if (VT == MVT::i64 && !Subtarget.isPPC64()) 13278 return SDValue(); 13279 if ((VT != MVT::i32 && VT != MVT::i64) || 13280 !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) 13281 return SDValue(); 13282 13283 SDLoc DL(N); 13284 SDValue N0 = N->getOperand(0); 13285 13286 bool IsNegPow2 = (-Divisor).isPowerOf2(); 13287 unsigned Lg2 = (IsNegPow2 ? 
-Divisor : Divisor).countTrailingZeros(); 13288 SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT); 13289 13290 SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt); 13291 Created.push_back(Op.getNode()); 13292 13293 if (IsNegPow2) { 13294 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op); 13295 Created.push_back(Op.getNode()); 13296 } 13297 13298 return Op; 13299 } 13300 13301 //===----------------------------------------------------------------------===// 13302 // Inline Assembly Support 13303 //===----------------------------------------------------------------------===// 13304 13305 void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 13306 KnownBits &Known, 13307 const APInt &DemandedElts, 13308 const SelectionDAG &DAG, 13309 unsigned Depth) const { 13310 Known.resetAll(); 13311 switch (Op.getOpcode()) { 13312 default: break; 13313 case PPCISD::LBRX: { 13314 // lhbrx is known to have the top bits cleared out. 13315 if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16) 13316 Known.Zero = 0xFFFF0000; 13317 break; 13318 } 13319 case ISD::INTRINSIC_WO_CHAIN: { 13320 switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { 13321 default: break; 13322 case Intrinsic::ppc_altivec_vcmpbfp_p: 13323 case Intrinsic::ppc_altivec_vcmpeqfp_p: 13324 case Intrinsic::ppc_altivec_vcmpequb_p: 13325 case Intrinsic::ppc_altivec_vcmpequh_p: 13326 case Intrinsic::ppc_altivec_vcmpequw_p: 13327 case Intrinsic::ppc_altivec_vcmpequd_p: 13328 case Intrinsic::ppc_altivec_vcmpgefp_p: 13329 case Intrinsic::ppc_altivec_vcmpgtfp_p: 13330 case Intrinsic::ppc_altivec_vcmpgtsb_p: 13331 case Intrinsic::ppc_altivec_vcmpgtsh_p: 13332 case Intrinsic::ppc_altivec_vcmpgtsw_p: 13333 case Intrinsic::ppc_altivec_vcmpgtsd_p: 13334 case Intrinsic::ppc_altivec_vcmpgtub_p: 13335 case Intrinsic::ppc_altivec_vcmpgtuh_p: 13336 case Intrinsic::ppc_altivec_vcmpgtuw_p: 13337 case Intrinsic::ppc_altivec_vcmpgtud_p: 13338 Known.Zero = ~1U; // All bits but the low one are known to be zero. 13339 break; 13340 } 13341 } 13342 } 13343 } 13344 13345 unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { 13346 switch (Subtarget.getDarwinDirective()) { 13347 default: break; 13348 case PPC::DIR_970: 13349 case PPC::DIR_PWR4: 13350 case PPC::DIR_PWR5: 13351 case PPC::DIR_PWR5X: 13352 case PPC::DIR_PWR6: 13353 case PPC::DIR_PWR6X: 13354 case PPC::DIR_PWR7: 13355 case PPC::DIR_PWR8: 13356 case PPC::DIR_PWR9: { 13357 if (!ML) 13358 break; 13359 13360 const PPCInstrInfo *TII = Subtarget.getInstrInfo(); 13361 13362 // For small loops (between 5 and 8 instructions), align to a 32-byte 13363 // boundary so that the entire loop fits in one instruction-cache line. 13364 uint64_t LoopSize = 0; 13365 for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I) 13366 for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) { 13367 LoopSize += TII->getInstSizeInBytes(*J); 13368 if (LoopSize > 32) 13369 break; 13370 } 13371 13372 if (LoopSize > 16 && LoopSize <= 32) 13373 return 5; 13374 13375 break; 13376 } 13377 } 13378 13379 return TargetLowering::getPrefLoopAlignment(ML); 13380 } 13381 13382 /// getConstraintType - Given a constraint, return the type of 13383 /// constraint it is for this target. 
PPCTargetLowering::ConstraintType
PPCTargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 'b':
    case 'r':
    case 'f':
    case 'd':
    case 'v':
    case 'y':
      return C_RegisterClass;
    case 'Z':
      // FIXME: While Z does indicate a memory constraint, it specifically
      // indicates an r+r address (used in conjunction with the 'y' modifier
      // in the replacement string). Currently, we're forcing the base
      // register to be r0 in the asm printer (which is interpreted as zero)
      // and forming the complete address in the second register. This is
      // suboptimal.
      return C_Memory;
    }
  } else if (Constraint == "wc") { // individual CR bits.
    return C_RegisterClass;
  } else if (Constraint == "wa" || Constraint == "wd" ||
             Constraint == "wf" || Constraint == "ws" ||
             Constraint == "wi") {
    return C_RegisterClass; // VSX registers.
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
PPCTargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();

  // Look at the constraint type.
  if (StringRef(constraint) == "wc" && type->isIntegerTy(1))
    return CW_Register; // an individual CR bit.
  else if ((StringRef(constraint) == "wa" ||
            StringRef(constraint) == "wd" ||
            StringRef(constraint) == "wf") &&
           type->isVectorTy())
    return CW_Register;
  else if (StringRef(constraint) == "ws" && type->isDoubleTy())
    return CW_Register;
  else if (StringRef(constraint) == "wi" && type->isIntegerTy(64))
    return CW_Register; // a VSX register holding 64-bit integer data.
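
  // Illustrative example only (the asm text below is hypothetical): a clause
  // such as
  //   asm("fadd %0, %1, %2" : "=f"(d) : "f"(a), "f"(b));
  // reaches the switch below with the letter 'f' and a float operand type,
  // which is weighted as CW_Register; under 'f', a non-float operand keeps
  // the initial CW_Invalid weight.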
13441 13442 switch (*constraint) { 13443 default: 13444 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 13445 break; 13446 case 'b': 13447 if (type->isIntegerTy()) 13448 weight = CW_Register; 13449 break; 13450 case 'f': 13451 if (type->isFloatTy()) 13452 weight = CW_Register; 13453 break; 13454 case 'd': 13455 if (type->isDoubleTy()) 13456 weight = CW_Register; 13457 break; 13458 case 'v': 13459 if (type->isVectorTy()) 13460 weight = CW_Register; 13461 break; 13462 case 'y': 13463 weight = CW_Register; 13464 break; 13465 case 'Z': 13466 weight = CW_Memory; 13467 break; 13468 } 13469 return weight; 13470 } 13471 13472 std::pair<unsigned, const TargetRegisterClass *> 13473 PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 13474 StringRef Constraint, 13475 MVT VT) const { 13476 if (Constraint.size() == 1) { 13477 // GCC RS6000 Constraint Letters 13478 switch (Constraint[0]) { 13479 case 'b': // R1-R31 13480 if (VT == MVT::i64 && Subtarget.isPPC64()) 13481 return std::make_pair(0U, &PPC::G8RC_NOX0RegClass); 13482 return std::make_pair(0U, &PPC::GPRC_NOR0RegClass); 13483 case 'r': // R0-R31 13484 if (VT == MVT::i64 && Subtarget.isPPC64()) 13485 return std::make_pair(0U, &PPC::G8RCRegClass); 13486 return std::make_pair(0U, &PPC::GPRCRegClass); 13487 // 'd' and 'f' constraints are both defined to be "the floating point 13488 // registers", where one is for 32-bit and the other for 64-bit. We don't 13489 // really care overly much here so just give them all the same reg classes. 13490 case 'd': 13491 case 'f': 13492 if (Subtarget.hasSPE()) { 13493 if (VT == MVT::f32 || VT == MVT::i32) 13494 return std::make_pair(0U, &PPC::SPE4RCRegClass); 13495 if (VT == MVT::f64 || VT == MVT::i64) 13496 return std::make_pair(0U, &PPC::SPERCRegClass); 13497 } else { 13498 if (VT == MVT::f32 || VT == MVT::i32) 13499 return std::make_pair(0U, &PPC::F4RCRegClass); 13500 if (VT == MVT::f64 || VT == MVT::i64) 13501 return std::make_pair(0U, &PPC::F8RCRegClass); 13502 if (VT == MVT::v4f64 && Subtarget.hasQPX()) 13503 return std::make_pair(0U, &PPC::QFRCRegClass); 13504 if (VT == MVT::v4f32 && Subtarget.hasQPX()) 13505 return std::make_pair(0U, &PPC::QSRCRegClass); 13506 } 13507 break; 13508 case 'v': 13509 if (VT == MVT::v4f64 && Subtarget.hasQPX()) 13510 return std::make_pair(0U, &PPC::QFRCRegClass); 13511 if (VT == MVT::v4f32 && Subtarget.hasQPX()) 13512 return std::make_pair(0U, &PPC::QSRCRegClass); 13513 if (Subtarget.hasAltivec()) 13514 return std::make_pair(0U, &PPC::VRRCRegClass); 13515 break; 13516 case 'y': // crrc 13517 return std::make_pair(0U, &PPC::CRRCRegClass); 13518 } 13519 } else if (Constraint == "wc" && Subtarget.useCRBits()) { 13520 // An individual CR bit. 13521 return std::make_pair(0U, &PPC::CRBITRCRegClass); 13522 } else if ((Constraint == "wa" || Constraint == "wd" || 13523 Constraint == "wf" || Constraint == "wi") && 13524 Subtarget.hasVSX()) { 13525 return std::make_pair(0U, &PPC::VSRCRegClass); 13526 } else if (Constraint == "ws" && Subtarget.hasVSX()) { 13527 if (VT == MVT::f32 && Subtarget.hasP8Vector()) 13528 return std::make_pair(0U, &PPC::VSSRCRegClass); 13529 else 13530 return std::make_pair(0U, &PPC::VSFRCRegClass); 13531 } 13532 13533 std::pair<unsigned, const TargetRegisterClass *> R = 13534 TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 13535 13536 // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers 13537 // (which we call X[0-9]+). 
If a 64-bit value has been requested, and a 13538 // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent 13539 // register. 13540 // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use 13541 // the AsmName field from *RegisterInfo.td, then this would not be necessary. 13542 if (R.first && VT == MVT::i64 && Subtarget.isPPC64() && 13543 PPC::GPRCRegClass.contains(R.first)) 13544 return std::make_pair(TRI->getMatchingSuperReg(R.first, 13545 PPC::sub_32, &PPC::G8RCRegClass), 13546 &PPC::G8RCRegClass); 13547 13548 // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same. 13549 if (!R.second && StringRef("{cc}").equals_lower(Constraint)) { 13550 R.first = PPC::CR0; 13551 R.second = &PPC::CRRCRegClass; 13552 } 13553 13554 return R; 13555 } 13556 13557 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 13558 /// vector. If it is invalid, don't add anything to Ops. 13559 void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 13560 std::string &Constraint, 13561 std::vector<SDValue>&Ops, 13562 SelectionDAG &DAG) const { 13563 SDValue Result; 13564 13565 // Only support length 1 constraints. 13566 if (Constraint.length() > 1) return; 13567 13568 char Letter = Constraint[0]; 13569 switch (Letter) { 13570 default: break; 13571 case 'I': 13572 case 'J': 13573 case 'K': 13574 case 'L': 13575 case 'M': 13576 case 'N': 13577 case 'O': 13578 case 'P': { 13579 ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op); 13580 if (!CST) return; // Must be an immediate to match. 13581 SDLoc dl(Op); 13582 int64_t Value = CST->getSExtValue(); 13583 EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative 13584 // numbers are printed as such. 13585 switch (Letter) { 13586 default: llvm_unreachable("Unknown constraint letter!"); 13587 case 'I': // "I" is a signed 16-bit constant. 13588 if (isInt<16>(Value)) 13589 Result = DAG.getTargetConstant(Value, dl, TCVT); 13590 break; 13591 case 'J': // "J" is a constant with only the high-order 16 bits nonzero. 13592 if (isShiftedUInt<16, 16>(Value)) 13593 Result = DAG.getTargetConstant(Value, dl, TCVT); 13594 break; 13595 case 'L': // "L" is a signed 16-bit constant shifted left 16 bits. 13596 if (isShiftedInt<16, 16>(Value)) 13597 Result = DAG.getTargetConstant(Value, dl, TCVT); 13598 break; 13599 case 'K': // "K" is a constant with only the low-order 16 bits nonzero. 13600 if (isUInt<16>(Value)) 13601 Result = DAG.getTargetConstant(Value, dl, TCVT); 13602 break; 13603 case 'M': // "M" is a constant that is greater than 31. 13604 if (Value > 31) 13605 Result = DAG.getTargetConstant(Value, dl, TCVT); 13606 break; 13607 case 'N': // "N" is a positive constant that is an exact power of two. 13608 if (Value > 0 && isPowerOf2_64(Value)) 13609 Result = DAG.getTargetConstant(Value, dl, TCVT); 13610 break; 13611 case 'O': // "O" is the constant zero. 13612 if (Value == 0) 13613 Result = DAG.getTargetConstant(Value, dl, TCVT); 13614 break; 13615 case 'P': // "P" is a constant whose negation is a signed 16-bit constant. 13616 if (isInt<16>(-Value)) 13617 Result = DAG.getTargetConstant(Value, dl, TCVT); 13618 break; 13619 } 13620 break; 13621 } 13622 } 13623 13624 if (Result.getNode()) { 13625 Ops.push_back(Result); 13626 return; 13627 } 13628 13629 // Handle standard constraint letters. 
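  // For example, generic letters that are not special-cased above, such as
  // 'i' or 'n' for integer immediates, are resolved by the target-independent
  // implementation below.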
13630 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 13631 } 13632 13633 // isLegalAddressingMode - Return true if the addressing mode represented 13634 // by AM is legal for this target, for a load/store of the specified type. 13635 bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL, 13636 const AddrMode &AM, Type *Ty, 13637 unsigned AS, Instruction *I) const { 13638 // PPC does not allow r+i addressing modes for vectors! 13639 if (Ty->isVectorTy() && AM.BaseOffs != 0) 13640 return false; 13641 13642 // PPC allows a sign-extended 16-bit immediate field. 13643 if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1) 13644 return false; 13645 13646 // No global is ever allowed as a base. 13647 if (AM.BaseGV) 13648 return false; 13649 13650 // PPC only support r+r, 13651 switch (AM.Scale) { 13652 case 0: // "r+i" or just "i", depending on HasBaseReg. 13653 break; 13654 case 1: 13655 if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed. 13656 return false; 13657 // Otherwise we have r+r or r+i. 13658 break; 13659 case 2: 13660 if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed. 13661 return false; 13662 // Allow 2*r as r+r. 13663 break; 13664 default: 13665 // No other scales are supported. 13666 return false; 13667 } 13668 13669 return true; 13670 } 13671 13672 SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, 13673 SelectionDAG &DAG) const { 13674 MachineFunction &MF = DAG.getMachineFunction(); 13675 MachineFrameInfo &MFI = MF.getFrameInfo(); 13676 MFI.setReturnAddressIsTaken(true); 13677 13678 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 13679 return SDValue(); 13680 13681 SDLoc dl(Op); 13682 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 13683 13684 // Make sure the function does not optimize away the store of the RA to 13685 // the stack. 13686 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); 13687 FuncInfo->setLRStoreRequired(); 13688 bool isPPC64 = Subtarget.isPPC64(); 13689 auto PtrVT = getPointerTy(MF.getDataLayout()); 13690 13691 if (Depth > 0) { 13692 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 13693 SDValue Offset = 13694 DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl, 13695 isPPC64 ? MVT::i64 : MVT::i32); 13696 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 13697 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), 13698 MachinePointerInfo()); 13699 } 13700 13701 // Just load the return address off the stack. 13702 SDValue RetAddrFI = getReturnAddrFrameIndex(DAG); 13703 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, 13704 MachinePointerInfo()); 13705 } 13706 13707 SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, 13708 SelectionDAG &DAG) const { 13709 SDLoc dl(Op); 13710 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 13711 13712 MachineFunction &MF = DAG.getMachineFunction(); 13713 MachineFrameInfo &MFI = MF.getFrameInfo(); 13714 MFI.setFrameAddressIsTaken(true); 13715 13716 EVT PtrVT = getPointerTy(MF.getDataLayout()); 13717 bool isPPC64 = PtrVT == MVT::i64; 13718 13719 // Naked functions never have a frame pointer, and so we use r1. For all 13720 // other functions, this decision must be delayed until during PEI. 13721 unsigned FrameReg; 13722 if (MF.getFunction().hasFnAttribute(Attribute::Naked)) 13723 FrameReg = isPPC64 ? PPC::X1 : PPC::R1; 13724 else 13725 FrameReg = isPPC64 ? 
                            PPC::FP8 : PPC::FP;

  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
                                         PtrVT);
  while (Depth--)
    FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
                            FrameAddr, MachinePointerInfo());
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                              SelectionDAG &DAG) const {
  bool isPPC64 = Subtarget.isPPC64();
  bool isDarwinABI = Subtarget.isDarwinABI();

  if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) ||
      (!isPPC64 && VT != MVT::i32))
    report_fatal_error("Invalid register global variable type");

  bool is64Bit = isPPC64 && VT == MVT::i64;
  unsigned Reg = StringSwitch<unsigned>(RegName)
                   .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
                   .Case("r2", (isDarwinABI || isPPC64) ? 0 : PPC::R2)
                   .Case("r13", (!isPPC64 && isDarwinABI) ? 0 :
                                  (is64Bit ? PPC::X13 : PPC::R13))
                   .Default(0);

  if (Reg)
    return Reg;
  report_fatal_error("Invalid register name global variable");
}

bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
  // The 32-bit SVR4 ABI accesses everything as got-indirect.
  if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
    return true;

  CodeModel::Model CModel = getTargetMachine().getCodeModel();
  // If it is small or large code model, module locals are accessed
  // indirectly by loading their address from .toc/.got. The difference
  // is that for large code model we have ADDISTocHa + LDtocL and for
  // small code model we simply have LDtoc.
  if (CModel == CodeModel::Small || CModel == CodeModel::Large)
    return true;

  // JumpTable and BlockAddress are accessed as got-indirect.
  if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA))
    return true;

  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
    const GlobalValue *GV = G->getGlobal();
    unsigned char GVFlags = Subtarget.classifyGlobalReference(GV);
    // The NLP flag indicates that a global access has to use an
    // extra indirection.
    if (GVFlags & PPCII::MO_NLP_FLAG)
      return true;
  }

  return false;
}

bool
PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The PowerPC target isn't yet aware of offsets.
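  // Returning false keeps a constant addend separate from the global, so an
  // access such as &GV + 8 is typically emitted as a TOC/GOT-based load of
  // GV's address followed by an explicit add of 8, rather than being folded
  // into the GlobalAddress node itself.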
13791 return false; 13792 } 13793 13794 bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 13795 const CallInst &I, 13796 MachineFunction &MF, 13797 unsigned Intrinsic) const { 13798 switch (Intrinsic) { 13799 case Intrinsic::ppc_qpx_qvlfd: 13800 case Intrinsic::ppc_qpx_qvlfs: 13801 case Intrinsic::ppc_qpx_qvlfcd: 13802 case Intrinsic::ppc_qpx_qvlfcs: 13803 case Intrinsic::ppc_qpx_qvlfiwa: 13804 case Intrinsic::ppc_qpx_qvlfiwz: 13805 case Intrinsic::ppc_altivec_lvx: 13806 case Intrinsic::ppc_altivec_lvxl: 13807 case Intrinsic::ppc_altivec_lvebx: 13808 case Intrinsic::ppc_altivec_lvehx: 13809 case Intrinsic::ppc_altivec_lvewx: 13810 case Intrinsic::ppc_vsx_lxvd2x: 13811 case Intrinsic::ppc_vsx_lxvw4x: { 13812 EVT VT; 13813 switch (Intrinsic) { 13814 case Intrinsic::ppc_altivec_lvebx: 13815 VT = MVT::i8; 13816 break; 13817 case Intrinsic::ppc_altivec_lvehx: 13818 VT = MVT::i16; 13819 break; 13820 case Intrinsic::ppc_altivec_lvewx: 13821 VT = MVT::i32; 13822 break; 13823 case Intrinsic::ppc_vsx_lxvd2x: 13824 VT = MVT::v2f64; 13825 break; 13826 case Intrinsic::ppc_qpx_qvlfd: 13827 VT = MVT::v4f64; 13828 break; 13829 case Intrinsic::ppc_qpx_qvlfs: 13830 VT = MVT::v4f32; 13831 break; 13832 case Intrinsic::ppc_qpx_qvlfcd: 13833 VT = MVT::v2f64; 13834 break; 13835 case Intrinsic::ppc_qpx_qvlfcs: 13836 VT = MVT::v2f32; 13837 break; 13838 default: 13839 VT = MVT::v4i32; 13840 break; 13841 } 13842 13843 Info.opc = ISD::INTRINSIC_W_CHAIN; 13844 Info.memVT = VT; 13845 Info.ptrVal = I.getArgOperand(0); 13846 Info.offset = -VT.getStoreSize()+1; 13847 Info.size = 2*VT.getStoreSize()-1; 13848 Info.align = 1; 13849 Info.flags = MachineMemOperand::MOLoad; 13850 return true; 13851 } 13852 case Intrinsic::ppc_qpx_qvlfda: 13853 case Intrinsic::ppc_qpx_qvlfsa: 13854 case Intrinsic::ppc_qpx_qvlfcda: 13855 case Intrinsic::ppc_qpx_qvlfcsa: 13856 case Intrinsic::ppc_qpx_qvlfiwaa: 13857 case Intrinsic::ppc_qpx_qvlfiwza: { 13858 EVT VT; 13859 switch (Intrinsic) { 13860 case Intrinsic::ppc_qpx_qvlfda: 13861 VT = MVT::v4f64; 13862 break; 13863 case Intrinsic::ppc_qpx_qvlfsa: 13864 VT = MVT::v4f32; 13865 break; 13866 case Intrinsic::ppc_qpx_qvlfcda: 13867 VT = MVT::v2f64; 13868 break; 13869 case Intrinsic::ppc_qpx_qvlfcsa: 13870 VT = MVT::v2f32; 13871 break; 13872 default: 13873 VT = MVT::v4i32; 13874 break; 13875 } 13876 13877 Info.opc = ISD::INTRINSIC_W_CHAIN; 13878 Info.memVT = VT; 13879 Info.ptrVal = I.getArgOperand(0); 13880 Info.offset = 0; 13881 Info.size = VT.getStoreSize(); 13882 Info.align = 1; 13883 Info.flags = MachineMemOperand::MOLoad; 13884 return true; 13885 } 13886 case Intrinsic::ppc_qpx_qvstfd: 13887 case Intrinsic::ppc_qpx_qvstfs: 13888 case Intrinsic::ppc_qpx_qvstfcd: 13889 case Intrinsic::ppc_qpx_qvstfcs: 13890 case Intrinsic::ppc_qpx_qvstfiw: 13891 case Intrinsic::ppc_altivec_stvx: 13892 case Intrinsic::ppc_altivec_stvxl: 13893 case Intrinsic::ppc_altivec_stvebx: 13894 case Intrinsic::ppc_altivec_stvehx: 13895 case Intrinsic::ppc_altivec_stvewx: 13896 case Intrinsic::ppc_vsx_stxvd2x: 13897 case Intrinsic::ppc_vsx_stxvw4x: { 13898 EVT VT; 13899 switch (Intrinsic) { 13900 case Intrinsic::ppc_altivec_stvebx: 13901 VT = MVT::i8; 13902 break; 13903 case Intrinsic::ppc_altivec_stvehx: 13904 VT = MVT::i16; 13905 break; 13906 case Intrinsic::ppc_altivec_stvewx: 13907 VT = MVT::i32; 13908 break; 13909 case Intrinsic::ppc_vsx_stxvd2x: 13910 VT = MVT::v2f64; 13911 break; 13912 case Intrinsic::ppc_qpx_qvstfd: 13913 VT = MVT::v4f64; 13914 break; 13915 case Intrinsic::ppc_qpx_qvstfs: 13916 VT = 
MVT::v4f32; 13917 break; 13918 case Intrinsic::ppc_qpx_qvstfcd: 13919 VT = MVT::v2f64; 13920 break; 13921 case Intrinsic::ppc_qpx_qvstfcs: 13922 VT = MVT::v2f32; 13923 break; 13924 default: 13925 VT = MVT::v4i32; 13926 break; 13927 } 13928 13929 Info.opc = ISD::INTRINSIC_VOID; 13930 Info.memVT = VT; 13931 Info.ptrVal = I.getArgOperand(1); 13932 Info.offset = -VT.getStoreSize()+1; 13933 Info.size = 2*VT.getStoreSize()-1; 13934 Info.align = 1; 13935 Info.flags = MachineMemOperand::MOStore; 13936 return true; 13937 } 13938 case Intrinsic::ppc_qpx_qvstfda: 13939 case Intrinsic::ppc_qpx_qvstfsa: 13940 case Intrinsic::ppc_qpx_qvstfcda: 13941 case Intrinsic::ppc_qpx_qvstfcsa: 13942 case Intrinsic::ppc_qpx_qvstfiwa: { 13943 EVT VT; 13944 switch (Intrinsic) { 13945 case Intrinsic::ppc_qpx_qvstfda: 13946 VT = MVT::v4f64; 13947 break; 13948 case Intrinsic::ppc_qpx_qvstfsa: 13949 VT = MVT::v4f32; 13950 break; 13951 case Intrinsic::ppc_qpx_qvstfcda: 13952 VT = MVT::v2f64; 13953 break; 13954 case Intrinsic::ppc_qpx_qvstfcsa: 13955 VT = MVT::v2f32; 13956 break; 13957 default: 13958 VT = MVT::v4i32; 13959 break; 13960 } 13961 13962 Info.opc = ISD::INTRINSIC_VOID; 13963 Info.memVT = VT; 13964 Info.ptrVal = I.getArgOperand(1); 13965 Info.offset = 0; 13966 Info.size = VT.getStoreSize(); 13967 Info.align = 1; 13968 Info.flags = MachineMemOperand::MOStore; 13969 return true; 13970 } 13971 default: 13972 break; 13973 } 13974 13975 return false; 13976 } 13977 13978 /// getOptimalMemOpType - Returns the target specific optimal type for load 13979 /// and store operations as a result of memset, memcpy, and memmove 13980 /// lowering. If DstAlign is zero that means it's safe to destination 13981 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it 13982 /// means there isn't a need to check it against alignment requirement, 13983 /// probably because the source does not need to be loaded. If 'IsMemset' is 13984 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that 13985 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy 13986 /// source is constant so it does not need to be loaded. 13987 /// It returns EVT::Other if the type should be determined using generic 13988 /// target-independent logic. 13989 EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, 13990 unsigned DstAlign, unsigned SrcAlign, 13991 bool IsMemset, bool ZeroMemset, 13992 bool MemcpyStrSrc, 13993 MachineFunction &MF) const { 13994 if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { 13995 const Function &F = MF.getFunction(); 13996 // When expanding a memset, require at least two QPX instructions to cover 13997 // the cost of loading the value to be stored from the constant pool. 13998 if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) && 13999 (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) && 14000 !F.hasFnAttribute(Attribute::NoImplicitFloat)) { 14001 return MVT::v4f64; 14002 } 14003 14004 // We should use Altivec/VSX loads and stores when available. For unaligned 14005 // addresses, unaligned VSX loads are only fast starting with the P8. 
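    // Worked example (illustrative only): a 32-byte memcpy whose source and
    // destination are both 16-byte aligned returns MVT::v4i32 here on an
    // Altivec/VSX target, so the copy expands to two 16-byte vector
    // load/store pairs instead of four 8-byte integer load/store pairs.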
14006 if (Subtarget.hasAltivec() && Size >= 16 && 14007 (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) || 14008 ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector()))) 14009 return MVT::v4i32; 14010 } 14011 14012 if (Subtarget.isPPC64()) { 14013 return MVT::i64; 14014 } 14015 14016 return MVT::i32; 14017 } 14018 14019 /// Returns true if it is beneficial to convert a load of a constant 14020 /// to just the constant itself. 14021 bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 14022 Type *Ty) const { 14023 assert(Ty->isIntegerTy()); 14024 14025 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 14026 return !(BitSize == 0 || BitSize > 64); 14027 } 14028 14029 bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 14030 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 14031 return false; 14032 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 14033 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 14034 return NumBits1 == 64 && NumBits2 == 32; 14035 } 14036 14037 bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 14038 if (!VT1.isInteger() || !VT2.isInteger()) 14039 return false; 14040 unsigned NumBits1 = VT1.getSizeInBits(); 14041 unsigned NumBits2 = VT2.getSizeInBits(); 14042 return NumBits1 == 64 && NumBits2 == 32; 14043 } 14044 14045 bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 14046 // Generally speaking, zexts are not free, but they are free when they can be 14047 // folded with other operations. 14048 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) { 14049 EVT MemVT = LD->getMemoryVT(); 14050 if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 || 14051 (Subtarget.isPPC64() && MemVT == MVT::i32)) && 14052 (LD->getExtensionType() == ISD::NON_EXTLOAD || 14053 LD->getExtensionType() == ISD::ZEXTLOAD)) 14054 return true; 14055 } 14056 14057 // FIXME: Add other cases... 14058 // - 32-bit shifts with a zext to i64 14059 // - zext after ctlz, bswap, etc. 14060 // - zext after and by a constant mask 14061 14062 return TargetLowering::isZExtFree(Val, VT2); 14063 } 14064 14065 bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const { 14066 assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() && 14067 "invalid fpext types"); 14068 // Extending to float128 is not free. 14069 if (DestVT == MVT::f128) 14070 return false; 14071 return true; 14072 } 14073 14074 bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const { 14075 return isInt<16>(Imm) || isUInt<16>(Imm); 14076 } 14077 14078 bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const { 14079 return isInt<16>(Imm) || isUInt<16>(Imm); 14080 } 14081 14082 bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 14083 unsigned, 14084 unsigned, 14085 bool *Fast) const { 14086 if (DisablePPCUnaligned) 14087 return false; 14088 14089 // PowerPC supports unaligned memory access for simple non-vector types. 14090 // Although accessing unaligned addresses is not as efficient as accessing 14091 // aligned addresses, it is generally more efficient than manual expansion, 14092 // and generally only traps for software emulation when crossing page 14093 // boundaries. 
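  // For example, an unaligned MVT::i32 or MVT::i64 access is allowed and
  // reported as fast below, vector types such as MVT::v4f32 are only allowed
  // when VSX is available, and MVT::ppcf128 is always rejected.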
14094 14095 if (!VT.isSimple()) 14096 return false; 14097 14098 if (VT.getSimpleVT().isVector()) { 14099 if (Subtarget.hasVSX()) { 14100 if (VT != MVT::v2f64 && VT != MVT::v2i64 && 14101 VT != MVT::v4f32 && VT != MVT::v4i32) 14102 return false; 14103 } else { 14104 return false; 14105 } 14106 } 14107 14108 if (VT == MVT::ppcf128) 14109 return false; 14110 14111 if (Fast) 14112 *Fast = true; 14113 14114 return true; 14115 } 14116 14117 bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 14118 VT = VT.getScalarType(); 14119 14120 if (!VT.isSimple()) 14121 return false; 14122 14123 switch (VT.getSimpleVT().SimpleTy) { 14124 case MVT::f32: 14125 case MVT::f64: 14126 return true; 14127 case MVT::f128: 14128 return (EnableQuadPrecision && Subtarget.hasP9Vector()); 14129 default: 14130 break; 14131 } 14132 14133 return false; 14134 } 14135 14136 const MCPhysReg * 14137 PPCTargetLowering::getScratchRegisters(CallingConv::ID) const { 14138 // LR is a callee-save register, but we must treat it as clobbered by any call 14139 // site. Hence we include LR in the scratch registers, which are in turn added 14140 // as implicit-defs for stackmaps and patchpoints. The same reasoning applies 14141 // to CTR, which is used by any indirect call. 14142 static const MCPhysReg ScratchRegs[] = { 14143 PPC::X12, PPC::LR8, PPC::CTR8, 0 14144 }; 14145 14146 return ScratchRegs; 14147 } 14148 14149 unsigned PPCTargetLowering::getExceptionPointerRegister( 14150 const Constant *PersonalityFn) const { 14151 return Subtarget.isPPC64() ? PPC::X3 : PPC::R3; 14152 } 14153 14154 unsigned PPCTargetLowering::getExceptionSelectorRegister( 14155 const Constant *PersonalityFn) const { 14156 return Subtarget.isPPC64() ? PPC::X4 : PPC::R4; 14157 } 14158 14159 bool 14160 PPCTargetLowering::shouldExpandBuildVectorWithShuffles( 14161 EVT VT , unsigned DefinedValues) const { 14162 if (VT == MVT::v2i64) 14163 return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves 14164 14165 if (Subtarget.hasVSX() || Subtarget.hasQPX()) 14166 return true; 14167 14168 return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); 14169 } 14170 14171 Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const { 14172 if (DisableILPPref || Subtarget.enableMachineScheduler()) 14173 return TargetLowering::getSchedulingPreference(N); 14174 14175 return Sched::ILP; 14176 } 14177 14178 // Create a fast isel object. 
14179 FastISel * 14180 PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo, 14181 const TargetLibraryInfo *LibInfo) const { 14182 return PPC::createFastISel(FuncInfo, LibInfo); 14183 } 14184 14185 void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { 14186 if (Subtarget.isDarwinABI()) return; 14187 if (!Subtarget.isPPC64()) return; 14188 14189 // Update IsSplitCSR in PPCFunctionInfo 14190 PPCFunctionInfo *PFI = Entry->getParent()->getInfo<PPCFunctionInfo>(); 14191 PFI->setIsSplitCSR(true); 14192 } 14193 14194 void PPCTargetLowering::insertCopiesSplitCSR( 14195 MachineBasicBlock *Entry, 14196 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 14197 const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); 14198 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 14199 if (!IStart) 14200 return; 14201 14202 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 14203 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 14204 MachineBasicBlock::iterator MBBI = Entry->begin(); 14205 for (const MCPhysReg *I = IStart; *I; ++I) { 14206 const TargetRegisterClass *RC = nullptr; 14207 if (PPC::G8RCRegClass.contains(*I)) 14208 RC = &PPC::G8RCRegClass; 14209 else if (PPC::F8RCRegClass.contains(*I)) 14210 RC = &PPC::F8RCRegClass; 14211 else if (PPC::CRRCRegClass.contains(*I)) 14212 RC = &PPC::CRRCRegClass; 14213 else if (PPC::VRRCRegClass.contains(*I)) 14214 RC = &PPC::VRRCRegClass; 14215 else 14216 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 14217 14218 unsigned NewVR = MRI->createVirtualRegister(RC); 14219 // Create copy from CSR to a virtual register. 14220 // FIXME: this currently does not emit CFI pseudo-instructions, it works 14221 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be 14222 // nounwind. If we want to generalize this later, we may need to emit 14223 // CFI pseudo-instructions. 14224 assert(Entry->getParent()->getFunction().hasFnAttribute( 14225 Attribute::NoUnwind) && 14226 "Function should be nounwind in insertCopiesSplitCSR!"); 14227 Entry->addLiveIn(*I); 14228 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 14229 .addReg(*I); 14230 14231 // Insert the copy-back instructions right before the terminator 14232 for (auto *Exit : Exits) 14233 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 14234 TII->get(TargetOpcode::COPY), *I) 14235 .addReg(NewVR); 14236 } 14237 } 14238 14239 // Override to enable LOAD_STACK_GUARD lowering on Linux. 14240 bool PPCTargetLowering::useLoadStackGuardNode() const { 14241 if (!Subtarget.isTargetLinux()) 14242 return TargetLowering::useLoadStackGuardNode(); 14243 return true; 14244 } 14245 14246 // Override to disable global variable loading on Linux. 14247 void PPCTargetLowering::insertSSPDeclarations(Module &M) const { 14248 if (!Subtarget.isTargetLinux()) 14249 return TargetLowering::insertSSPDeclarations(M); 14250 } 14251 14252 bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 14253 if (!VT.isSimple() || !Subtarget.hasVSX()) 14254 return false; 14255 14256 switch(VT.getSimpleVT().SimpleTy) { 14257 default: 14258 // For FP types that are currently not supported by PPC backend, return 14259 // false. Examples: f16, f80. 
14260 return false; 14261 case MVT::f32: 14262 case MVT::f64: 14263 case MVT::ppcf128: 14264 return Imm.isPosZero(); 14265 } 14266 } 14267 14268 // For vector shift operation op, fold 14269 // (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y) 14270 static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N, 14271 SelectionDAG &DAG) { 14272 SDValue N0 = N->getOperand(0); 14273 SDValue N1 = N->getOperand(1); 14274 EVT VT = N0.getValueType(); 14275 unsigned OpSizeInBits = VT.getScalarSizeInBits(); 14276 unsigned Opcode = N->getOpcode(); 14277 unsigned TargetOpcode; 14278 14279 switch (Opcode) { 14280 default: 14281 llvm_unreachable("Unexpected shift operation"); 14282 case ISD::SHL: 14283 TargetOpcode = PPCISD::SHL; 14284 break; 14285 case ISD::SRL: 14286 TargetOpcode = PPCISD::SRL; 14287 break; 14288 case ISD::SRA: 14289 TargetOpcode = PPCISD::SRA; 14290 break; 14291 } 14292 14293 if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) && 14294 N1->getOpcode() == ISD::AND) 14295 if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) 14296 if (Mask->getZExtValue() == OpSizeInBits - 1) 14297 return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0)); 14298 14299 return SDValue(); 14300 } 14301 14302 SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const { 14303 if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) 14304 return Value; 14305 14306 SDValue N0 = N->getOperand(0); 14307 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1)); 14308 if (!Subtarget.isISA3_0() || 14309 N0.getOpcode() != ISD::SIGN_EXTEND || 14310 N0.getOperand(0).getValueType() != MVT::i32 || 14311 CN1 == nullptr || N->getValueType(0) != MVT::i64) 14312 return SDValue(); 14313 14314 // We can't save an operation here if the value is already extended, and 14315 // the existing shift is easier to combine. 14316 SDValue ExtsSrc = N0.getOperand(0); 14317 if (ExtsSrc.getOpcode() == ISD::TRUNCATE && 14318 ExtsSrc.getOperand(0).getOpcode() == ISD::AssertSext) 14319 return SDValue(); 14320 14321 SDLoc DL(N0); 14322 SDValue ShiftBy = SDValue(CN1, 0); 14323 // We want the shift amount to be i32 on the extswli, but the shift could 14324 // have an i64. 
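  // End-to-end illustration of this combine: on an ISA 3.0 (Power9) target,
  //   (shl (sign_extend i32:X), 3)
  // with an i64 result becomes (EXTSWSLI X, 3), which selects to a single
  // extswsli instruction instead of a sign-extension followed by a separate
  // 64-bit shift.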
14325 if (ShiftBy.getValueType() == MVT::i64) 14326 ShiftBy = DCI.DAG.getConstant(CN1->getZExtValue(), DL, MVT::i32); 14327 14328 return DCI.DAG.getNode(PPCISD::EXTSWSLI, DL, MVT::i64, N0->getOperand(0), 14329 ShiftBy); 14330 } 14331 14332 SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const { 14333 if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) 14334 return Value; 14335 14336 return SDValue(); 14337 } 14338 14339 SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const { 14340 if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) 14341 return Value; 14342 14343 return SDValue(); 14344 } 14345 14346 // Transform (add X, (zext(setne Z, C))) -> (addze X, (addic (addi Z, -C), -1)) 14347 // Transform (add X, (zext(sete Z, C))) -> (addze X, (subfic (addi Z, -C), 0)) 14348 // When C is zero, the equation (addi Z, -C) can be simplified to Z 14349 // Requirement: -C in [-32768, 32767], X and Z are MVT::i64 types 14350 static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, 14351 const PPCSubtarget &Subtarget) { 14352 if (!Subtarget.isPPC64()) 14353 return SDValue(); 14354 14355 SDValue LHS = N->getOperand(0); 14356 SDValue RHS = N->getOperand(1); 14357 14358 auto isZextOfCompareWithConstant = [](SDValue Op) { 14359 if (Op.getOpcode() != ISD::ZERO_EXTEND || !Op.hasOneUse() || 14360 Op.getValueType() != MVT::i64) 14361 return false; 14362 14363 SDValue Cmp = Op.getOperand(0); 14364 if (Cmp.getOpcode() != ISD::SETCC || !Cmp.hasOneUse() || 14365 Cmp.getOperand(0).getValueType() != MVT::i64) 14366 return false; 14367 14368 if (auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1))) { 14369 int64_t NegConstant = 0 - Constant->getSExtValue(); 14370 // Due to the limitations of the addi instruction, 14371 // -C is required to be [-32768, 32767]. 14372 return isInt<16>(NegConstant); 14373 } 14374 14375 return false; 14376 }; 14377 14378 bool LHSHasPattern = isZextOfCompareWithConstant(LHS); 14379 bool RHSHasPattern = isZextOfCompareWithConstant(RHS); 14380 14381 // If there is a pattern, canonicalize a zext operand to the RHS. 14382 if (LHSHasPattern && !RHSHasPattern) 14383 std::swap(LHS, RHS); 14384 else if (!LHSHasPattern && !RHSHasPattern) 14385 return SDValue(); 14386 14387 SDLoc DL(N); 14388 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Glue); 14389 SDValue Cmp = RHS.getOperand(0); 14390 SDValue Z = Cmp.getOperand(0); 14391 auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1)); 14392 14393 assert(Constant && "Constant Should not be a null pointer."); 14394 int64_t NegConstant = 0 - Constant->getSExtValue(); 14395 14396 switch(cast<CondCodeSDNode>(Cmp.getOperand(2))->get()) { 14397 default: break; 14398 case ISD::SETNE: { 14399 // when C == 0 14400 // --> addze X, (addic Z, -1).carry 14401 // / 14402 // add X, (zext(setne Z, C))-- 14403 // \ when -32768 <= -C <= 32767 && C != 0 14404 // --> addze X, (addic (addi Z, -C), -1).carry 14405 SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z, 14406 DAG.getConstant(NegConstant, DL, MVT::i64)); 14407 SDValue AddOrZ = NegConstant != 0 ? 
                                        Add : Z;
    SDValue Addc = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i64, MVT::Glue),
                               AddOrZ, DAG.getConstant(-1ULL, DL, MVT::i64));
    return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64),
                       SDValue(Addc.getNode(), 1));
  }
  case ISD::SETEQ: {
    //                                 when C == 0
    //                             --> addze X, (subfic Z, 0).carry
    //                            /
    // add X, (zext(sete  Z, C))--
    //                            \    when -32768 <= -C <= 32767 && C != 0
    //                             --> addze X, (subfic (addi Z, -C), 0).carry
    SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z,
                              DAG.getConstant(NegConstant, DL, MVT::i64));
    SDValue AddOrZ = NegConstant != 0 ? Add : Z;
    SDValue Subc = DAG.getNode(ISD::SUBC, DL, DAG.getVTList(MVT::i64, MVT::Glue),
                               DAG.getConstant(0, DL, MVT::i64), AddOrZ);
    return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64),
                       SDValue(Subc.getNode(), 1));
  }
  }

  return SDValue();
}

SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget))
    return Value;

  return SDValue();
}

// Detect TRUNCATE operations on bitcasts of float128 values.
// What we are looking for here is the situation where we extract a subset
// of bits from a 128-bit float.
// This can be of two forms:
// 1) BITCAST of f128 feeding TRUNCATE
// 2) BITCAST of f128 feeding SRL (a shift) feeding TRUNCATE
// This is required because we do not have a legal i128 type, and so we want
// to prevent having to store the f128 and then reload part of it.
SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  // If we are using CRBits then try that first.
  if (Subtarget.useCRBits()) {
    // Check if CRBits did anything and return that if it did.
    if (SDValue CRTruncValue = DAGCombineTruncBoolExt(N, DCI))
      return CRTruncValue;
  }

  SDLoc dl(N);
  SDValue Op0 = N->getOperand(0);

  // Looking for a truncate of i128 to i64.
  if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64)
    return SDValue();

  int EltToExtract = DCI.DAG.getDataLayout().isBigEndian() ? 1 : 0;

  // SRL feeding TRUNCATE.
  if (Op0.getOpcode() == ISD::SRL) {
    ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
    // The right shift has to be by 64 bits.
    if (!ConstNode || ConstNode->getZExtValue() != 64)
      return SDValue();

    // Switch the element number to extract.
    EltToExtract = EltToExtract ? 0 : 1;
    // Update Op0 past the SRL.
    Op0 = Op0.getOperand(0);
  }

  // BITCAST feeding a TRUNCATE possibly via SRL.
  if (Op0.getOpcode() == ISD::BITCAST &&
      Op0.getValueType() == MVT::i128 &&
      Op0.getOperand(0).getValueType() == MVT::f128) {
    SDValue Bitcast = DCI.DAG.getBitcast(MVT::v2i64, Op0.getOperand(0));
    return DCI.DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Bitcast,
        DCI.DAG.getTargetConstant(EltToExtract, dl, MVT::i32));
  }
  return SDValue();
}

bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  // Only duplicate to increase tail-calls for the 64-bit SysV ABIs.
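  // (This hook is queried by IR-level return duplication, e.g. in
  // CodeGenPrepare, to decide whether duplicating a return block may allow
  // CI to become a tail call; the checks below are heuristics for when that
  // is likely to pay off, not a guarantee of tail-call emission.)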
14494 if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64()) 14495 return false; 14496 14497 // If not a tail call then no need to proceed. 14498 if (!CI->isTailCall()) 14499 return false; 14500 14501 // If tail calls are disabled for the caller then we are done. 14502 const Function *Caller = CI->getParent()->getParent(); 14503 auto Attr = Caller->getFnAttribute("disable-tail-calls"); 14504 if (Attr.getValueAsString() == "true") 14505 return false; 14506 14507 // If sibling calls have been disabled and tail-calls aren't guaranteed 14508 // there is no reason to duplicate. 14509 auto &TM = getTargetMachine(); 14510 if (!TM.Options.GuaranteedTailCallOpt && DisableSCO) 14511 return false; 14512 14513 // Can't tail call a function called indirectly, or if it has variadic args. 14514 const Function *Callee = CI->getCalledFunction(); 14515 if (!Callee || Callee->isVarArg()) 14516 return false; 14517 14518 // Make sure the callee and caller calling conventions are eligible for tco. 14519 if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(), 14520 CI->getCallingConv())) 14521 return false; 14522 14523 // If the function is local then we have a good chance at tail-calling it 14524 return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee); 14525 } 14526 14527 bool PPCTargetLowering::hasBitPreservingFPLogic(EVT VT) const { 14528 if (!Subtarget.hasVSX()) 14529 return false; 14530 if (Subtarget.hasP9Vector() && VT == MVT::f128) 14531 return true; 14532 return VT == MVT::f32 || VT == MVT::f64 || 14533 VT == MVT::v4f32 || VT == MVT::v2f64; 14534 } 14535 14536 bool PPCTargetLowering:: 14537 isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const { 14538 const Value *Mask = AndI.getOperand(1); 14539 // If the mask is suitable for andi. or andis. we should sink the and. 14540 if (const ConstantInt *CI = dyn_cast<ConstantInt>(Mask)) { 14541 // Can't handle constants wider than 64-bits. 14542 if (CI->getBitWidth() > 64) 14543 return false; 14544 int64_t ConstVal = CI->getZExtValue(); 14545 return isUInt<16>(ConstVal) || 14546 (isUInt<16>(ConstVal >> 16) && !(ConstVal & 0xFFFF)); 14547 } 14548 14549 // For non-constant masks, we can always use the record-form and. 14550 return true; 14551 } 14552 14553 // Transform (abs (sub (zext a), (zext b))) to (vabsd a b 0) 14554 // Transform (abs (sub (zext a), (zext_invec b))) to (vabsd a b 0) 14555 // Transform (abs (sub (zext_invec a), (zext_invec b))) to (vabsd a b 0) 14556 // Transform (abs (sub (zext_invec a), (zext b))) to (vabsd a b 0) 14557 // Transform (abs (sub a, b) to (vabsd a b 1)) if a & b of type v4i32 14558 SDValue PPCTargetLowering::combineABS(SDNode *N, DAGCombinerInfo &DCI) const { 14559 assert((N->getOpcode() == ISD::ABS) && "Need ABS node here"); 14560 assert(Subtarget.hasP9Altivec() && 14561 "Only combine this when P9 altivec supported!"); 14562 EVT VT = N->getValueType(0); 14563 if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8) 14564 return SDValue(); 14565 14566 SelectionDAG &DAG = DCI.DAG; 14567 SDLoc dl(N); 14568 if (N->getOperand(0).getOpcode() == ISD::SUB) { 14569 // Even for signed integers, if it's known to be positive (as signed 14570 // integer) due to zero-extended inputs. 
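    // Illustrative case: in (abs (sub (zext_invec a), (zext_invec b))), both
    // operands of the SUB are known non-negative, so the signed absolute
    // difference matches the unsigned one and the VABSD node below (with a
    // last operand of 0) can select directly to vabsdub/vabsduh/vabsduw on
    // Power9.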
    unsigned SubOpcd0 = N->getOperand(0)->getOperand(0).getOpcode();
    unsigned SubOpcd1 = N->getOperand(0)->getOperand(1).getOpcode();
    if ((SubOpcd0 == ISD::ZERO_EXTEND ||
         SubOpcd0 == ISD::ZERO_EXTEND_VECTOR_INREG) &&
        (SubOpcd1 == ISD::ZERO_EXTEND ||
         SubOpcd1 == ISD::ZERO_EXTEND_VECTOR_INREG)) {
      return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
                         N->getOperand(0)->getOperand(0),
                         N->getOperand(0)->getOperand(1),
                         DAG.getTargetConstant(0, dl, MVT::i32));
    }

    // For type v4i32, it can be optimized with xvnegsp + vabsduw
    if (N->getOperand(0).getValueType() == MVT::v4i32 &&
        N->getOperand(0).hasOneUse()) {
      return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
                         N->getOperand(0)->getOperand(0),
                         N->getOperand(0)->getOperand(1),
                         DAG.getTargetConstant(1, dl, MVT::i32));
    }
  }

  return SDValue();
}

// For type v4i32/v8i16/v16i8, transform
// from (vselect (setcc a, b, setugt), (sub a, b), (sub b, a)) to (vabsd a, b)
// from (vselect (setcc a, b, setuge), (sub a, b), (sub b, a)) to (vabsd a, b)
// from (vselect (setcc a, b, setult), (sub b, a), (sub a, b)) to (vabsd a, b)
// from (vselect (setcc a, b, setule), (sub b, a), (sub a, b)) to (vabsd a, b)
SDValue PPCTargetLowering::combineVSelect(SDNode *N,
                                          DAGCombinerInfo &DCI) const {
  assert((N->getOpcode() == ISD::VSELECT) && "Need VSELECT node here");
  assert(Subtarget.hasP9Altivec() &&
         "Only combine this when P9 altivec supported!");

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue Cond = N->getOperand(0);
  SDValue TrueOpnd = N->getOperand(1);
  SDValue FalseOpnd = N->getOperand(2);
  EVT VT = N->getOperand(1).getValueType();

  if (Cond.getOpcode() != ISD::SETCC || TrueOpnd.getOpcode() != ISD::SUB ||
      FalseOpnd.getOpcode() != ISD::SUB)
    return SDValue();

  // ABSD is only available for type v4i32/v8i16/v16i8.
  if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
    return SDValue();

  // Require at least one single-use operand so the combine saves at least
  // one dependent computation.
  if (!(Cond.hasOneUse() || TrueOpnd.hasOneUse() || FalseOpnd.hasOneUse()))
    return SDValue();

  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

  // Can only handle unsigned comparison here.
  switch (CC) {
  default:
    return SDValue();
  case ISD::SETUGT:
  case ISD::SETUGE:
    break;
  case ISD::SETULT:
  case ISD::SETULE:
    std::swap(TrueOpnd, FalseOpnd);
    break;
  }

  SDValue CmpOpnd1 = Cond.getOperand(0);
  SDValue CmpOpnd2 = Cond.getOperand(1);

  // SETCC CmpOpnd1 CmpOpnd2 cond
  // TrueOpnd = CmpOpnd1 - CmpOpnd2
  // FalseOpnd = CmpOpnd2 - CmpOpnd1
  if (TrueOpnd.getOperand(0) == CmpOpnd1 &&
      TrueOpnd.getOperand(1) == CmpOpnd2 &&
      FalseOpnd.getOperand(0) == CmpOpnd2 &&
      FalseOpnd.getOperand(1) == CmpOpnd1) {
    return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(1).getValueType(),
                       CmpOpnd1, CmpOpnd2,
                       DAG.getTargetConstant(0, dl, MVT::i32));
  }

  return SDValue();
}