1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Custom DAG lowering for SI 12 // 13 //===----------------------------------------------------------------------===// 14 15 #ifdef _MSC_VER 16 // Provide M_PI. 17 #define _USE_MATH_DEFINES 18 #include <cmath> 19 #endif 20 21 #include "SIISelLowering.h" 22 #include "AMDGPU.h" 23 #include "AMDGPUIntrinsicInfo.h" 24 #include "AMDGPUSubtarget.h" 25 #include "SIInstrInfo.h" 26 #include "SIMachineFunctionInfo.h" 27 #include "SIRegisterInfo.h" 28 #include "llvm/ADT/BitVector.h" 29 #include "llvm/CodeGen/CallingConvLower.h" 30 #include "llvm/CodeGen/MachineInstrBuilder.h" 31 #include "llvm/CodeGen/MachineRegisterInfo.h" 32 #include "llvm/CodeGen/SelectionDAG.h" 33 #include "llvm/IR/Function.h" 34 #include "llvm/ADT/SmallString.h" 35 36 using namespace llvm; 37 38 SITargetLowering::SITargetLowering(TargetMachine &TM, 39 const AMDGPUSubtarget &STI) 40 : AMDGPUTargetLowering(TM, STI) { 41 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); 42 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); 43 44 addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); 45 addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); 46 47 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); 48 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); 49 50 addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); 51 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); 52 addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); 53 54 addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); 55 addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); 56 57 addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); 58 
addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); 59 60 addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); 61 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); 62 63 computeRegisterProperties(STI.getRegisterInfo()); 64 65 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); 66 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); 67 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); 68 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); 69 70 setOperationAction(ISD::ADD, MVT::i32, Legal); 71 setOperationAction(ISD::ADDC, MVT::i32, Legal); 72 setOperationAction(ISD::ADDE, MVT::i32, Legal); 73 setOperationAction(ISD::SUBC, MVT::i32, Legal); 74 setOperationAction(ISD::SUBE, MVT::i32, Legal); 75 76 setOperationAction(ISD::FSIN, MVT::f32, Custom); 77 setOperationAction(ISD::FCOS, MVT::f32, Custom); 78 79 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 80 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 81 82 // We need to custom lower vector stores from local memory 83 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 84 setOperationAction(ISD::LOAD, MVT::v8i32, Custom); 85 setOperationAction(ISD::LOAD, MVT::v16i32, Custom); 86 87 setOperationAction(ISD::STORE, MVT::v8i32, Custom); 88 setOperationAction(ISD::STORE, MVT::v16i32, Custom); 89 90 setOperationAction(ISD::STORE, MVT::i1, Custom); 91 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 92 93 setOperationAction(ISD::SELECT, MVT::i64, Custom); 94 setOperationAction(ISD::SELECT, MVT::f64, Promote); 95 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); 96 97 setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); 98 setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); 99 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); 100 setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); 101 102 setOperationAction(ISD::SETCC, MVT::v2i1, Expand); 103 setOperationAction(ISD::SETCC, MVT::v4i1, Expand); 104 105 setOperationAction(ISD::BSWAP, MVT::i32, Legal); 106 
107 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); 108 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); 109 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); 110 111 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); 112 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); 113 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); 114 115 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); 116 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); 117 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); 118 119 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 120 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); 121 122 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 123 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); 124 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); 125 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); 126 127 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 128 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 129 130 for (MVT VT : MVT::integer_valuetypes()) { 131 if (VT == MVT::i64) 132 continue; 133 134 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 135 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); 136 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); 137 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); 138 139 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 140 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); 141 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); 142 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); 143 144 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); 145 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); 146 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); 147 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); 148 } 149 150 for (MVT VT : 
MVT::integer_vector_valuetypes()) { 151 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); 152 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); 153 } 154 155 for (MVT VT : MVT::fp_valuetypes()) 156 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 157 158 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 159 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); 160 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); 161 162 setOperationAction(ISD::LOAD, MVT::i1, Custom); 163 164 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 165 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 166 setOperationAction(ISD::FrameIndex, MVT::i32, Custom); 167 168 // These should use UDIVREM, so set them to expand 169 setOperationAction(ISD::UDIV, MVT::i64, Expand); 170 setOperationAction(ISD::UREM, MVT::i64, Expand); 171 172 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); 173 setOperationAction(ISD::SELECT, MVT::i1, Promote); 174 175 // We only support LOAD/STORE and vector manipulation ops for vectors 176 // with > 4 elements. 
177 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { 178 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { 179 switch(Op) { 180 case ISD::LOAD: 181 case ISD::STORE: 182 case ISD::BUILD_VECTOR: 183 case ISD::BITCAST: 184 case ISD::EXTRACT_VECTOR_ELT: 185 case ISD::INSERT_VECTOR_ELT: 186 case ISD::INSERT_SUBVECTOR: 187 case ISD::EXTRACT_SUBVECTOR: 188 break; 189 case ISD::CONCAT_VECTORS: 190 setOperationAction(Op, VT, Custom); 191 break; 192 default: 193 setOperationAction(Op, VT, Expand); 194 break; 195 } 196 } 197 } 198 199 if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 200 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 201 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 202 setOperationAction(ISD::FRINT, MVT::f64, Legal); 203 } 204 205 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 206 setOperationAction(ISD::FDIV, MVT::f32, Custom); 207 setOperationAction(ISD::FDIV, MVT::f64, Custom); 208 209 setTargetDAGCombine(ISD::FADD); 210 setTargetDAGCombine(ISD::FSUB); 211 setTargetDAGCombine(ISD::FMINNUM); 212 setTargetDAGCombine(ISD::FMAXNUM); 213 setTargetDAGCombine(ISD::SMIN); 214 setTargetDAGCombine(ISD::SMAX); 215 setTargetDAGCombine(ISD::UMIN); 216 setTargetDAGCombine(ISD::UMAX); 217 setTargetDAGCombine(ISD::SELECT_CC); 218 setTargetDAGCombine(ISD::SETCC); 219 setTargetDAGCombine(ISD::AND); 220 setTargetDAGCombine(ISD::OR); 221 setTargetDAGCombine(ISD::UINT_TO_FP); 222 223 // All memory operations. Some folding on the pointer operand is done to help 224 // matching the constant offsets in the addressing modes. 
225 setTargetDAGCombine(ISD::LOAD); 226 setTargetDAGCombine(ISD::STORE); 227 setTargetDAGCombine(ISD::ATOMIC_LOAD); 228 setTargetDAGCombine(ISD::ATOMIC_STORE); 229 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); 230 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); 231 setTargetDAGCombine(ISD::ATOMIC_SWAP); 232 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); 233 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); 234 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); 235 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); 236 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); 237 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); 238 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); 239 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); 240 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); 241 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); 242 243 setSchedulingPreference(Sched::RegPressure); 244 } 245 246 //===----------------------------------------------------------------------===// 247 // TargetLowering queries 248 //===----------------------------------------------------------------------===// 249 250 bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, 251 EVT) const { 252 // SI has some legal vector types, but no legal vector operations. Say no 253 // shuffles are legal in order to prefer scalarizing some vector operations. 254 return false; 255 } 256 257 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, 258 const AddrMode &AM, Type *Ty, 259 unsigned AS) const { 260 // No global is ever allowed as a base. 261 if (AM.BaseGV) 262 return false; 263 264 switch (AS) { 265 case AMDGPUAS::GLOBAL_ADDRESS: 266 case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions? 267 case AMDGPUAS::PRIVATE_ADDRESS: 268 case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: { 269 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and 270 // additionally can do r + r + i with addr64. 32-bit has more addressing 271 // mode options. 
Depending on the resource constant, it can also do 272 // (i64 r0) + (i32 r1) * (i14 i). 273 // 274 // SMRD instructions have an 8-bit, dword offset. 275 // 276 // Assume nonunifom access, since the address space isn't enough to know 277 // what instruction we will use, and since we don't know if this is a load 278 // or store and scalar stores are only available on VI. 279 // 280 // We also know if we are doing an extload, we can't do a scalar load. 281 // 282 // Private arrays end up using a scratch buffer most of the time, so also 283 // assume those use MUBUF instructions. Scratch loads / stores are currently 284 // implemented as mubuf instructions with offen bit set, so slightly 285 // different than the normal addr64. 286 if (!isUInt<12>(AM.BaseOffs)) 287 return false; 288 289 // FIXME: Since we can split immediate into soffset and immediate offset, 290 // would it make sense to allow any immediate? 291 292 switch (AM.Scale) { 293 case 0: // r + i or just i, depending on HasBaseReg. 294 return true; 295 case 1: 296 return true; // We have r + r or r + i. 297 case 2: 298 if (AM.HasBaseReg) { 299 // Reject 2 * r + r. 300 return false; 301 } 302 303 // Allow 2 * r as r + r 304 // Or 2 * r + i is allowed as r + r + i. 305 return true; 306 default: // Don't allow n * r 307 return false; 308 } 309 } 310 case AMDGPUAS::LOCAL_ADDRESS: 311 case AMDGPUAS::REGION_ADDRESS: { 312 // Basic, single offset DS instructions allow a 16-bit unsigned immediate 313 // field. 314 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have 315 // an 8-bit dword offset but we don't know the alignment here. 316 if (!isUInt<16>(AM.BaseOffs)) 317 return false; 318 319 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 320 return true; 321 322 if (AM.Scale == 1 && AM.HasBaseReg) 323 return true; 324 325 return false; 326 } 327 case AMDGPUAS::FLAT_ADDRESS: { 328 // Flat instructions do not have offsets, and only have the register 329 // address. 
330 return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); 331 } 332 default: 333 llvm_unreachable("unhandled address space"); 334 } 335 } 336 337 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 338 unsigned AddrSpace, 339 unsigned Align, 340 bool *IsFast) const { 341 if (IsFast) 342 *IsFast = false; 343 344 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, 345 // which isn't a simple VT. 346 if (!VT.isSimple() || VT == MVT::Other) 347 return false; 348 349 // TODO - CI+ supports unaligned memory accesses, but this requires driver 350 // support. 351 352 // XXX - The only mention I see of this in the ISA manual is for LDS direct 353 // reads the "byte address and must be dword aligned". Is it also true for the 354 // normal loads and stores? 355 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { 356 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte 357 // aligned, 8 byte access in a single operation using ds_read2/write2_b32 358 // with adjacent offsets. 359 return Align % 4 == 0; 360 } 361 362 // Smaller than dword value must be aligned. 363 // FIXME: This should be allowed on CI+ 364 if (VT.bitsLT(MVT::i32)) 365 return false; 366 367 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the 368 // byte-address are ignored, thus forcing Dword alignment. 369 // This applies to private, global, and constant memory. 370 if (IsFast) 371 *IsFast = true; 372 373 return VT.bitsGT(MVT::i32) && Align % 4 == 0; 374 } 375 376 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, 377 unsigned SrcAlign, bool IsMemset, 378 bool ZeroMemset, 379 bool MemcpyStrSrc, 380 MachineFunction &MF) const { 381 // FIXME: Should account for address space here. 382 383 // The default fallback uses the private pointer size as a guess for a type to 384 // use. Make sure we switch these to 64-bit accesses. 
385 386 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global 387 return MVT::v4i32; 388 389 if (Size >= 8 && DstAlign >= 4) 390 return MVT::v2i32; 391 392 // Use the default. 393 return MVT::Other; 394 } 395 396 TargetLoweringBase::LegalizeTypeAction 397 SITargetLowering::getPreferredVectorAction(EVT VT) const { 398 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) 399 return TypeSplitVector; 400 401 return TargetLoweringBase::getPreferredVectorAction(VT); 402 } 403 404 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 405 Type *Ty) const { 406 const SIInstrInfo *TII = 407 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 408 return TII->isInlineConstant(Imm); 409 } 410 411 static EVT toIntegerVT(EVT VT) { 412 if (VT.isVector()) 413 return VT.changeVectorElementTypeToInteger(); 414 return MVT::getIntegerVT(VT.getSizeInBits()); 415 } 416 417 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, 418 SDLoc SL, SDValue Chain, 419 unsigned Offset, bool Signed) const { 420 const DataLayout &DL = DAG.getDataLayout(); 421 MachineFunction &MF = DAG.getMachineFunction(); 422 const SIRegisterInfo *TRI = 423 static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); 424 unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); 425 426 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 427 428 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 429 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); 430 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); 431 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, 432 MRI.getLiveInVirtReg(InputPtrReg), PtrVT); 433 SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, 434 DAG.getConstant(Offset, SL, PtrVT)); 435 SDValue PtrOffset = DAG.getUNDEF(PtrVT); 436 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); 437 438 unsigned Align = DL.getABITypeAlignment(Ty); 439 440 if (VT 
!= MemVT && VT.isFloatingPoint()) { 441 // Do an integer load and convert. 442 // FIXME: This is mostly because load legalization after type legalization 443 // doesn't handle FP extloads. 444 assert(VT.getScalarType() == MVT::f32 && 445 MemVT.getScalarType() == MVT::f16); 446 447 EVT IVT = toIntegerVT(VT); 448 EVT MemIVT = toIntegerVT(MemVT); 449 SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, 450 IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT, 451 false, // isVolatile 452 true, // isNonTemporal 453 true, // isInvariant 454 Align); // Alignment 455 SDValue Ops[] = { 456 DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load), 457 Load.getValue(1) 458 }; 459 460 return DAG.getMergeValues(Ops, SL); 461 } 462 463 ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD; 464 return DAG.getLoad(ISD::UNINDEXED, ExtTy, 465 VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, 466 false, // isVolatile 467 true, // isNonTemporal 468 true, // isInvariant 469 Align); // Alignment 470 } 471 472 SDValue SITargetLowering::LowerFormalArguments( 473 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 474 const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, 475 SmallVectorImpl<SDValue> &InVals) const { 476 const SIRegisterInfo *TRI = 477 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); 478 479 MachineFunction &MF = DAG.getMachineFunction(); 480 FunctionType *FType = MF.getFunction()->getFunctionType(); 481 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 482 483 assert(CallConv == CallingConv::C); 484 485 SmallVector<ISD::InputArg, 16> Splits; 486 BitVector Skipped(Ins.size()); 487 488 for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { 489 const ISD::InputArg &Arg = Ins[i]; 490 491 // First check if it's a PS input addr 492 if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && 493 !Arg.Flags.isByVal()) { 494 495 assert((PSInputNum <= 15) && "Too many PS inputs!"); 496 497 if (!Arg.Used) { 
498 // We can savely skip PS inputs 499 Skipped.set(i); 500 ++PSInputNum; 501 continue; 502 } 503 504 Info->PSInputAddr |= 1 << PSInputNum++; 505 } 506 507 // Second split vertices into their elements 508 if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { 509 ISD::InputArg NewArg = Arg; 510 NewArg.Flags.setSplit(); 511 NewArg.VT = Arg.VT.getVectorElementType(); 512 513 // We REALLY want the ORIGINAL number of vertex elements here, e.g. a 514 // three or five element vertex only needs three or five registers, 515 // NOT four or eigth. 516 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); 517 unsigned NumElements = ParamType->getVectorNumElements(); 518 519 for (unsigned j = 0; j != NumElements; ++j) { 520 Splits.push_back(NewArg); 521 NewArg.PartOffset += NewArg.VT.getStoreSize(); 522 } 523 524 } else if (Info->getShaderType() != ShaderType::COMPUTE) { 525 Splits.push_back(Arg); 526 } 527 } 528 529 SmallVector<CCValAssign, 16> ArgLocs; 530 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 531 *DAG.getContext()); 532 533 // At least one interpolation mode must be enabled or else the GPU will hang. 534 if (Info->getShaderType() == ShaderType::PIXEL && 535 (Info->PSInputAddr & 0x7F) == 0) { 536 Info->PSInputAddr |= 1; 537 CCInfo.AllocateReg(AMDGPU::VGPR0); 538 CCInfo.AllocateReg(AMDGPU::VGPR1); 539 } 540 541 // The pointer to the list of arguments is stored in SGPR0, SGPR1 542 // The pointer to the scratch buffer is stored in SGPR2, SGPR3 543 if (Info->getShaderType() == ShaderType::COMPUTE) { 544 if (Subtarget->isAmdHsaOS()) 545 Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. 
546 else 547 Info->NumUserSGPRs = 4; 548 549 unsigned InputPtrReg = 550 TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); 551 unsigned InputPtrRegLo = 552 TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); 553 unsigned InputPtrRegHi = 554 TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); 555 556 unsigned ScratchPtrReg = 557 TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); 558 unsigned ScratchPtrRegLo = 559 TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); 560 unsigned ScratchPtrRegHi = 561 TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); 562 563 CCInfo.AllocateReg(InputPtrRegLo); 564 CCInfo.AllocateReg(InputPtrRegHi); 565 CCInfo.AllocateReg(ScratchPtrRegLo); 566 CCInfo.AllocateReg(ScratchPtrRegHi); 567 MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); 568 MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); 569 } 570 571 if (Info->getShaderType() == ShaderType::COMPUTE) { 572 getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, 573 Splits); 574 } 575 576 AnalyzeFormalArguments(CCInfo, Splits); 577 578 SmallVector<SDValue, 16> Chains; 579 580 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { 581 582 const ISD::InputArg &Arg = Ins[i]; 583 if (Skipped[i]) { 584 InVals.push_back(DAG.getUNDEF(Arg.VT)); 585 continue; 586 } 587 588 CCValAssign &VA = ArgLocs[ArgIdx++]; 589 MVT VT = VA.getLocVT(); 590 591 if (VA.isMemLoc()) { 592 VT = Ins[i].VT; 593 EVT MemVT = Splits[i].VT; 594 const unsigned Offset = Subtarget->getExplicitKernelArgOffset() + 595 VA.getLocMemOffset(); 596 // The first 36 bytes of the input buffer contains information about 597 // thread group and global sizes. 
598 SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain, 599 Offset, Ins[i].Flags.isSExt()); 600 Chains.push_back(Arg.getValue(1)); 601 602 const PointerType *ParamTy = 603 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); 604 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && 605 ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { 606 // On SI local pointers are just offsets into LDS, so they are always 607 // less than 16-bits. On CI and newer they could potentially be 608 // real pointers, so we can't guarantee their size. 609 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, 610 DAG.getValueType(MVT::i16)); 611 } 612 613 InVals.push_back(Arg); 614 Info->ABIArgOffset = Offset + MemVT.getStoreSize(); 615 continue; 616 } 617 assert(VA.isRegLoc() && "Parameter must be in a register!"); 618 619 unsigned Reg = VA.getLocReg(); 620 621 if (VT == MVT::i64) { 622 // For now assume it is a pointer 623 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, 624 &AMDGPU::SReg_64RegClass); 625 Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); 626 SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); 627 InVals.push_back(Copy); 628 continue; 629 } 630 631 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); 632 633 Reg = MF.addLiveIn(Reg, RC); 634 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); 635 636 if (Arg.VT.isVector()) { 637 638 // Build a vector from the registers 639 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); 640 unsigned NumElements = ParamType->getVectorNumElements(); 641 642 SmallVector<SDValue, 4> Regs; 643 Regs.push_back(Val); 644 for (unsigned j = 1; j != NumElements; ++j) { 645 Reg = ArgLocs[ArgIdx++].getLocReg(); 646 Reg = MF.addLiveIn(Reg, RC); 647 648 SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); 649 Regs.push_back(Copy); 650 } 651 652 // Fill up the missing vector elements 653 NumElements = Arg.VT.getVectorNumElements() - NumElements; 654 
Regs.append(NumElements, DAG.getUNDEF(VT)); 655 656 InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); 657 continue; 658 } 659 660 InVals.push_back(Val); 661 } 662 663 if (Info->getShaderType() != ShaderType::COMPUTE) { 664 unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>( 665 AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); 666 Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); 667 } 668 669 if (Chains.empty()) 670 return Chain; 671 672 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 673 } 674 675 MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( 676 MachineInstr * MI, MachineBasicBlock * BB) const { 677 678 MachineBasicBlock::iterator I = *MI; 679 const SIInstrInfo *TII = 680 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 681 682 switch (MI->getOpcode()) { 683 default: 684 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 685 case AMDGPU::BRANCH: 686 return BB; 687 case AMDGPU::SI_RegisterStorePseudo: { 688 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 689 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 690 MachineInstrBuilder MIB = 691 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), 692 Reg); 693 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) 694 MIB.addOperand(MI->getOperand(i)); 695 696 MI->eraseFromParent(); 697 break; 698 } 699 } 700 return BB; 701 } 702 703 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { 704 // This currently forces unfolding various combinations of fsub into fma with 705 // free fneg'd operands. As long as we have fast FMA (controlled by 706 // isFMAFasterThanFMulAndFAdd), we should perform these. 707 708 // When fma is quarter rate, for f64 where add / sub are at best half rate, 709 // most of these combines appear to be cycle neutral but save on instruction 710 // count / code size. 
711 return true; 712 } 713 714 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, 715 EVT VT) const { 716 if (!VT.isVector()) { 717 return MVT::i1; 718 } 719 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); 720 } 721 722 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const { 723 return MVT::i32; 724 } 725 726 // Answering this is somewhat tricky and depends on the specific device which 727 // have different rates for fma or all f64 operations. 728 // 729 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other 730 // regardless of which device (although the number of cycles differs between 731 // devices), so it is always profitable for f64. 732 // 733 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable 734 // only on full rate devices. Normally, we should prefer selecting v_mad_f32 735 // which we can always do even without fused FP ops since it returns the same 736 // result as the separate operations and since it is always full 737 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 738 // however does not support denormals, so we do report fma as faster if we have 739 // a fast fma device and require denormals. 740 // 741 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 742 VT = VT.getScalarType(); 743 744 if (!VT.isSimple()) 745 return false; 746 747 switch (VT.getSimpleVT().SimpleTy) { 748 case MVT::f32: 749 // This is as fast on some subtargets. However, we always have full rate f32 750 // mad available which returns the same result as the separate operations 751 // which we should prefer over fma. We can't use this if we want to support 752 // denormals, so only report this in these cases. 
753 return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); 754 case MVT::f64: 755 return true; 756 default: 757 break; 758 } 759 760 return false; 761 } 762 763 //===----------------------------------------------------------------------===// 764 // Custom DAG Lowering Operations 765 //===----------------------------------------------------------------------===// 766 767 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 768 switch (Op.getOpcode()) { 769 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 770 case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); 771 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 772 case ISD::LOAD: { 773 SDValue Result = LowerLOAD(Op, DAG); 774 assert((!Result.getNode() || 775 Result.getNode()->getNumValues() == 2) && 776 "Load should return a value and a chain"); 777 return Result; 778 } 779 780 case ISD::FSIN: 781 case ISD::FCOS: 782 return LowerTrig(Op, DAG); 783 case ISD::SELECT: return LowerSELECT(Op, DAG); 784 case ISD::FDIV: return LowerFDIV(Op, DAG); 785 case ISD::STORE: return LowerSTORE(Op, DAG); 786 case ISD::GlobalAddress: { 787 MachineFunction &MF = DAG.getMachineFunction(); 788 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 789 return LowerGlobalAddress(MFI, Op, DAG); 790 } 791 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 792 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); 793 } 794 return SDValue(); 795 } 796 797 /// \brief Helper function for LowerBRCOND 798 static SDNode *findUser(SDValue Value, unsigned Opcode) { 799 800 SDNode *Parent = Value.getNode(); 801 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); 802 I != E; ++I) { 803 804 if (I.getUse().get() != Value) 805 continue; 806 807 if (I->getOpcode() == Opcode) 808 return *I; 809 } 810 return nullptr; 811 } 812 813 SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { 814 815 FrameIndexSDNode 
*FINode = cast<FrameIndexSDNode>(Op); 816 unsigned FrameIndex = FINode->getIndex(); 817 818 return DAG.getTargetFrameIndex(FrameIndex, MVT::i32); 819 } 820 821 /// This transforms the control flow intrinsics to get the branch destination as 822 /// last parameter, also switches branch target with BR if the need arise 823 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, 824 SelectionDAG &DAG) const { 825 826 SDLoc DL(BRCOND); 827 828 SDNode *Intr = BRCOND.getOperand(1).getNode(); 829 SDValue Target = BRCOND.getOperand(2); 830 SDNode *BR = nullptr; 831 832 if (Intr->getOpcode() == ISD::SETCC) { 833 // As long as we negate the condition everything is fine 834 SDNode *SetCC = Intr; 835 assert(SetCC->getConstantOperandVal(1) == 1); 836 assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == 837 ISD::SETNE); 838 Intr = SetCC->getOperand(0).getNode(); 839 840 } else { 841 // Get the target from BR if we don't negate the condition 842 BR = findUser(BRCOND, ISD::BR); 843 Target = BR->getOperand(1); 844 } 845 846 assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); 847 848 // Build the result and 849 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); 850 851 // operands of the new intrinsic call 852 SmallVector<SDValue, 4> Ops; 853 Ops.push_back(BRCOND.getOperand(0)); 854 Ops.append(Intr->op_begin() + 1, Intr->op_end()); 855 Ops.push_back(Target); 856 857 // build the new intrinsic call 858 SDNode *Result = DAG.getNode( 859 Res.size() > 1 ? 
ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, 860 DAG.getVTList(Res), Ops).getNode(); 861 862 if (BR) { 863 // Give the branch instruction our target 864 SDValue Ops[] = { 865 BR->getOperand(0), 866 BRCOND.getOperand(2) 867 }; 868 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); 869 DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); 870 BR = NewBR.getNode(); 871 } 872 873 SDValue Chain = SDValue(Result, Result->getNumValues() - 1); 874 875 // Copy the intrinsic results to registers 876 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { 877 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); 878 if (!CopyToReg) 879 continue; 880 881 Chain = DAG.getCopyToReg( 882 Chain, DL, 883 CopyToReg->getOperand(1), 884 SDValue(Result, i - 1), 885 SDValue()); 886 887 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); 888 } 889 890 // Remove the old intrinsic from the chain 891 DAG.ReplaceAllUsesOfValueWith( 892 SDValue(Intr, Intr->getNumValues() - 1), 893 Intr->getOperand(0)); 894 895 return Chain; 896 } 897 898 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, 899 SDValue Op, 900 SelectionDAG &DAG) const { 901 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); 902 903 if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) 904 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); 905 906 SDLoc DL(GSD); 907 const GlobalValue *GV = GSD->getGlobal(); 908 MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); 909 910 SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); 911 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); 912 913 SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, 914 DAG.getConstant(0, DL, MVT::i32)); 915 SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, 916 DAG.getConstant(1, DL, MVT::i32)); 917 918 SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), 919 PtrLo, GA); 
920 SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), 921 PtrHi, DAG.getConstant(0, DL, MVT::i32), 922 SDValue(Lo.getNode(), 1)); 923 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); 924 } 925 926 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, 927 SDValue V) const { 928 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, 929 // so we will end up with redundant moves to m0. 930 // 931 // We can't use S_MOV_B32, because there is no way to specify m0 as the 932 // destination register. 933 // 934 // We have to use them both. Machine cse will combine all the S_MOV_B32 935 // instructions and the register coalescer eliminate the extra copies. 936 SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V); 937 return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32), 938 SDValue(M0, 0), SDValue()); // Glue 939 // A Null SDValue creates 940 // a glue result. 941 } 942 943 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 944 SelectionDAG &DAG) const { 945 MachineFunction &MF = DAG.getMachineFunction(); 946 auto MFI = MF.getInfo<SIMachineFunctionInfo>(); 947 const SIRegisterInfo *TRI = 948 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); 949 950 EVT VT = Op.getValueType(); 951 SDLoc DL(Op); 952 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 953 954 switch (IntrinsicID) { 955 case Intrinsic::r600_read_ngroups_x: 956 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 957 SI::KernelInputOffsets::NGROUPS_X, false); 958 case Intrinsic::r600_read_ngroups_y: 959 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 960 SI::KernelInputOffsets::NGROUPS_Y, false); 961 case Intrinsic::r600_read_ngroups_z: 962 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 963 SI::KernelInputOffsets::NGROUPS_Z, false); 964 case Intrinsic::r600_read_global_size_x: 965 return 
LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 966 SI::KernelInputOffsets::GLOBAL_SIZE_X, false); 967 case Intrinsic::r600_read_global_size_y: 968 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 969 SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); 970 case Intrinsic::r600_read_global_size_z: 971 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 972 SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); 973 case Intrinsic::r600_read_local_size_x: 974 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 975 SI::KernelInputOffsets::LOCAL_SIZE_X, false); 976 case Intrinsic::r600_read_local_size_y: 977 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 978 SI::KernelInputOffsets::LOCAL_SIZE_Y, false); 979 case Intrinsic::r600_read_local_size_z: 980 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 981 SI::KernelInputOffsets::LOCAL_SIZE_Z, false); 982 983 case Intrinsic::AMDGPU_read_workdim: 984 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 985 getImplicitParameterOffset(MFI, GRID_DIM), false); 986 987 case Intrinsic::r600_read_tgid_x: 988 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 989 TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); 990 case Intrinsic::r600_read_tgid_y: 991 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 992 TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); 993 case Intrinsic::r600_read_tgid_z: 994 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 995 TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); 996 case Intrinsic::r600_read_tidig_x: 997 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 998 TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); 999 case Intrinsic::r600_read_tidig_y: 1000 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 1001 TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); 1002 case Intrinsic::r600_read_tidig_z: 1003 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 1004 
TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); 1005 case AMDGPUIntrinsic::SI_load_const: { 1006 SDValue Ops[] = { 1007 Op.getOperand(1), 1008 Op.getOperand(2) 1009 }; 1010 1011 MachineMemOperand *MMO = MF.getMachineMemOperand( 1012 MachinePointerInfo(), 1013 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, 1014 VT.getStoreSize(), 4); 1015 return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, 1016 Op->getVTList(), Ops, VT, MMO); 1017 } 1018 case AMDGPUIntrinsic::SI_sample: 1019 return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); 1020 case AMDGPUIntrinsic::SI_sampleb: 1021 return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); 1022 case AMDGPUIntrinsic::SI_sampled: 1023 return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); 1024 case AMDGPUIntrinsic::SI_samplel: 1025 return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); 1026 case AMDGPUIntrinsic::SI_vs_load_input: 1027 return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, 1028 Op.getOperand(1), 1029 Op.getOperand(2), 1030 Op.getOperand(3)); 1031 1032 case AMDGPUIntrinsic::AMDGPU_fract: 1033 case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 
1034 return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), 1035 DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); 1036 case AMDGPUIntrinsic::SI_fs_constant: { 1037 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); 1038 SDValue Glue = M0.getValue(1); 1039 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, 1040 DAG.getConstant(2, DL, MVT::i32), // P0 1041 Op.getOperand(1), Op.getOperand(2), Glue); 1042 } 1043 case AMDGPUIntrinsic::SI_fs_interp: { 1044 SDValue IJ = Op.getOperand(4); 1045 SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, 1046 DAG.getConstant(0, DL, MVT::i32)); 1047 SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, 1048 DAG.getConstant(1, DL, MVT::i32)); 1049 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); 1050 SDValue Glue = M0.getValue(1); 1051 SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, 1052 DAG.getVTList(MVT::f32, MVT::Glue), 1053 I, Op.getOperand(1), Op.getOperand(2), Glue); 1054 Glue = SDValue(P1.getNode(), 1); 1055 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, 1056 Op.getOperand(1), Op.getOperand(2), Glue); 1057 } 1058 default: 1059 return AMDGPUTargetLowering::LowerOperation(Op, DAG); 1060 } 1061 } 1062 1063 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, 1064 SelectionDAG &DAG) const { 1065 MachineFunction &MF = DAG.getMachineFunction(); 1066 SDLoc DL(Op); 1067 SDValue Chain = Op.getOperand(0); 1068 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 1069 1070 switch (IntrinsicID) { 1071 case AMDGPUIntrinsic::SI_sendmsg: { 1072 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); 1073 SDValue Glue = Chain.getValue(1); 1074 return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, 1075 Op.getOperand(2), Glue); 1076 } 1077 case AMDGPUIntrinsic::SI_tbuffer_store: { 1078 SDValue Ops[] = { 1079 Chain, 1080 Op.getOperand(2), 1081 Op.getOperand(3), 1082 Op.getOperand(4), 1083 Op.getOperand(5), 1084 
Op.getOperand(6), 1085 Op.getOperand(7), 1086 Op.getOperand(8), 1087 Op.getOperand(9), 1088 Op.getOperand(10), 1089 Op.getOperand(11), 1090 Op.getOperand(12), 1091 Op.getOperand(13), 1092 Op.getOperand(14) 1093 }; 1094 1095 EVT VT = Op.getOperand(3).getValueType(); 1096 1097 MachineMemOperand *MMO = MF.getMachineMemOperand( 1098 MachinePointerInfo(), 1099 MachineMemOperand::MOStore, 1100 VT.getStoreSize(), 4); 1101 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, 1102 Op->getVTList(), Ops, VT, MMO); 1103 } 1104 default: 1105 return SDValue(); 1106 } 1107 } 1108 1109 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 1110 SDLoc DL(Op); 1111 LoadSDNode *Load = cast<LoadSDNode>(Op); 1112 1113 if (Op.getValueType().isVector()) { 1114 assert(Op.getValueType().getVectorElementType() == MVT::i32 && 1115 "Custom lowering for non-i32 vectors hasn't been implemented."); 1116 unsigned NumElements = Op.getValueType().getVectorNumElements(); 1117 assert(NumElements != 2 && "v2 loads are supported for all address spaces."); 1118 switch (Load->getAddressSpace()) { 1119 default: break; 1120 case AMDGPUAS::GLOBAL_ADDRESS: 1121 case AMDGPUAS::PRIVATE_ADDRESS: 1122 // v4 loads are supported for private and global memory. 
1123 if (NumElements <= 4) 1124 break; 1125 // fall-through 1126 case AMDGPUAS::LOCAL_ADDRESS: 1127 return ScalarizeVectorLoad(Op, DAG); 1128 } 1129 } 1130 1131 return AMDGPUTargetLowering::LowerLOAD(Op, DAG); 1132 } 1133 1134 SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, 1135 const SDValue &Op, 1136 SelectionDAG &DAG) const { 1137 return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), 1138 Op.getOperand(2), 1139 Op.getOperand(3), 1140 Op.getOperand(4)); 1141 } 1142 1143 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 1144 if (Op.getValueType() != MVT::i64) 1145 return SDValue(); 1146 1147 SDLoc DL(Op); 1148 SDValue Cond = Op.getOperand(0); 1149 1150 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 1151 SDValue One = DAG.getConstant(1, DL, MVT::i32); 1152 1153 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); 1154 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); 1155 1156 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); 1157 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); 1158 1159 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); 1160 1161 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); 1162 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); 1163 1164 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); 1165 1166 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi); 1167 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); 1168 } 1169 1170 // Catch division cases where we can use shortcuts with rcp and rsq 1171 // instructions. 
1172 SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { 1173 SDLoc SL(Op); 1174 SDValue LHS = Op.getOperand(0); 1175 SDValue RHS = Op.getOperand(1); 1176 EVT VT = Op.getValueType(); 1177 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; 1178 1179 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { 1180 if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) && 1181 CLHS->isExactlyValue(1.0)) { 1182 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 1183 // the CI documentation has a worst case error of 1 ulp. 1184 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 1185 // use it as long as we aren't trying to use denormals. 1186 1187 // 1.0 / sqrt(x) -> rsq(x) 1188 // 1189 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP 1190 // error seems really high at 2^29 ULP. 1191 if (RHS.getOpcode() == ISD::FSQRT) 1192 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); 1193 1194 // 1.0 / x -> rcp(x) 1195 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 1196 } 1197 } 1198 1199 if (Unsafe) { 1200 // Turn into multiply by the reciprocal. 1201 // x / y -> x * (1.0 / y) 1202 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 1203 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); 1204 } 1205 1206 return SDValue(); 1207 } 1208 1209 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { 1210 SDValue FastLowered = LowerFastFDIV(Op, DAG); 1211 if (FastLowered.getNode()) 1212 return FastLowered; 1213 1214 // This uses v_rcp_f32 which does not handle denormals. Let this hit a 1215 // selection error for now rather than do something incorrect. 
1216 if (Subtarget->hasFP32Denormals()) 1217 return SDValue(); 1218 1219 SDLoc SL(Op); 1220 SDValue LHS = Op.getOperand(0); 1221 SDValue RHS = Op.getOperand(1); 1222 1223 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); 1224 1225 const APFloat K0Val(BitsToFloat(0x6f800000)); 1226 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); 1227 1228 const APFloat K1Val(BitsToFloat(0x2f800000)); 1229 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); 1230 1231 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 1232 1233 EVT SetCCVT = 1234 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); 1235 1236 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); 1237 1238 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); 1239 1240 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); 1241 1242 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); 1243 1244 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); 1245 1246 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); 1247 } 1248 1249 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { 1250 if (DAG.getTarget().Options.UnsafeFPMath) 1251 return LowerFastFDIV(Op, DAG); 1252 1253 SDLoc SL(Op); 1254 SDValue X = Op.getOperand(0); 1255 SDValue Y = Op.getOperand(1); 1256 1257 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); 1258 1259 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); 1260 1261 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); 1262 1263 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); 1264 1265 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); 1266 1267 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); 1268 1269 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); 1270 1271 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); 1272 1273 SDValue DivScale1 = 
DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); 1274 1275 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); 1276 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); 1277 1278 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, 1279 NegDivScale0, Mul, DivScale1); 1280 1281 SDValue Scale; 1282 1283 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { 1284 // Workaround a hardware bug on SI where the condition output from div_scale 1285 // is not usable. 1286 1287 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32); 1288 1289 // Figure out if the scale to use for div_fmas. 1290 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); 1291 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); 1292 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); 1293 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); 1294 1295 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); 1296 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); 1297 1298 SDValue Scale0Hi 1299 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); 1300 SDValue Scale1Hi 1301 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); 1302 1303 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); 1304 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); 1305 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); 1306 } else { 1307 Scale = DivScale1.getValue(1); 1308 } 1309 1310 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, 1311 Fma4, Fma3, Mul, Scale); 1312 1313 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); 1314 } 1315 1316 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { 1317 EVT VT = Op.getValueType(); 1318 1319 if (VT == MVT::f32) 1320 return LowerFDIV32(Op, DAG); 1321 1322 if (VT == MVT::f64) 1323 return 
LowerFDIV64(Op, DAG); 1324 1325 llvm_unreachable("Unexpected type for fdiv"); 1326 } 1327 1328 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 1329 SDLoc DL(Op); 1330 StoreSDNode *Store = cast<StoreSDNode>(Op); 1331 EVT VT = Store->getMemoryVT(); 1332 1333 // These stores are legal. 1334 if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { 1335 if (VT.isVector() && VT.getVectorNumElements() > 4) 1336 return ScalarizeVectorStore(Op, DAG); 1337 return SDValue(); 1338 } 1339 1340 SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); 1341 if (Ret.getNode()) 1342 return Ret; 1343 1344 if (VT.isVector() && VT.getVectorNumElements() >= 8) 1345 return ScalarizeVectorStore(Op, DAG); 1346 1347 if (VT == MVT::i1) 1348 return DAG.getTruncStore(Store->getChain(), DL, 1349 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), 1350 Store->getBasePtr(), MVT::i1, Store->getMemOperand()); 1351 1352 return SDValue(); 1353 } 1354 1355 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 1356 SDLoc DL(Op); 1357 EVT VT = Op.getValueType(); 1358 SDValue Arg = Op.getOperand(0); 1359 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, 1360 DAG.getNode(ISD::FMUL, DL, VT, Arg, 1361 DAG.getConstantFP(0.5/M_PI, DL, 1362 VT))); 1363 1364 switch (Op.getOpcode()) { 1365 case ISD::FCOS: 1366 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); 1367 case ISD::FSIN: 1368 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); 1369 default: 1370 llvm_unreachable("Wrong trig opcode"); 1371 } 1372 } 1373 1374 //===----------------------------------------------------------------------===// 1375 // Custom DAG optimizations 1376 //===----------------------------------------------------------------------===// 1377 1378 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, 1379 DAGCombinerInfo &DCI) const { 1380 EVT VT = N->getValueType(0); 1381 EVT ScalarVT = VT.getScalarType(); 1382 if (ScalarVT != MVT::f32) 
1383 return SDValue(); 1384 1385 SelectionDAG &DAG = DCI.DAG; 1386 SDLoc DL(N); 1387 1388 SDValue Src = N->getOperand(0); 1389 EVT SrcVT = Src.getValueType(); 1390 1391 // TODO: We could try to match extracting the higher bytes, which would be 1392 // easier if i8 vectors weren't promoted to i32 vectors, particularly after 1393 // types are legalized. v4i8 -> v4f32 is probably the only case to worry 1394 // about in practice. 1395 if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { 1396 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { 1397 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); 1398 DCI.AddToWorklist(Cvt.getNode()); 1399 return Cvt; 1400 } 1401 } 1402 1403 // We are primarily trying to catch operations on illegal vector types 1404 // before they are expanded. 1405 // For scalars, we can use the more flexible method of checking masked bits 1406 // after legalization. 1407 if (!DCI.isBeforeLegalize() || 1408 !SrcVT.isVector() || 1409 SrcVT.getVectorElementType() != MVT::i8) { 1410 return SDValue(); 1411 } 1412 1413 assert(DCI.isBeforeLegalize() && "Unexpected legal type"); 1414 1415 // Weird sized vectors are a pain to handle, but we know 3 is really the same 1416 // size as 4. 1417 unsigned NElts = SrcVT.getVectorNumElements(); 1418 if (!SrcVT.isSimple() && NElts != 3) 1419 return SDValue(); 1420 1421 // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to 1422 // prevent a mess from expanding to v4i32 and repacking. 
1423 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { 1424 EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); 1425 EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); 1426 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); 1427 LoadSDNode *Load = cast<LoadSDNode>(Src); 1428 1429 unsigned AS = Load->getAddressSpace(); 1430 unsigned Align = Load->getAlignment(); 1431 Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); 1432 unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); 1433 1434 // Don't try to replace the load if we have to expand it due to alignment 1435 // problems. Otherwise we will end up scalarizing the load, and trying to 1436 // repack into the vector for no real reason. 1437 if (Align < ABIAlignment && 1438 !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { 1439 return SDValue(); 1440 } 1441 1442 SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, 1443 Load->getChain(), 1444 Load->getBasePtr(), 1445 LoadVT, 1446 Load->getMemOperand()); 1447 1448 // Make sure successors of the original load stay after it by updating 1449 // them to use the new Chain. 
1450 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); 1451 1452 SmallVector<SDValue, 4> Elts; 1453 if (RegVT.isVector()) 1454 DAG.ExtractVectorElements(NewLoad, Elts); 1455 else 1456 Elts.push_back(NewLoad); 1457 1458 SmallVector<SDValue, 4> Ops; 1459 1460 unsigned EltIdx = 0; 1461 for (SDValue Elt : Elts) { 1462 unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); 1463 for (unsigned I = 0; I < ComponentsInElt; ++I) { 1464 unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; 1465 SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); 1466 DCI.AddToWorklist(Cvt.getNode()); 1467 Ops.push_back(Cvt); 1468 } 1469 1470 ++EltIdx; 1471 } 1472 1473 assert(Ops.size() == NElts); 1474 1475 return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); 1476 } 1477 1478 return SDValue(); 1479 } 1480 1481 /// \brief Return true if the given offset Size in bytes can be folded into 1482 /// the immediate offsets of a memory instruction for the given address space. 1483 static bool canFoldOffset(unsigned OffsetSize, unsigned AS, 1484 const AMDGPUSubtarget &STI) { 1485 switch (AS) { 1486 case AMDGPUAS::GLOBAL_ADDRESS: { 1487 // MUBUF instructions a 12-bit offset in bytes. 1488 return isUInt<12>(OffsetSize); 1489 } 1490 case AMDGPUAS::CONSTANT_ADDRESS: { 1491 // SMRD instructions have an 8-bit offset in dwords on SI and 1492 // a 20-bit offset in bytes on VI. 1493 if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 1494 return isUInt<20>(OffsetSize); 1495 else 1496 return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); 1497 } 1498 case AMDGPUAS::LOCAL_ADDRESS: 1499 case AMDGPUAS::REGION_ADDRESS: { 1500 // The single offset versions have a 16-bit offset in bytes. 1501 return isUInt<16>(OffsetSize); 1502 } 1503 case AMDGPUAS::PRIVATE_ADDRESS: 1504 // Indirect register addressing does not use any offsets. 
1505 default: 1506 return 0; 1507 } 1508 } 1509 1510 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) 1511 1512 // This is a variant of 1513 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), 1514 // 1515 // The normal DAG combiner will do this, but only if the add has one use since 1516 // that would increase the number of instructions. 1517 // 1518 // This prevents us from seeing a constant offset that can be folded into a 1519 // memory instruction's addressing mode. If we know the resulting add offset of 1520 // a pointer can be folded into an addressing offset, we can replace the pointer 1521 // operand with the add of new constant offset. This eliminates one of the uses, 1522 // and may allow the remaining use to also be simplified. 1523 // 1524 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, 1525 unsigned AddrSpace, 1526 DAGCombinerInfo &DCI) const { 1527 SDValue N0 = N->getOperand(0); 1528 SDValue N1 = N->getOperand(1); 1529 1530 if (N0.getOpcode() != ISD::ADD) 1531 return SDValue(); 1532 1533 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1); 1534 if (!CN1) 1535 return SDValue(); 1536 1537 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 1538 if (!CAdd) 1539 return SDValue(); 1540 1541 // If the resulting offset is too large, we can't fold it into the addressing 1542 // mode offset. 
1543 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); 1544 if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) 1545 return SDValue(); 1546 1547 SelectionDAG &DAG = DCI.DAG; 1548 SDLoc SL(N); 1549 EVT VT = N->getValueType(0); 1550 1551 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); 1552 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); 1553 1554 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); 1555 } 1556 1557 SDValue SITargetLowering::performAndCombine(SDNode *N, 1558 DAGCombinerInfo &DCI) const { 1559 if (DCI.isBeforeLegalize()) 1560 return SDValue(); 1561 1562 SelectionDAG &DAG = DCI.DAG; 1563 1564 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> 1565 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) 1566 SDValue LHS = N->getOperand(0); 1567 SDValue RHS = N->getOperand(1); 1568 1569 if (LHS.getOpcode() == ISD::SETCC && 1570 RHS.getOpcode() == ISD::SETCC) { 1571 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); 1572 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); 1573 1574 SDValue X = LHS.getOperand(0); 1575 SDValue Y = RHS.getOperand(0); 1576 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) 1577 return SDValue(); 1578 1579 if (LCC == ISD::SETO) { 1580 if (X != LHS.getOperand(1)) 1581 return SDValue(); 1582 1583 if (RCC == ISD::SETUNE) { 1584 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); 1585 if (!C1 || !C1->isInfinity() || C1->isNegative()) 1586 return SDValue(); 1587 1588 const uint32_t Mask = SIInstrFlags::N_NORMAL | 1589 SIInstrFlags::N_SUBNORMAL | 1590 SIInstrFlags::N_ZERO | 1591 SIInstrFlags::P_ZERO | 1592 SIInstrFlags::P_SUBNORMAL | 1593 SIInstrFlags::P_NORMAL; 1594 1595 static_assert(((~(SIInstrFlags::S_NAN | 1596 SIInstrFlags::Q_NAN | 1597 SIInstrFlags::N_INFINITY | 1598 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, 1599 "mask not equal"); 1600 1601 SDLoc DL(N); 1602 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, 
MVT::i1, 1603 X, DAG.getConstant(Mask, DL, MVT::i32)); 1604 } 1605 } 1606 } 1607 1608 return SDValue(); 1609 } 1610 1611 SDValue SITargetLowering::performOrCombine(SDNode *N, 1612 DAGCombinerInfo &DCI) const { 1613 SelectionDAG &DAG = DCI.DAG; 1614 SDValue LHS = N->getOperand(0); 1615 SDValue RHS = N->getOperand(1); 1616 1617 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) 1618 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && 1619 RHS.getOpcode() == AMDGPUISD::FP_CLASS) { 1620 SDValue Src = LHS.getOperand(0); 1621 if (Src != RHS.getOperand(0)) 1622 return SDValue(); 1623 1624 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); 1625 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); 1626 if (!CLHS || !CRHS) 1627 return SDValue(); 1628 1629 // Only 10 bits are used. 1630 static const uint32_t MaxMask = 0x3ff; 1631 1632 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; 1633 SDLoc DL(N); 1634 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 1635 Src, DAG.getConstant(NewMask, DL, MVT::i32)); 1636 } 1637 1638 return SDValue(); 1639 } 1640 1641 SDValue SITargetLowering::performClassCombine(SDNode *N, 1642 DAGCombinerInfo &DCI) const { 1643 SelectionDAG &DAG = DCI.DAG; 1644 SDValue Mask = N->getOperand(1); 1645 1646 // fp_class x, 0 -> false 1647 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { 1648 if (CMask->isNullValue()) 1649 return DAG.getConstant(0, SDLoc(N), MVT::i1); 1650 } 1651 1652 return SDValue(); 1653 } 1654 1655 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { 1656 switch (Opc) { 1657 case ISD::FMAXNUM: 1658 return AMDGPUISD::FMAX3; 1659 case ISD::SMAX: 1660 return AMDGPUISD::SMAX3; 1661 case ISD::UMAX: 1662 return AMDGPUISD::UMAX3; 1663 case ISD::FMINNUM: 1664 return AMDGPUISD::FMIN3; 1665 case ISD::SMIN: 1666 return AMDGPUISD::SMIN3; 1667 case ISD::UMIN: 1668 return AMDGPUISD::UMIN3; 1669 default: 1670 llvm_unreachable("Not a min/max opcode"); 
1671 } 1672 } 1673 1674 SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, 1675 DAGCombinerInfo &DCI) const { 1676 SelectionDAG &DAG = DCI.DAG; 1677 1678 unsigned Opc = N->getOpcode(); 1679 SDValue Op0 = N->getOperand(0); 1680 SDValue Op1 = N->getOperand(1); 1681 1682 // Only do this if the inner op has one use since this will just increases 1683 // register pressure for no benefit. 1684 1685 // max(max(a, b), c) 1686 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { 1687 SDLoc DL(N); 1688 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 1689 DL, 1690 N->getValueType(0), 1691 Op0.getOperand(0), 1692 Op0.getOperand(1), 1693 Op1); 1694 } 1695 1696 // max(a, max(b, c)) 1697 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { 1698 SDLoc DL(N); 1699 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 1700 DL, 1701 N->getValueType(0), 1702 Op0, 1703 Op1.getOperand(0), 1704 Op1.getOperand(1)); 1705 } 1706 1707 return SDValue(); 1708 } 1709 1710 SDValue SITargetLowering::performSetCCCombine(SDNode *N, 1711 DAGCombinerInfo &DCI) const { 1712 SelectionDAG &DAG = DCI.DAG; 1713 SDLoc SL(N); 1714 1715 SDValue LHS = N->getOperand(0); 1716 SDValue RHS = N->getOperand(1); 1717 EVT VT = LHS.getValueType(); 1718 1719 if (VT != MVT::f32 && VT != MVT::f64) 1720 return SDValue(); 1721 1722 // Match isinf pattern 1723 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) 1724 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 1725 if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { 1726 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 1727 if (!CRHS) 1728 return SDValue(); 1729 1730 const APFloat &APF = CRHS->getValueAPF(); 1731 if (APF.isInfinity() && !APF.isNegative()) { 1732 unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; 1733 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), 1734 DAG.getConstant(Mask, SL, MVT::i32)); 1735 } 1736 } 1737 1738 return SDValue(); 1739 } 1740 1741 SDValue 
SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // Dispatch on the opcode of the node being combined. Cases that `break`
  // (and opcodes with no case at all) end up in the AMDGPUTargetLowering
  // implementation.
  switch (N->getOpcode()) {
  default:
    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::FMAXNUM: // TODO: What about fmax_legacy?
  case ISD::FMINNUM:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN: {
    // The min3/max3 combine is only attempted once the combine level has
    // reached AfterLegalizeDAG, for non-f64 results, and when optimizing.
    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
        N->getValueType(0) != MVT::f64 &&
        getTargetMachine().getOptLevel() > CodeGenOpt::None)
      return performMin3Max3Combine(N, DCI);
    break;
  }

  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3: {
    // Byte index selected by the opcode. This relies on the four
    // CVT_F32_UBYTEn opcodes having consecutive values.
    unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

    SDValue Src = N->getOperand(0);
    // Only the 8 bits of the selected byte are demanded from the source.
    APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);

    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    // Commit the simplification of the source if either helper changed it.
    if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
        TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
      DCI.CommitTargetLoweringOpt(TLO);
    }

    break;
  }

  // NOTE: the brace opened on the next line is only closed after the FSUB
  // case below, so the FADD and FSUB labels are nested inside this compound
  // statement. The switch still dispatches to them normally.
  case ISD::UINT_TO_FP: {
    return performUCharToFloatCombine(N, DCI);

  case ISD::FADD: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    EVT VT = N->getValueType(0);
    if (VT != MVT::f32)
      break;

    // Only do this if we are not trying to support denormals. v_mad_f32 does
    // not support denormals ever.
    if (Subtarget->hasFP32Denormals())
      break;

    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);

    // These should really be instruction patterns, but writing patterns with
    // source modifiers is a pain.

    // fadd (fadd (a, a), b) -> mad 2.0, a, b
    if (LHS.getOpcode() == ISD::FADD) {
      SDValue A = LHS.getOperand(0);
      if (A == LHS.getOperand(1)) {
        const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
        return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS);
      }
    }

    // fadd (b, fadd (a, a)) -> mad 2.0, a, b
    if (RHS.getOpcode() == ISD::FADD) {
      SDValue A = RHS.getOperand(0);
      if (A == RHS.getOperand(1)) {
        const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
        return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS);
      }
    }

    // No pattern matched: return an empty SDValue rather than deferring to
    // the base class.
    return SDValue();
  }
  case ISD::FSUB: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    EVT VT = N->getValueType(0);

    // Try to get the fneg to fold into the source modifier. This undoes generic
    // DAG combines and folds them into the mad.
    //
    // Only do this if we are not trying to support denormals. v_mad_f32 does
    // not support denormals ever.
    if (VT == MVT::f32 &&
        !Subtarget->hasFP32Denormals()) {
      SDValue LHS = N->getOperand(0);
      SDValue RHS = N->getOperand(1);
      if (LHS.getOpcode() == ISD::FADD) {
        // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)

        SDValue A = LHS.getOperand(0);
        if (A == LHS.getOperand(1)) {
          const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
          SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS);

          return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS);
        }
      }

      if (RHS.getOpcode() == ISD::FADD) {
        // (fsub c, (fadd a, a)) -> mad -2.0, a, c

        SDValue A = RHS.getOperand(0);
        if (A == RHS.getOperand(1)) {
          const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32);
          return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS);
        }
      }

      return SDValue();
    }

    break;
  }
  } // Closes the compound statement opened at `case ISD::UINT_TO_FP`.
  case ISD::LOAD:
  case ISD::STORE:
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
  case ISD::ATOMIC_CMP_SWAP:
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics.
    if (DCI.isBeforeLegalize())
      break;

    MemSDNode *MemNode = cast<MemSDNode>(N);
    SDValue Ptr = MemNode->getBasePtr();

    // TODO: We could also do this for multiplies.
    // Only pointers computed by a shift-left are considered, and never for
    // the private address space.
    unsigned AS = MemNode->getAddressSpace();
    if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
      if (NewPtr) {
        SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end());

        // Replace the pointer operand: index 2 for ISD::STORE, index 1 for
        // every other opcode in this case group.
        NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
        return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
      }
    }
    break;
  }
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  }
  // Reached by every case above that ended in `break`.
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}

/// \brief Analyze the possible immediate value Op
///
/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate
/// and the immediate value if it's a literal immediate
int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {

  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  // Integer constants.
  if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
    if (TII->isInlineConstant(Node->getAPIntValue()))
      return 0;

    // A literal is only reported when the zero-extended value fits in 32
    // bits; it is then narrowed to the int32_t return type.
    uint64_t Val = Node->getZExtValue();
    return isUInt<32>(Val) ? Val : -1;
  }

  // Floating-point constants.
  if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
    if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt()))
      return 0;

    // Only f32 values are returned as literals, as their raw bit pattern.
    if (Node->getValueType(0) == MVT::f32)
      return FloatToBits(Node->getValueAPF().convertToFloat());

    return -1;
  }

  // Any other kind of node is not an immediate.
  return -1;
}

/// \brief Helper function for adjustWritemask
///
/// Maps AMDGPU::sub0..sub3 to lanes 0..3. Any other index maps to lane 0.
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return 0;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  }
}

/// \brief Adjust the writemask of MIMG instructions
///
/// \p Node is passed by reference and is reassigned to the node returned by
/// UpdateNodeOperands when the writemask changes. The function returns early
/// without modifying anything if a user is not an EXTRACT_SUBREG, if two
/// users read the same lane, or if the computed mask equals the old one.
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                       SelectionDAG &DAG) const {
  SDNode *Users[4] = { };
  unsigned Lane = 0;
  unsigned OldDmask = Node->getConstantOperandVal(0);
  unsigned NewDmask = 0;

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return;

    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Set which texture component corresponds to the lane.
    // The loop walks the set bits of OldDmask from the lowest upwards and
    // leaves Comp at the position of the (Lane + 1)-th set bit. It always
    // runs at least once, so Comp is assigned before it is read below.
    unsigned Comp;
    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
      assert(Dmask);
      Comp = countTrailingZeros(Dmask);
      Dmask &= ~(1 << Comp);
    }

    // Abort if we have more than one user per component
    if (Users[Lane])
      return;

    Users[Lane] = *I;
    NewDmask |= 1 << Comp;
  }

  // Abort if there's no change
  if (NewDmask == OldDmask)
    return;

  // Adjust the writemask in the node
  // Operand 0 becomes the new mask; all remaining operands are kept.
  std::vector<SDValue> Ops;
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
  Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end());
  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);

  // If we only got one lane, replace it with a copy
  // (if NewDmask has only one bit set...)
  // With a single bit set there was exactly one user, so Lane still holds
  // that user's lane from the loop above.
  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
    SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
                                       MVT::i32);
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                      SDLoc(), Users[Lane]->getValueType(0),
                                      SDValue(Node, 0), RC);
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return;
  }

  // Update the users of the node with the new indices
  // Idx only advances when a user exists for the lane, so the surviving
  // users receive consecutive subregister indices starting at sub0.
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {

    SDNode *User = Users[i];
    if (!User)
      continue;

    SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    }
  }
}

/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                     SelectionDAG &DAG) const {

  // Rebuild the operand list, copying every operand that is not a frame
  // index unchanged.
  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isa<FrameIndexSDNode>(Node->getOperand(i))) {
      Ops.push_back(Node->getOperand(i));
      continue;
    }

    // Frame index operand: use the result of an S_MOV_B32 of the frame
    // index instead, so the node receives a register input.
    SDLoc DL(Node);
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                                     Node->getOperand(i).getValueType(),
                                     Node->getOperand(i)), 0));
  }

  DAG.UpdateNodeOperands(Node, Ops);
}

/// \brief Fold the instructions after selecting them.
2060 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, 2061 SelectionDAG &DAG) const { 2062 const SIInstrInfo *TII = 2063 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 2064 2065 if (TII->isMIMG(Node->getMachineOpcode())) 2066 adjustWritemask(Node, DAG); 2067 2068 if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || 2069 Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { 2070 legalizeTargetIndependentNode(Node, DAG); 2071 return Node; 2072 } 2073 return Node; 2074 } 2075 2076 /// \brief Assign the register class depending on the number of 2077 /// bits set in the writemask 2078 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, 2079 SDNode *Node) const { 2080 const SIInstrInfo *TII = 2081 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 2082 2083 MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 2084 TII->legalizeOperands(MI); 2085 2086 if (TII->isMIMG(MI->getOpcode())) { 2087 unsigned VReg = MI->getOperand(0).getReg(); 2088 unsigned Writemask = MI->getOperand(1).getImm(); 2089 unsigned BitsSet = 0; 2090 for (unsigned i = 0; i < 4; ++i) 2091 BitsSet += Writemask & (1 << i) ? 1 : 0; 2092 2093 const TargetRegisterClass *RC; 2094 switch (BitsSet) { 2095 default: return; 2096 case 1: RC = &AMDGPU::VGPR_32RegClass; break; 2097 case 2: RC = &AMDGPU::VReg_64RegClass; break; 2098 case 3: RC = &AMDGPU::VReg_96RegClass; break; 2099 } 2100 2101 unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); 2102 MI->setDesc(TII->get(NewOpcode)); 2103 MRI.setRegClass(VReg, RC); 2104 return; 2105 } 2106 2107 // Replace unused atomics with the no return version. 
2108 int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode()); 2109 if (NoRetAtomicOp != -1) { 2110 if (!Node->hasAnyUseOfValue(0)) { 2111 MI->setDesc(TII->get(NoRetAtomicOp)); 2112 MI->RemoveOperand(0); 2113 } 2114 2115 return; 2116 } 2117 } 2118 2119 static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { 2120 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); 2121 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); 2122 } 2123 2124 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, 2125 SDLoc DL, 2126 SDValue Ptr) const { 2127 const SIInstrInfo *TII = 2128 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 2129 #if 1 2130 // XXX - Workaround for moveToVALU not handling different register class 2131 // inserts for REG_SEQUENCE. 2132 2133 // Build the half of the subregister with the constants. 2134 const SDValue Ops0[] = { 2135 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), 2136 buildSMovImm32(DAG, DL, 0), 2137 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 2138 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), 2139 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) 2140 }; 2141 2142 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, 2143 MVT::v2i32, Ops0), 0); 2144 2145 // Combine the constants and the pointer. 
2146 const SDValue Ops1[] = { 2147 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), 2148 Ptr, 2149 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), 2150 SubRegHi, 2151 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) 2152 }; 2153 2154 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); 2155 #else 2156 const SDValue Ops[] = { 2157 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), 2158 Ptr, 2159 DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), 2160 buildSMovImm32(DAG, DL, 0), 2161 DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), 2162 buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32), 2163 DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) 2164 }; 2165 2166 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); 2167 2168 #endif 2169 } 2170 2171 /// \brief Return a resource descriptor with the 'Add TID' bit enabled 2172 /// The TID (Thread ID) is multipled by the stride value (bits [61:48] 2173 /// of the resource descriptor) to create an offset, which is added to the 2174 /// resource ponter. 
2175 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, 2176 SDLoc DL, 2177 SDValue Ptr, 2178 uint32_t RsrcDword1, 2179 uint64_t RsrcDword2And3) const { 2180 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); 2181 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); 2182 if (RsrcDword1) { 2183 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, 2184 DAG.getConstant(RsrcDword1, DL, MVT::i32)), 2185 0); 2186 } 2187 2188 SDValue DataLo = buildSMovImm32(DAG, DL, 2189 RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); 2190 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); 2191 2192 const SDValue Ops[] = { 2193 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), 2194 PtrLo, 2195 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 2196 PtrHi, 2197 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), 2198 DataLo, 2199 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), 2200 DataHi, 2201 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) 2202 }; 2203 2204 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); 2205 } 2206 2207 MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, 2208 SDLoc DL, 2209 SDValue Ptr) const { 2210 const SIInstrInfo *TII = 2211 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 2212 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE | 2213 0xffffffff; // Size 2214 2215 return buildRSRC(DAG, DL, Ptr, 0, Rsrc); 2216 } 2217 2218 SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 2219 const TargetRegisterClass *RC, 2220 unsigned Reg, EVT VT) const { 2221 SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); 2222 2223 return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), 2224 cast<RegisterSDNode>(VReg)->getReg(), VT); 2225 } 2226 2227 //===----------------------------------------------------------------------===// 2228 // SI Inline Assembly 
Support 2229 //===----------------------------------------------------------------------===// 2230 2231 std::pair<unsigned, const TargetRegisterClass *> 2232 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 2233 StringRef Constraint, 2234 MVT VT) const { 2235 if (Constraint == "r") { 2236 switch(VT.SimpleTy) { 2237 default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); 2238 case MVT::i64: 2239 return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); 2240 case MVT::i32: 2241 return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); 2242 } 2243 } 2244 2245 if (Constraint.size() > 1) { 2246 const TargetRegisterClass *RC = nullptr; 2247 if (Constraint[1] == 'v') { 2248 RC = &AMDGPU::VGPR_32RegClass; 2249 } else if (Constraint[1] == 's') { 2250 RC = &AMDGPU::SGPR_32RegClass; 2251 } 2252 2253 if (RC) { 2254 uint32_t Idx; 2255 bool Failed = Constraint.substr(2).getAsInteger(10, Idx); 2256 if (!Failed && Idx < RC->getNumRegs()) 2257 return std::make_pair(RC->getRegister(Idx), RC); 2258 } 2259 } 2260 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 2261 } 2262