1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Custom DAG lowering for SI 12 // 13 //===----------------------------------------------------------------------===// 14 15 #ifdef _MSC_VER 16 // Provide M_PI. 17 #define _USE_MATH_DEFINES 18 #include <cmath> 19 #endif 20 21 #include "SIISelLowering.h" 22 #include "AMDGPU.h" 23 #include "AMDGPUDiagnosticInfoUnsupported.h" 24 #include "AMDGPUIntrinsicInfo.h" 25 #include "AMDGPUSubtarget.h" 26 #include "SIInstrInfo.h" 27 #include "SIMachineFunctionInfo.h" 28 #include "SIRegisterInfo.h" 29 #include "llvm/ADT/BitVector.h" 30 #include "llvm/CodeGen/CallingConvLower.h" 31 #include "llvm/CodeGen/MachineInstrBuilder.h" 32 #include "llvm/CodeGen/MachineRegisterInfo.h" 33 #include "llvm/CodeGen/SelectionDAG.h" 34 #include "llvm/IR/Function.h" 35 #include "llvm/ADT/SmallString.h" 36 37 using namespace llvm; 38 39 SITargetLowering::SITargetLowering(TargetMachine &TM, 40 const AMDGPUSubtarget &STI) 41 : AMDGPUTargetLowering(TM, STI) { 42 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); 43 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); 44 45 addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); 46 addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); 47 48 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); 49 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); 50 51 addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); 52 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); 53 addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); 54 55 addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); 56 addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); 57 58 addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); 59 addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); 60 61 addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); 62 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); 63 64 computeRegisterProperties(STI.getRegisterInfo()); 65 66 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); 67 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); 68 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); 69 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); 70 71 setOperationAction(ISD::ADD, MVT::i32, Legal); 72 setOperationAction(ISD::ADDC, MVT::i32, Legal); 73 setOperationAction(ISD::ADDE, MVT::i32, Legal); 74 setOperationAction(ISD::SUBC, MVT::i32, Legal); 75 setOperationAction(ISD::SUBE, MVT::i32, Legal); 76 77 setOperationAction(ISD::FSIN, MVT::f32, Custom); 78 setOperationAction(ISD::FCOS, MVT::f32, Custom); 79 80 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 81 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 82 83 // We need to custom lower vector stores from local memory 84 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 85 setOperationAction(ISD::LOAD, MVT::v8i32, Custom); 86 setOperationAction(ISD::LOAD, MVT::v16i32, Custom); 87 88 setOperationAction(ISD::STORE, MVT::v8i32, Custom); 89 setOperationAction(ISD::STORE, MVT::v16i32, Custom); 90 91 setOperationAction(ISD::STORE, MVT::i1, Custom); 92 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 93 94 setOperationAction(ISD::SELECT, MVT::i64, Custom); 95 setOperationAction(ISD::SELECT, MVT::f64, Promote); 96 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); 97 98 setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); 99 setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); 100 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); 101 setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); 102 103 setOperationAction(ISD::SETCC, MVT::v2i1, Expand); 104 setOperationAction(ISD::SETCC, MVT::v4i1, Expand); 105 106 setOperationAction(ISD::BSWAP, MVT::i32, Legal); 107 108 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); 109 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); 110 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); 111 112 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); 113 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); 114 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); 115 116 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); 117 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); 118 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); 119 120 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 121 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); 122 123 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 124 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); 125 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); 126 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); 127 128 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 129 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 130 131 for (MVT VT : MVT::integer_valuetypes()) { 132 if (VT == MVT::i64) 133 continue; 134 135 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 136 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); 137 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); 138 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); 139 140 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 141 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); 142 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); 143 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); 144 145 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); 146 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); 147 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); 148 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); 149 } 150 151 for (MVT VT : MVT::integer_vector_valuetypes()) { 152 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); 153 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); 154 } 155 156 for (MVT VT : MVT::fp_valuetypes()) 157 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 158 159 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 160 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); 161 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); 162 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); 163 164 setOperationAction(ISD::LOAD, MVT::i1, Custom); 165 166 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 167 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 168 setOperationAction(ISD::FrameIndex, MVT::i32, Custom); 169 170 // These should use UDIVREM, so set them to expand 171 setOperationAction(ISD::UDIV, MVT::i64, Expand); 172 setOperationAction(ISD::UREM, MVT::i64, Expand); 173 174 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); 175 setOperationAction(ISD::SELECT, MVT::i1, Promote); 176 177 // We only support LOAD/STORE and vector manipulation ops for vectors 178 // with > 4 elements. 179 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { 180 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { 181 switch(Op) { 182 case ISD::LOAD: 183 case ISD::STORE: 184 case ISD::BUILD_VECTOR: 185 case ISD::BITCAST: 186 case ISD::EXTRACT_VECTOR_ELT: 187 case ISD::INSERT_VECTOR_ELT: 188 case ISD::INSERT_SUBVECTOR: 189 case ISD::EXTRACT_SUBVECTOR: 190 break; 191 case ISD::CONCAT_VECTORS: 192 setOperationAction(Op, VT, Custom); 193 break; 194 default: 195 setOperationAction(Op, VT, Expand); 196 break; 197 } 198 } 199 } 200 201 if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 202 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 203 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 204 setOperationAction(ISD::FRINT, MVT::f64, Legal); 205 } 206 207 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 208 setOperationAction(ISD::FDIV, MVT::f32, Custom); 209 setOperationAction(ISD::FDIV, MVT::f64, Custom); 210 211 setTargetDAGCombine(ISD::FADD); 212 setTargetDAGCombine(ISD::FSUB); 213 setTargetDAGCombine(ISD::FMINNUM); 214 setTargetDAGCombine(ISD::FMAXNUM); 215 setTargetDAGCombine(ISD::SMIN); 216 setTargetDAGCombine(ISD::SMAX); 217 setTargetDAGCombine(ISD::UMIN); 218 setTargetDAGCombine(ISD::UMAX); 219 setTargetDAGCombine(ISD::SELECT_CC); 220 setTargetDAGCombine(ISD::SETCC); 221 setTargetDAGCombine(ISD::AND); 222 setTargetDAGCombine(ISD::OR); 223 setTargetDAGCombine(ISD::UINT_TO_FP); 224 225 // All memory operations. Some folding on the pointer operand is done to help 226 // matching the constant offsets in the addressing modes. 227 setTargetDAGCombine(ISD::LOAD); 228 setTargetDAGCombine(ISD::STORE); 229 setTargetDAGCombine(ISD::ATOMIC_LOAD); 230 setTargetDAGCombine(ISD::ATOMIC_STORE); 231 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); 232 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); 233 setTargetDAGCombine(ISD::ATOMIC_SWAP); 234 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); 235 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); 236 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); 237 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); 238 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); 239 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); 240 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); 241 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); 242 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); 243 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); 244 245 setSchedulingPreference(Sched::RegPressure); 246 } 247 248 //===----------------------------------------------------------------------===// 249 // TargetLowering queries 250 //===----------------------------------------------------------------------===// 251 252 bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, 253 EVT) const { 254 // SI has some legal vector types, but no legal vector operations. Say no 255 // shuffles are legal in order to prefer scalarizing some vector operations. 256 return false; 257 } 258 259 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { 260 // Flat instructions do not have offsets, and only have the register 261 // address. 262 return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); 263 } 264 265 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { 266 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and 267 // additionally can do r + r + i with addr64. 32-bit has more addressing 268 // mode options. Depending on the resource constant, it can also do 269 // (i64 r0) + (i32 r1) * (i14 i). 270 // 271 // Private arrays end up using a scratch buffer most of the time, so also 272 // assume those use MUBUF instructions. Scratch loads / stores are currently 273 // implemented as mubuf instructions with offen bit set, so slightly 274 // different than the normal addr64. 275 if (!isUInt<12>(AM.BaseOffs)) 276 return false; 277 278 // FIXME: Since we can split immediate into soffset and immediate offset, 279 // would it make sense to allow any immediate? 280 281 switch (AM.Scale) { 282 case 0: // r + i or just i, depending on HasBaseReg. 283 return true; 284 case 1: 285 return true; // We have r + r or r + i. 286 case 2: 287 if (AM.HasBaseReg) { 288 // Reject 2 * r + r. 289 return false; 290 } 291 292 // Allow 2 * r as r + r 293 // Or 2 * r + i is allowed as r + r + i. 294 return true; 295 default: // Don't allow n * r 296 return false; 297 } 298 } 299 300 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, 301 const AddrMode &AM, Type *Ty, 302 unsigned AS) const { 303 // No global is ever allowed as a base. 304 if (AM.BaseGV) 305 return false; 306 307 switch (AS) { 308 case AMDGPUAS::GLOBAL_ADDRESS: { 309 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 310 // Assume the we will use FLAT for all global memory accesses 311 // on VI. 312 // FIXME: This assumption is currently wrong. On VI we still use 313 // MUBUF instructions for the r + i addressing mode. As currently 314 // implemented, the MUBUF instructions only work on buffer < 4GB. 315 // It may be possible to support > 4GB buffers with MUBUF instructions, 316 // by setting the stride value in the resource descriptor which would 317 // increase the size limit to (stride * 4GB). However, this is risky, 318 // because it has never been validated. 319 return isLegalFlatAddressingMode(AM); 320 } 321 322 return isLegalMUBUFAddressingMode(AM); 323 } 324 case AMDGPUAS::CONSTANT_ADDRESS: { 325 // If the offset isn't a multiple of 4, it probably isn't going to be 326 // correctly aligned. 327 if (AM.BaseOffs % 4 != 0) 328 return isLegalMUBUFAddressingMode(AM); 329 330 // There are no SMRD extloads, so if we have to do a small type access we 331 // will use a MUBUF load. 332 // FIXME?: We also need to do this if unaligned, but we don't know the 333 // alignment here. 334 if (DL.getTypeStoreSize(Ty) < 4) 335 return isLegalMUBUFAddressingMode(AM); 336 337 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { 338 // SMRD instructions have an 8-bit, dword offset on SI. 339 if (!isUInt<8>(AM.BaseOffs / 4)) 340 return false; 341 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { 342 // On CI+, this can also be a 32-bit literal constant offset. If it fits 343 // in 8-bits, it can use a smaller encoding. 344 if (!isUInt<32>(AM.BaseOffs / 4)) 345 return false; 346 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { 347 // On VI, these use the SMEM format and the offset is 20-bit in bytes. 348 if (!isUInt<20>(AM.BaseOffs)) 349 return false; 350 } else 351 llvm_unreachable("unhandled generation"); 352 353 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 354 return true; 355 356 if (AM.Scale == 1 && AM.HasBaseReg) 357 return true; 358 359 return false; 360 } 361 362 case AMDGPUAS::PRIVATE_ADDRESS: 363 case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: 364 return isLegalMUBUFAddressingMode(AM); 365 366 case AMDGPUAS::LOCAL_ADDRESS: 367 case AMDGPUAS::REGION_ADDRESS: { 368 // Basic, single offset DS instructions allow a 16-bit unsigned immediate 369 // field. 370 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have 371 // an 8-bit dword offset but we don't know the alignment here. 372 if (!isUInt<16>(AM.BaseOffs)) 373 return false; 374 375 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 376 return true; 377 378 if (AM.Scale == 1 && AM.HasBaseReg) 379 return true; 380 381 return false; 382 } 383 case AMDGPUAS::FLAT_ADDRESS: 384 return isLegalFlatAddressingMode(AM); 385 386 default: 387 llvm_unreachable("unhandled address space"); 388 } 389 } 390 391 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 392 unsigned AddrSpace, 393 unsigned Align, 394 bool *IsFast) const { 395 if (IsFast) 396 *IsFast = false; 397 398 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, 399 // which isn't a simple VT. 400 if (!VT.isSimple() || VT == MVT::Other) 401 return false; 402 403 // TODO - CI+ supports unaligned memory accesses, but this requires driver 404 // support. 405 406 // XXX - The only mention I see of this in the ISA manual is for LDS direct 407 // reads the "byte address and must be dword aligned". Is it also true for the 408 // normal loads and stores? 409 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { 410 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte 411 // aligned, 8 byte access in a single operation using ds_read2/write2_b32 412 // with adjacent offsets. 413 bool AlignedBy4 = (Align % 4 == 0); 414 if (IsFast) 415 *IsFast = AlignedBy4; 416 return AlignedBy4; 417 } 418 419 // Smaller than dword value must be aligned. 420 // FIXME: This should be allowed on CI+ 421 if (VT.bitsLT(MVT::i32)) 422 return false; 423 424 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the 425 // byte-address are ignored, thus forcing Dword alignment. 426 // This applies to private, global, and constant memory. 427 if (IsFast) 428 *IsFast = true; 429 430 return VT.bitsGT(MVT::i32) && Align % 4 == 0; 431 } 432 433 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, 434 unsigned SrcAlign, bool IsMemset, 435 bool ZeroMemset, 436 bool MemcpyStrSrc, 437 MachineFunction &MF) const { 438 // FIXME: Should account for address space here. 439 440 // The default fallback uses the private pointer size as a guess for a type to 441 // use. Make sure we switch these to 64-bit accesses. 442 443 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global 444 return MVT::v4i32; 445 446 if (Size >= 8 && DstAlign >= 4) 447 return MVT::v2i32; 448 449 // Use the default. 450 return MVT::Other; 451 } 452 453 TargetLoweringBase::LegalizeTypeAction 454 SITargetLowering::getPreferredVectorAction(EVT VT) const { 455 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) 456 return TypeSplitVector; 457 458 return TargetLoweringBase::getPreferredVectorAction(VT); 459 } 460 461 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 462 Type *Ty) const { 463 const SIInstrInfo *TII = 464 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 465 return TII->isInlineConstant(Imm); 466 } 467 468 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, 469 SDLoc SL, SDValue Chain, 470 unsigned Offset, bool Signed) const { 471 const DataLayout &DL = DAG.getDataLayout(); 472 MachineFunction &MF = DAG.getMachineFunction(); 473 const SIRegisterInfo *TRI = 474 static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); 475 unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); 476 477 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 478 479 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 480 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); 481 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); 482 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, 483 MRI.getLiveInVirtReg(InputPtrReg), PtrVT); 484 SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, 485 DAG.getConstant(Offset, SL, PtrVT)); 486 SDValue PtrOffset = DAG.getUNDEF(PtrVT); 487 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); 488 489 unsigned Align = DL.getABITypeAlignment(Ty); 490 491 ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD; 492 if (MemVT.isFloatingPoint()) 493 ExtTy = ISD::EXTLOAD; 494 495 return DAG.getLoad(ISD::UNINDEXED, ExtTy, 496 VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, 497 false, // isVolatile 498 true, // isNonTemporal 499 true, // isInvariant 500 Align); // Alignment 501 } 502 503 SDValue SITargetLowering::LowerFormalArguments( 504 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 505 const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, 506 SmallVectorImpl<SDValue> &InVals) const { 507 const SIRegisterInfo *TRI = 508 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); 509 510 MachineFunction &MF = DAG.getMachineFunction(); 511 FunctionType *FType = MF.getFunction()->getFunctionType(); 512 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 513 514 if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { 515 const Function *Fn = MF.getFunction(); 516 DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA"); 517 DAG.getContext()->diagnose(NoGraphicsHSA); 518 return SDValue(); 519 } 520 521 // FIXME: We currently assume all calling conventions are kernels. 522 523 SmallVector<ISD::InputArg, 16> Splits; 524 BitVector Skipped(Ins.size()); 525 526 for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { 527 const ISD::InputArg &Arg = Ins[i]; 528 529 // First check if it's a PS input addr 530 if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && 531 !Arg.Flags.isByVal()) { 532 533 assert((PSInputNum <= 15) && "Too many PS inputs!"); 534 535 if (!Arg.Used) { 536 // We can safely skip PS inputs 537 Skipped.set(i); 538 ++PSInputNum; 539 continue; 540 } 541 542 Info->PSInputAddr |= 1 << PSInputNum++; 543 } 544 545 // Second split vertices into their elements 546 if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { 547 ISD::InputArg NewArg = Arg; 548 NewArg.Flags.setSplit(); 549 NewArg.VT = Arg.VT.getVectorElementType(); 550 551 // We REALLY want the ORIGINAL number of vertex elements here, e.g. a 552 // three or five element vertex only needs three or five registers, 553 // NOT four or eight. 554 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); 555 unsigned NumElements = ParamType->getVectorNumElements(); 556 557 for (unsigned j = 0; j != NumElements; ++j) { 558 Splits.push_back(NewArg); 559 NewArg.PartOffset += NewArg.VT.getStoreSize(); 560 } 561 562 } else if (Info->getShaderType() != ShaderType::COMPUTE) { 563 Splits.push_back(Arg); 564 } 565 } 566 567 SmallVector<CCValAssign, 16> ArgLocs; 568 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 569 *DAG.getContext()); 570 571 // At least one interpolation mode must be enabled or else the GPU will hang. 572 if (Info->getShaderType() == ShaderType::PIXEL && 573 (Info->PSInputAddr & 0x7F) == 0) { 574 Info->PSInputAddr |= 1; 575 CCInfo.AllocateReg(AMDGPU::VGPR0); 576 CCInfo.AllocateReg(AMDGPU::VGPR1); 577 } 578 579 // The pointer to the list of arguments is stored in SGPR0, SGPR1 580 // The pointer to the scratch buffer is stored in SGPR2, SGPR3 581 if (Info->getShaderType() == ShaderType::COMPUTE) { 582 if (Subtarget->isAmdHsaOS()) 583 Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. 584 else 585 Info->NumUserSGPRs = 4; 586 587 unsigned InputPtrReg = 588 TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); 589 unsigned InputPtrRegLo = 590 TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); 591 unsigned InputPtrRegHi = 592 TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); 593 594 unsigned ScratchPtrReg = 595 TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); 596 unsigned ScratchPtrRegLo = 597 TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); 598 unsigned ScratchPtrRegHi = 599 TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); 600 601 CCInfo.AllocateReg(InputPtrRegLo); 602 CCInfo.AllocateReg(InputPtrRegHi); 603 CCInfo.AllocateReg(ScratchPtrRegLo); 604 CCInfo.AllocateReg(ScratchPtrRegHi); 605 MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); 606 MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); 607 } 608 609 if (Info->getShaderType() == ShaderType::COMPUTE) { 610 getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, 611 Splits); 612 } 613 614 AnalyzeFormalArguments(CCInfo, Splits); 615 616 SmallVector<SDValue, 16> Chains; 617 618 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { 619 620 const ISD::InputArg &Arg = Ins[i]; 621 if (Skipped[i]) { 622 InVals.push_back(DAG.getUNDEF(Arg.VT)); 623 continue; 624 } 625 626 CCValAssign &VA = ArgLocs[ArgIdx++]; 627 MVT VT = VA.getLocVT(); 628 629 if (VA.isMemLoc()) { 630 VT = Ins[i].VT; 631 EVT MemVT = Splits[i].VT; 632 const unsigned Offset = Subtarget->getExplicitKernelArgOffset() + 633 VA.getLocMemOffset(); 634 // The first 36 bytes of the input buffer contains information about 635 // thread group and global sizes. 636 SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain, 637 Offset, Ins[i].Flags.isSExt()); 638 Chains.push_back(Arg.getValue(1)); 639 640 auto *ParamTy = 641 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); 642 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && 643 ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { 644 // On SI local pointers are just offsets into LDS, so they are always 645 // less than 16-bits. On CI and newer they could potentially be 646 // real pointers, so we can't guarantee their size. 647 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, 648 DAG.getValueType(MVT::i16)); 649 } 650 651 InVals.push_back(Arg); 652 Info->ABIArgOffset = Offset + MemVT.getStoreSize(); 653 continue; 654 } 655 assert(VA.isRegLoc() && "Parameter must be in a register!"); 656 657 unsigned Reg = VA.getLocReg(); 658 659 if (VT == MVT::i64) { 660 // For now assume it is a pointer 661 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, 662 &AMDGPU::SReg_64RegClass); 663 Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); 664 SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); 665 InVals.push_back(Copy); 666 continue; 667 } 668 669 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); 670 671 Reg = MF.addLiveIn(Reg, RC); 672 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); 673 674 if (Arg.VT.isVector()) { 675 676 // Build a vector from the registers 677 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); 678 unsigned NumElements = ParamType->getVectorNumElements(); 679 680 SmallVector<SDValue, 4> Regs; 681 Regs.push_back(Val); 682 for (unsigned j = 1; j != NumElements; ++j) { 683 Reg = ArgLocs[ArgIdx++].getLocReg(); 684 Reg = MF.addLiveIn(Reg, RC); 685 686 SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); 687 Regs.push_back(Copy); 688 } 689 690 // Fill up the missing vector elements 691 NumElements = Arg.VT.getVectorNumElements() - NumElements; 692 Regs.append(NumElements, DAG.getUNDEF(VT)); 693 694 InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); 695 continue; 696 } 697 698 InVals.push_back(Val); 699 } 700 701 if (Info->getShaderType() != ShaderType::COMPUTE) { 702 unsigned ScratchIdx = CCInfo.getFirstUnallocated(makeArrayRef( 703 AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); 704 Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); 705 } 706 707 if (Chains.empty()) 708 return Chain; 709 710 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 711 } 712 713 MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( 714 MachineInstr * MI, MachineBasicBlock * BB) const { 715 716 MachineBasicBlock::iterator I = *MI; 717 const SIInstrInfo *TII = 718 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 719 720 switch (MI->getOpcode()) { 721 default: 722 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 723 case AMDGPU::BRANCH: 724 return BB; 725 case AMDGPU::SI_RegisterStorePseudo: { 726 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 727 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 728 MachineInstrBuilder MIB = 729 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), 730 Reg); 731 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) 732 MIB.addOperand(MI->getOperand(i)); 733 734 MI->eraseFromParent(); 735 break; 736 } 737 } 738 return BB; 739 } 740 741 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { 742 // This currently forces unfolding various combinations of fsub into fma with 743 // free fneg'd operands. As long as we have fast FMA (controlled by 744 // isFMAFasterThanFMulAndFAdd), we should perform these. 745 746 // When fma is quarter rate, for f64 where add / sub are at best half rate, 747 // most of these combines appear to be cycle neutral but save on instruction 748 // count / code size. 749 return true; 750 } 751 752 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, 753 EVT VT) const { 754 if (!VT.isVector()) { 755 return MVT::i1; 756 } 757 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); 758 } 759 760 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const { 761 return MVT::i32; 762 } 763 764 // Answering this is somewhat tricky and depends on the specific device which 765 // have different rates for fma or all f64 operations. 766 // 767 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other 768 // regardless of which device (although the number of cycles differs between 769 // devices), so it is always profitable for f64. 770 // 771 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable 772 // only on full rate devices. Normally, we should prefer selecting v_mad_f32 773 // which we can always do even without fused FP ops since it returns the same 774 // result as the separate operations and since it is always full 775 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 776 // however does not support denormals, so we do report fma as faster if we have 777 // a fast fma device and require denormals. 778 // 779 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 780 VT = VT.getScalarType(); 781 782 if (!VT.isSimple()) 783 return false; 784 785 switch (VT.getSimpleVT().SimpleTy) { 786 case MVT::f32: 787 // This is as fast on some subtargets. However, we always have full rate f32 788 // mad available which returns the same result as the separate operations 789 // which we should prefer over fma. We can't use this if we want to support 790 // denormals, so only report this in these cases. 791 return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); 792 case MVT::f64: 793 return true; 794 default: 795 break; 796 } 797 798 return false; 799 } 800 801 //===----------------------------------------------------------------------===// 802 // Custom DAG Lowering Operations 803 //===----------------------------------------------------------------------===// 804 805 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 806 switch (Op.getOpcode()) { 807 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 808 case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); 809 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 810 case ISD::LOAD: { 811 SDValue Result = LowerLOAD(Op, DAG); 812 assert((!Result.getNode() || 813 Result.getNode()->getNumValues() == 2) && 814 "Load should return a value and a chain"); 815 return Result; 816 } 817 818 case ISD::FSIN: 819 case ISD::FCOS: 820 return LowerTrig(Op, DAG); 821 case ISD::SELECT: return LowerSELECT(Op, DAG); 822 case ISD::FDIV: return LowerFDIV(Op, DAG); 823 case ISD::STORE: return LowerSTORE(Op, DAG); 824 case ISD::GlobalAddress: { 825 MachineFunction &MF = DAG.getMachineFunction(); 826 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 827 return LowerGlobalAddress(MFI, Op, DAG); 828 } 829 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 830 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); 831 } 832 return SDValue(); 833 } 834 835 /// \brief Helper function for LowerBRCOND 836 static SDNode *findUser(SDValue Value, unsigned Opcode) { 837 838 SDNode *Parent = Value.getNode(); 839 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); 840 I != E; ++I) { 841 842 if (I.getUse().get() != Value) 843 continue; 844 845 if (I->getOpcode() == Opcode) 846 return *I; 847 } 848 return nullptr; 849 } 850 851 SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { 852 853 SDLoc SL(Op); 854 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op); 855 unsigned FrameIndex = FINode->getIndex(); 856 857 // A FrameIndex node represents a 32-bit offset into scratch memory. If 858 // the high bit of a frame index offset were to be set, this would mean 859 // that it represented an offset of ~2GB * 64 = ~128GB from the start of the 860 // scratch buffer, with 64 being the number of threads per wave. 861 // 862 // If we know the machine uses less than 128GB of scratch, then we can 863 // amrk the high bit of the FrameIndex node as known zero, 864 // which is important, because it means in most situations we can 865 // prove that values derived from FrameIndex nodes are non-negative. 866 // This enables us to take advantage of more addressing modes when 867 // accessing scratch buffers, since for scratch reads/writes, the register 868 // offset must always be positive. 869 870 SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); 871 if (Subtarget->enableHugeScratchBuffer()) 872 return TFI; 873 874 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI, 875 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31))); 876 } 877 878 /// This transforms the control flow intrinsics to get the branch destination as 879 /// last parameter, also switches branch target with BR if the need arise 880 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, 881 SelectionDAG &DAG) const { 882 883 SDLoc DL(BRCOND); 884 885 SDNode *Intr = BRCOND.getOperand(1).getNode(); 886 SDValue Target = BRCOND.getOperand(2); 887 SDNode *BR = nullptr; 888 889 if (Intr->getOpcode() == ISD::SETCC) { 890 // As long as we negate the condition everything is fine 891 SDNode *SetCC = Intr; 892 assert(SetCC->getConstantOperandVal(1) == 1); 893 assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == 894 ISD::SETNE); 895 Intr = SetCC->getOperand(0).getNode(); 896 897 } else { 898 // Get the target from BR if we don't negate the condition 899 BR = findUser(BRCOND, ISD::BR); 900 Target = BR->getOperand(1); 901 } 902 903 assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); 904 905 // Build the result and 906 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); 907 908 // operands of the new intrinsic call 909 SmallVector<SDValue, 4> Ops; 910 Ops.push_back(BRCOND.getOperand(0)); 911 Ops.append(Intr->op_begin() + 1, Intr->op_end()); 912 Ops.push_back(Target); 913 914 // build the new intrinsic call 915 SDNode *Result = DAG.getNode( 916 Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, 917 DAG.getVTList(Res), Ops).getNode(); 918 919 if (BR) { 920 // Give the branch instruction our target 921 SDValue Ops[] = { 922 BR->getOperand(0), 923 BRCOND.getOperand(2) 924 }; 925 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); 926 DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); 927 BR = NewBR.getNode(); 928 } 929 930 SDValue Chain = SDValue(Result, Result->getNumValues() - 1); 931 932 // Copy the intrinsic results to registers 933 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { 934 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); 935 if (!CopyToReg) 936 continue; 937 938 Chain = DAG.getCopyToReg( 939 Chain, DL, 940 CopyToReg->getOperand(1), 941 SDValue(Result, i - 1), 942 SDValue()); 943 944 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); 945 } 946 947 // Remove the old intrinsic from the chain 948 DAG.ReplaceAllUsesOfValueWith( 949 SDValue(Intr, Intr->getNumValues() - 1), 950 Intr->getOperand(0)); 951 952 return Chain; 953 } 954 955 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, 956 SDValue Op, 957 SelectionDAG &DAG) const { 958 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); 959 960 if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) 961 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); 962 963 SDLoc DL(GSD); 964 const GlobalValue *GV = GSD->getGlobal(); 965 MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); 966 967 SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); 968 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); 969 970 SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, 971 DAG.getConstant(0, DL, MVT::i32)); 972 SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, 973 DAG.getConstant(1, DL, MVT::i32)); 974 975 SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), 976 PtrLo, GA); 977 SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), 978 PtrHi, DAG.getConstant(0, DL, MVT::i32), 979 SDValue(Lo.getNode(), 1)); 980 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); 981 } 982 983 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, 984 SDValue V) const { 985 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, 986 // so we will end up with redundant moves to m0. 987 // 988 // We can't use S_MOV_B32, because there is no way to specify m0 as the 989 // destination register. 990 // 991 // We have to use them both. Machine cse will combine all the S_MOV_B32 992 // instructions and the register coalescer eliminate the extra copies. 993 SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V); 994 return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32), 995 SDValue(M0, 0), SDValue()); // Glue 996 // A Null SDValue creates 997 // a glue result. 998 } 999 1000 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 1001 SelectionDAG &DAG) const { 1002 MachineFunction &MF = DAG.getMachineFunction(); 1003 auto MFI = MF.getInfo<SIMachineFunctionInfo>(); 1004 const SIRegisterInfo *TRI = 1005 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); 1006 1007 EVT VT = Op.getValueType(); 1008 SDLoc DL(Op); 1009 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 1010 1011 // TODO: Should this propagate fast-math-flags? 1012 1013 switch (IntrinsicID) { 1014 case Intrinsic::r600_read_ngroups_x: 1015 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1016 SI::KernelInputOffsets::NGROUPS_X, false); 1017 case Intrinsic::r600_read_ngroups_y: 1018 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1019 SI::KernelInputOffsets::NGROUPS_Y, false); 1020 case Intrinsic::r600_read_ngroups_z: 1021 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1022 SI::KernelInputOffsets::NGROUPS_Z, false); 1023 case Intrinsic::r600_read_global_size_x: 1024 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1025 SI::KernelInputOffsets::GLOBAL_SIZE_X, false); 1026 case Intrinsic::r600_read_global_size_y: 1027 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1028 SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); 1029 case Intrinsic::r600_read_global_size_z: 1030 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1031 SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); 1032 case Intrinsic::r600_read_local_size_x: 1033 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1034 SI::KernelInputOffsets::LOCAL_SIZE_X, false); 1035 case Intrinsic::r600_read_local_size_y: 1036 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1037 SI::KernelInputOffsets::LOCAL_SIZE_Y, false); 1038 case Intrinsic::r600_read_local_size_z: 1039 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1040 SI::KernelInputOffsets::LOCAL_SIZE_Z, false); 1041 1042 case Intrinsic::AMDGPU_read_workdim: 1043 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1044 getImplicitParameterOffset(MFI, GRID_DIM), false); 1045 1046 case Intrinsic::r600_read_tgid_x: 1047 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 1048 TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); 1049 case Intrinsic::r600_read_tgid_y: 1050 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 1051 TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); 1052 case Intrinsic::r600_read_tgid_z: 1053 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 1054 TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); 1055 case Intrinsic::r600_read_tidig_x: 1056 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 1057 TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); 1058 case Intrinsic::r600_read_tidig_y: 1059 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 1060 TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); 1061 case Intrinsic::r600_read_tidig_z: 1062 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 1063 TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); 1064 case AMDGPUIntrinsic::SI_load_const: { 1065 SDValue Ops[] = { 1066 Op.getOperand(1), 1067 Op.getOperand(2) 1068 }; 1069 1070 MachineMemOperand *MMO = MF.getMachineMemOperand( 1071 MachinePointerInfo(), 1072 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, 1073 VT.getStoreSize(), 4); 1074 return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, 1075 Op->getVTList(), Ops, VT, MMO); 1076 } 1077 case AMDGPUIntrinsic::SI_sample: 1078 return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); 1079 case AMDGPUIntrinsic::SI_sampleb: 1080 return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); 1081 case AMDGPUIntrinsic::SI_sampled: 1082 return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); 1083 case AMDGPUIntrinsic::SI_samplel: 1084 return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); 1085 case AMDGPUIntrinsic::SI_vs_load_input: 1086 return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, 1087 Op.getOperand(1), 1088 Op.getOperand(2), 1089 Op.getOperand(3)); 1090 1091 case AMDGPUIntrinsic::AMDGPU_fract: 1092 case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 1093 return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), 1094 DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); 1095 case AMDGPUIntrinsic::SI_fs_constant: { 1096 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); 1097 SDValue Glue = M0.getValue(1); 1098 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, 1099 DAG.getConstant(2, DL, MVT::i32), // P0 1100 Op.getOperand(1), Op.getOperand(2), Glue); 1101 } 1102 case AMDGPUIntrinsic::SI_packf16: 1103 if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) 1104 return DAG.getUNDEF(MVT::i32); 1105 return Op; 1106 case AMDGPUIntrinsic::SI_fs_interp: { 1107 SDValue IJ = Op.getOperand(4); 1108 SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, 1109 DAG.getConstant(0, DL, MVT::i32)); 1110 SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, 1111 DAG.getConstant(1, DL, MVT::i32)); 1112 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); 1113 SDValue Glue = M0.getValue(1); 1114 SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, 1115 DAG.getVTList(MVT::f32, MVT::Glue), 1116 I, Op.getOperand(1), Op.getOperand(2), Glue); 1117 Glue = SDValue(P1.getNode(), 1); 1118 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, 1119 Op.getOperand(1), Op.getOperand(2), Glue); 1120 } 1121 default: 1122 return AMDGPUTargetLowering::LowerOperation(Op, DAG); 1123 } 1124 } 1125 1126 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, 1127 SelectionDAG &DAG) const { 1128 MachineFunction &MF = DAG.getMachineFunction(); 1129 SDLoc DL(Op); 1130 SDValue Chain = Op.getOperand(0); 1131 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 1132 1133 switch (IntrinsicID) { 1134 case AMDGPUIntrinsic::SI_sendmsg: { 1135 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); 1136 SDValue Glue = Chain.getValue(1); 1137 return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, 1138 Op.getOperand(2), Glue); 1139 } 1140 case AMDGPUIntrinsic::SI_tbuffer_store: { 1141 SDValue Ops[] = { 1142 Chain, 1143 Op.getOperand(2), 1144 Op.getOperand(3), 1145 Op.getOperand(4), 1146 Op.getOperand(5), 1147 Op.getOperand(6), 1148 Op.getOperand(7), 1149 Op.getOperand(8), 1150 Op.getOperand(9), 1151 Op.getOperand(10), 1152 Op.getOperand(11), 1153 Op.getOperand(12), 1154 Op.getOperand(13), 1155 Op.getOperand(14) 1156 }; 1157 1158 EVT VT = Op.getOperand(3).getValueType(); 1159 1160 MachineMemOperand *MMO = MF.getMachineMemOperand( 1161 MachinePointerInfo(), 1162 MachineMemOperand::MOStore, 1163 VT.getStoreSize(), 4); 1164 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, 1165 Op->getVTList(), Ops, VT, MMO); 1166 } 1167 default: 1168 return SDValue(); 1169 } 1170 } 1171 1172 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 1173 SDLoc DL(Op); 1174 LoadSDNode *Load = cast<LoadSDNode>(Op); 1175 1176 if (Op.getValueType().isVector()) { 1177 assert(Op.getValueType().getVectorElementType() == MVT::i32 && 1178 "Custom lowering for non-i32 vectors hasn't been implemented."); 1179 unsigned NumElements = Op.getValueType().getVectorNumElements(); 1180 assert(NumElements != 2 && "v2 loads are supported for all address spaces."); 1181 switch (Load->getAddressSpace()) { 1182 default: break; 1183 case AMDGPUAS::GLOBAL_ADDRESS: 1184 case AMDGPUAS::PRIVATE_ADDRESS: 1185 // v4 loads are supported for private and global memory. 1186 if (NumElements <= 4) 1187 break; 1188 // fall-through 1189 case AMDGPUAS::LOCAL_ADDRESS: 1190 return ScalarizeVectorLoad(Op, DAG); 1191 } 1192 } 1193 1194 return AMDGPUTargetLowering::LowerLOAD(Op, DAG); 1195 } 1196 1197 SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, 1198 const SDValue &Op, 1199 SelectionDAG &DAG) const { 1200 return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), 1201 Op.getOperand(2), 1202 Op.getOperand(3), 1203 Op.getOperand(4)); 1204 } 1205 1206 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 1207 if (Op.getValueType() != MVT::i64) 1208 return SDValue(); 1209 1210 SDLoc DL(Op); 1211 SDValue Cond = Op.getOperand(0); 1212 1213 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 1214 SDValue One = DAG.getConstant(1, DL, MVT::i32); 1215 1216 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); 1217 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); 1218 1219 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); 1220 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); 1221 1222 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); 1223 1224 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); 1225 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); 1226 1227 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); 1228 1229 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi); 1230 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); 1231 } 1232 1233 // Catch division cases where we can use shortcuts with rcp and rsq 1234 // instructions. 1235 SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { 1236 SDLoc SL(Op); 1237 SDValue LHS = Op.getOperand(0); 1238 SDValue RHS = Op.getOperand(1); 1239 EVT VT = Op.getValueType(); 1240 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; 1241 1242 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { 1243 if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) && 1244 CLHS->isExactlyValue(1.0)) { 1245 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 1246 // the CI documentation has a worst case error of 1 ulp. 1247 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 1248 // use it as long as we aren't trying to use denormals. 1249 1250 // 1.0 / sqrt(x) -> rsq(x) 1251 // 1252 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP 1253 // error seems really high at 2^29 ULP. 1254 if (RHS.getOpcode() == ISD::FSQRT) 1255 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); 1256 1257 // 1.0 / x -> rcp(x) 1258 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 1259 } 1260 } 1261 1262 if (Unsafe) { 1263 // Turn into multiply by the reciprocal. 1264 // x / y -> x * (1.0 / y) 1265 SDNodeFlags Flags; 1266 Flags.setUnsafeAlgebra(true); 1267 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 1268 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags); 1269 } 1270 1271 return SDValue(); 1272 } 1273 1274 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { 1275 SDValue FastLowered = LowerFastFDIV(Op, DAG); 1276 if (FastLowered.getNode()) 1277 return FastLowered; 1278 1279 // This uses v_rcp_f32 which does not handle denormals. Let this hit a 1280 // selection error for now rather than do something incorrect. 1281 if (Subtarget->hasFP32Denormals()) 1282 return SDValue(); 1283 1284 SDLoc SL(Op); 1285 SDValue LHS = Op.getOperand(0); 1286 SDValue RHS = Op.getOperand(1); 1287 1288 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); 1289 1290 const APFloat K0Val(BitsToFloat(0x6f800000)); 1291 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); 1292 1293 const APFloat K1Val(BitsToFloat(0x2f800000)); 1294 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); 1295 1296 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 1297 1298 EVT SetCCVT = 1299 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); 1300 1301 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); 1302 1303 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); 1304 1305 // TODO: Should this propagate fast-math-flags? 1306 1307 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); 1308 1309 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); 1310 1311 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); 1312 1313 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); 1314 } 1315 1316 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { 1317 if (DAG.getTarget().Options.UnsafeFPMath) 1318 return LowerFastFDIV(Op, DAG); 1319 1320 SDLoc SL(Op); 1321 SDValue X = Op.getOperand(0); 1322 SDValue Y = Op.getOperand(1); 1323 1324 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); 1325 1326 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); 1327 1328 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); 1329 1330 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); 1331 1332 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); 1333 1334 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); 1335 1336 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); 1337 1338 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); 1339 1340 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); 1341 1342 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); 1343 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); 1344 1345 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, 1346 NegDivScale0, Mul, DivScale1); 1347 1348 SDValue Scale; 1349 1350 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { 1351 // Workaround a hardware bug on SI where the condition output from div_scale 1352 // is not usable. 1353 1354 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32); 1355 1356 // Figure out if the scale to use for div_fmas. 1357 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); 1358 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); 1359 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); 1360 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); 1361 1362 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); 1363 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); 1364 1365 SDValue Scale0Hi 1366 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); 1367 SDValue Scale1Hi 1368 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); 1369 1370 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); 1371 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); 1372 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); 1373 } else { 1374 Scale = DivScale1.getValue(1); 1375 } 1376 1377 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, 1378 Fma4, Fma3, Mul, Scale); 1379 1380 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); 1381 } 1382 1383 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { 1384 EVT VT = Op.getValueType(); 1385 1386 if (VT == MVT::f32) 1387 return LowerFDIV32(Op, DAG); 1388 1389 if (VT == MVT::f64) 1390 return LowerFDIV64(Op, DAG); 1391 1392 llvm_unreachable("Unexpected type for fdiv"); 1393 } 1394 1395 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 1396 SDLoc DL(Op); 1397 StoreSDNode *Store = cast<StoreSDNode>(Op); 1398 EVT VT = Store->getMemoryVT(); 1399 1400 // These stores are legal. 1401 if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { 1402 if (VT.isVector() && VT.getVectorNumElements() > 4) 1403 return ScalarizeVectorStore(Op, DAG); 1404 return SDValue(); 1405 } 1406 1407 SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); 1408 if (Ret.getNode()) 1409 return Ret; 1410 1411 if (VT.isVector() && VT.getVectorNumElements() >= 8) 1412 return ScalarizeVectorStore(Op, DAG); 1413 1414 if (VT == MVT::i1) 1415 return DAG.getTruncStore(Store->getChain(), DL, 1416 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), 1417 Store->getBasePtr(), MVT::i1, Store->getMemOperand()); 1418 1419 return SDValue(); 1420 } 1421 1422 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 1423 SDLoc DL(Op); 1424 EVT VT = Op.getValueType(); 1425 SDValue Arg = Op.getOperand(0); 1426 // TODO: Should this propagate fast-math-flags? 1427 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, 1428 DAG.getNode(ISD::FMUL, DL, VT, Arg, 1429 DAG.getConstantFP(0.5/M_PI, DL, 1430 VT))); 1431 1432 switch (Op.getOpcode()) { 1433 case ISD::FCOS: 1434 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); 1435 case ISD::FSIN: 1436 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); 1437 default: 1438 llvm_unreachable("Wrong trig opcode"); 1439 } 1440 } 1441 1442 //===----------------------------------------------------------------------===// 1443 // Custom DAG optimizations 1444 //===----------------------------------------------------------------------===// 1445 1446 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, 1447 DAGCombinerInfo &DCI) const { 1448 EVT VT = N->getValueType(0); 1449 EVT ScalarVT = VT.getScalarType(); 1450 if (ScalarVT != MVT::f32) 1451 return SDValue(); 1452 1453 SelectionDAG &DAG = DCI.DAG; 1454 SDLoc DL(N); 1455 1456 SDValue Src = N->getOperand(0); 1457 EVT SrcVT = Src.getValueType(); 1458 1459 // TODO: We could try to match extracting the higher bytes, which would be 1460 // easier if i8 vectors weren't promoted to i32 vectors, particularly after 1461 // types are legalized. v4i8 -> v4f32 is probably the only case to worry 1462 // about in practice. 1463 if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { 1464 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { 1465 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); 1466 DCI.AddToWorklist(Cvt.getNode()); 1467 return Cvt; 1468 } 1469 } 1470 1471 // We are primarily trying to catch operations on illegal vector types 1472 // before they are expanded. 1473 // For scalars, we can use the more flexible method of checking masked bits 1474 // after legalization. 1475 if (!DCI.isBeforeLegalize() || 1476 !SrcVT.isVector() || 1477 SrcVT.getVectorElementType() != MVT::i8) { 1478 return SDValue(); 1479 } 1480 1481 assert(DCI.isBeforeLegalize() && "Unexpected legal type"); 1482 1483 // Weird sized vectors are a pain to handle, but we know 3 is really the same 1484 // size as 4. 1485 unsigned NElts = SrcVT.getVectorNumElements(); 1486 if (!SrcVT.isSimple() && NElts != 3) 1487 return SDValue(); 1488 1489 // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to 1490 // prevent a mess from expanding to v4i32 and repacking. 1491 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { 1492 EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); 1493 EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); 1494 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); 1495 LoadSDNode *Load = cast<LoadSDNode>(Src); 1496 1497 unsigned AS = Load->getAddressSpace(); 1498 unsigned Align = Load->getAlignment(); 1499 Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); 1500 unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); 1501 1502 // Don't try to replace the load if we have to expand it due to alignment 1503 // problems. Otherwise we will end up scalarizing the load, and trying to 1504 // repack into the vector for no real reason. 1505 if (Align < ABIAlignment && 1506 !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { 1507 return SDValue(); 1508 } 1509 1510 SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, 1511 Load->getChain(), 1512 Load->getBasePtr(), 1513 LoadVT, 1514 Load->getMemOperand()); 1515 1516 // Make sure successors of the original load stay after it by updating 1517 // them to use the new Chain. 1518 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); 1519 1520 SmallVector<SDValue, 4> Elts; 1521 if (RegVT.isVector()) 1522 DAG.ExtractVectorElements(NewLoad, Elts); 1523 else 1524 Elts.push_back(NewLoad); 1525 1526 SmallVector<SDValue, 4> Ops; 1527 1528 unsigned EltIdx = 0; 1529 for (SDValue Elt : Elts) { 1530 unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); 1531 for (unsigned I = 0; I < ComponentsInElt; ++I) { 1532 unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; 1533 SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); 1534 DCI.AddToWorklist(Cvt.getNode()); 1535 Ops.push_back(Cvt); 1536 } 1537 1538 ++EltIdx; 1539 } 1540 1541 assert(Ops.size() == NElts); 1542 1543 return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); 1544 } 1545 1546 return SDValue(); 1547 } 1548 1549 /// \brief Return true if the given offset Size in bytes can be folded into 1550 /// the immediate offsets of a memory instruction for the given address space. 1551 static bool canFoldOffset(unsigned OffsetSize, unsigned AS, 1552 const AMDGPUSubtarget &STI) { 1553 switch (AS) { 1554 case AMDGPUAS::GLOBAL_ADDRESS: { 1555 // MUBUF instructions a 12-bit offset in bytes. 1556 return isUInt<12>(OffsetSize); 1557 } 1558 case AMDGPUAS::CONSTANT_ADDRESS: { 1559 // SMRD instructions have an 8-bit offset in dwords on SI and 1560 // a 20-bit offset in bytes on VI. 1561 if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 1562 return isUInt<20>(OffsetSize); 1563 else 1564 return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); 1565 } 1566 case AMDGPUAS::LOCAL_ADDRESS: 1567 case AMDGPUAS::REGION_ADDRESS: { 1568 // The single offset versions have a 16-bit offset in bytes. 1569 return isUInt<16>(OffsetSize); 1570 } 1571 case AMDGPUAS::PRIVATE_ADDRESS: 1572 // Indirect register addressing does not use any offsets. 1573 default: 1574 return 0; 1575 } 1576 } 1577 1578 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) 1579 1580 // This is a variant of 1581 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), 1582 // 1583 // The normal DAG combiner will do this, but only if the add has one use since 1584 // that would increase the number of instructions. 1585 // 1586 // This prevents us from seeing a constant offset that can be folded into a 1587 // memory instruction's addressing mode. If we know the resulting add offset of 1588 // a pointer can be folded into an addressing offset, we can replace the pointer 1589 // operand with the add of new constant offset. This eliminates one of the uses, 1590 // and may allow the remaining use to also be simplified. 1591 // 1592 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, 1593 unsigned AddrSpace, 1594 DAGCombinerInfo &DCI) const { 1595 SDValue N0 = N->getOperand(0); 1596 SDValue N1 = N->getOperand(1); 1597 1598 if (N0.getOpcode() != ISD::ADD) 1599 return SDValue(); 1600 1601 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1); 1602 if (!CN1) 1603 return SDValue(); 1604 1605 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 1606 if (!CAdd) 1607 return SDValue(); 1608 1609 // If the resulting offset is too large, we can't fold it into the addressing 1610 // mode offset. 1611 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); 1612 if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) 1613 return SDValue(); 1614 1615 SelectionDAG &DAG = DCI.DAG; 1616 SDLoc SL(N); 1617 EVT VT = N->getValueType(0); 1618 1619 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); 1620 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); 1621 1622 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); 1623 } 1624 1625 SDValue SITargetLowering::performAndCombine(SDNode *N, 1626 DAGCombinerInfo &DCI) const { 1627 if (DCI.isBeforeLegalize()) 1628 return SDValue(); 1629 1630 SelectionDAG &DAG = DCI.DAG; 1631 1632 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> 1633 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) 1634 SDValue LHS = N->getOperand(0); 1635 SDValue RHS = N->getOperand(1); 1636 1637 if (LHS.getOpcode() == ISD::SETCC && 1638 RHS.getOpcode() == ISD::SETCC) { 1639 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); 1640 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); 1641 1642 SDValue X = LHS.getOperand(0); 1643 SDValue Y = RHS.getOperand(0); 1644 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) 1645 return SDValue(); 1646 1647 if (LCC == ISD::SETO) { 1648 if (X != LHS.getOperand(1)) 1649 return SDValue(); 1650 1651 if (RCC == ISD::SETUNE) { 1652 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); 1653 if (!C1 || !C1->isInfinity() || C1->isNegative()) 1654 return SDValue(); 1655 1656 const uint32_t Mask = SIInstrFlags::N_NORMAL | 1657 SIInstrFlags::N_SUBNORMAL | 1658 SIInstrFlags::N_ZERO | 1659 SIInstrFlags::P_ZERO | 1660 SIInstrFlags::P_SUBNORMAL | 1661 SIInstrFlags::P_NORMAL; 1662 1663 static_assert(((~(SIInstrFlags::S_NAN | 1664 SIInstrFlags::Q_NAN | 1665 SIInstrFlags::N_INFINITY | 1666 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, 1667 "mask not equal"); 1668 1669 SDLoc DL(N); 1670 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 1671 X, DAG.getConstant(Mask, DL, MVT::i32)); 1672 } 1673 } 1674 } 1675 1676 return SDValue(); 1677 } 1678 1679 SDValue SITargetLowering::performOrCombine(SDNode *N, 1680 DAGCombinerInfo &DCI) const { 1681 SelectionDAG &DAG = DCI.DAG; 1682 SDValue LHS = N->getOperand(0); 1683 SDValue RHS = N->getOperand(1); 1684 1685 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) 1686 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && 1687 RHS.getOpcode() == AMDGPUISD::FP_CLASS) { 1688 SDValue Src = LHS.getOperand(0); 1689 if (Src != RHS.getOperand(0)) 1690 return SDValue(); 1691 1692 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); 1693 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); 1694 if (!CLHS || !CRHS) 1695 return SDValue(); 1696 1697 // Only 10 bits are used. 1698 static const uint32_t MaxMask = 0x3ff; 1699 1700 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; 1701 SDLoc DL(N); 1702 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 1703 Src, DAG.getConstant(NewMask, DL, MVT::i32)); 1704 } 1705 1706 return SDValue(); 1707 } 1708 1709 SDValue SITargetLowering::performClassCombine(SDNode *N, 1710 DAGCombinerInfo &DCI) const { 1711 SelectionDAG &DAG = DCI.DAG; 1712 SDValue Mask = N->getOperand(1); 1713 1714 // fp_class x, 0 -> false 1715 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { 1716 if (CMask->isNullValue()) 1717 return DAG.getConstant(0, SDLoc(N), MVT::i1); 1718 } 1719 1720 return SDValue(); 1721 } 1722 1723 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { 1724 switch (Opc) { 1725 case ISD::FMAXNUM: 1726 return AMDGPUISD::FMAX3; 1727 case ISD::SMAX: 1728 return AMDGPUISD::SMAX3; 1729 case ISD::UMAX: 1730 return AMDGPUISD::UMAX3; 1731 case ISD::FMINNUM: 1732 return AMDGPUISD::FMIN3; 1733 case ISD::SMIN: 1734 return AMDGPUISD::SMIN3; 1735 case ISD::UMIN: 1736 return AMDGPUISD::UMIN3; 1737 default: 1738 llvm_unreachable("Not a min/max opcode"); 1739 } 1740 } 1741 1742 SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, 1743 DAGCombinerInfo &DCI) const { 1744 SelectionDAG &DAG = DCI.DAG; 1745 1746 unsigned Opc = N->getOpcode(); 1747 SDValue Op0 = N->getOperand(0); 1748 SDValue Op1 = N->getOperand(1); 1749 1750 // Only do this if the inner op has one use since this will just increases 1751 // register pressure for no benefit. 1752 1753 // max(max(a, b), c) 1754 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { 1755 SDLoc DL(N); 1756 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 1757 DL, 1758 N->getValueType(0), 1759 Op0.getOperand(0), 1760 Op0.getOperand(1), 1761 Op1); 1762 } 1763 1764 // max(a, max(b, c)) 1765 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { 1766 SDLoc DL(N); 1767 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 1768 DL, 1769 N->getValueType(0), 1770 Op0, 1771 Op1.getOperand(0), 1772 Op1.getOperand(1)); 1773 } 1774 1775 return SDValue(); 1776 } 1777 1778 SDValue SITargetLowering::performSetCCCombine(SDNode *N, 1779 DAGCombinerInfo &DCI) const { 1780 SelectionDAG &DAG = DCI.DAG; 1781 SDLoc SL(N); 1782 1783 SDValue LHS = N->getOperand(0); 1784 SDValue RHS = N->getOperand(1); 1785 EVT VT = LHS.getValueType(); 1786 1787 if (VT != MVT::f32 && VT != MVT::f64) 1788 return SDValue(); 1789 1790 // Match isinf pattern 1791 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) 1792 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 1793 if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { 1794 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 1795 if (!CRHS) 1796 return SDValue(); 1797 1798 const APFloat &APF = CRHS->getValueAPF(); 1799 if (APF.isInfinity() && !APF.isNegative()) { 1800 unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; 1801 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), 1802 DAG.getConstant(Mask, SL, MVT::i32)); 1803 } 1804 } 1805 1806 return SDValue(); 1807 } 1808 1809 SDValue SITargetLowering::PerformDAGCombine(SDNode *N, 1810 DAGCombinerInfo &DCI) const { 1811 SelectionDAG &DAG = DCI.DAG; 1812 SDLoc DL(N); 1813 1814 switch (N->getOpcode()) { 1815 default: 1816 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 1817 case ISD::SETCC: 1818 return performSetCCCombine(N, DCI); 1819 case ISD::FMAXNUM: // TODO: What about fmax_legacy? 1820 case ISD::FMINNUM: 1821 case ISD::SMAX: 1822 case ISD::SMIN: 1823 case ISD::UMAX: 1824 case ISD::UMIN: { 1825 if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && 1826 N->getValueType(0) != MVT::f64 && 1827 getTargetMachine().getOptLevel() > CodeGenOpt::None) 1828 return performMin3Max3Combine(N, DCI); 1829 break; 1830 } 1831 1832 case AMDGPUISD::CVT_F32_UBYTE0: 1833 case AMDGPUISD::CVT_F32_UBYTE1: 1834 case AMDGPUISD::CVT_F32_UBYTE2: 1835 case AMDGPUISD::CVT_F32_UBYTE3: { 1836 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; 1837 1838 SDValue Src = N->getOperand(0); 1839 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); 1840 1841 APInt KnownZero, KnownOne; 1842 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 1843 !DCI.isBeforeLegalizeOps()); 1844 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 1845 if (TLO.ShrinkDemandedConstant(Src, Demanded) || 1846 TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { 1847 DCI.CommitTargetLoweringOpt(TLO); 1848 } 1849 1850 break; 1851 } 1852 1853 case ISD::UINT_TO_FP: { 1854 return performUCharToFloatCombine(N, DCI); 1855 1856 case ISD::FADD: { 1857 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 1858 break; 1859 1860 EVT VT = N->getValueType(0); 1861 if (VT != MVT::f32) 1862 break; 1863 1864 // Only do this if we are not trying to support denormals. v_mad_f32 does 1865 // not support denormals ever. 1866 if (Subtarget->hasFP32Denormals()) 1867 break; 1868 1869 SDValue LHS = N->getOperand(0); 1870 SDValue RHS = N->getOperand(1); 1871 1872 // These should really be instruction patterns, but writing patterns with 1873 // source modiifiers is a pain. 1874 1875 // fadd (fadd (a, a), b) -> mad 2.0, a, b 1876 if (LHS.getOpcode() == ISD::FADD) { 1877 SDValue A = LHS.getOperand(0); 1878 if (A == LHS.getOperand(1)) { 1879 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 1880 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); 1881 } 1882 } 1883 1884 // fadd (b, fadd (a, a)) -> mad 2.0, a, b 1885 if (RHS.getOpcode() == ISD::FADD) { 1886 SDValue A = RHS.getOperand(0); 1887 if (A == RHS.getOperand(1)) { 1888 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 1889 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); 1890 } 1891 } 1892 1893 return SDValue(); 1894 } 1895 case ISD::FSUB: { 1896 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 1897 break; 1898 1899 EVT VT = N->getValueType(0); 1900 1901 // Try to get the fneg to fold into the source modifier. This undoes generic 1902 // DAG combines and folds them into the mad. 1903 // 1904 // Only do this if we are not trying to support denormals. v_mad_f32 does 1905 // not support denormals ever. 1906 if (VT == MVT::f32 && 1907 !Subtarget->hasFP32Denormals()) { 1908 SDValue LHS = N->getOperand(0); 1909 SDValue RHS = N->getOperand(1); 1910 if (LHS.getOpcode() == ISD::FADD) { 1911 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) 1912 1913 SDValue A = LHS.getOperand(0); 1914 if (A == LHS.getOperand(1)) { 1915 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 1916 SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); 1917 1918 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); 1919 } 1920 } 1921 1922 if (RHS.getOpcode() == ISD::FADD) { 1923 // (fsub c, (fadd a, a)) -> mad -2.0, a, c 1924 1925 SDValue A = RHS.getOperand(0); 1926 if (A == RHS.getOperand(1)) { 1927 const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32); 1928 return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); 1929 } 1930 } 1931 1932 return SDValue(); 1933 } 1934 1935 break; 1936 } 1937 } 1938 case ISD::LOAD: 1939 case ISD::STORE: 1940 case ISD::ATOMIC_LOAD: 1941 case ISD::ATOMIC_STORE: 1942 case ISD::ATOMIC_CMP_SWAP: 1943 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: 1944 case ISD::ATOMIC_SWAP: 1945 case ISD::ATOMIC_LOAD_ADD: 1946 case ISD::ATOMIC_LOAD_SUB: 1947 case ISD::ATOMIC_LOAD_AND: 1948 case ISD::ATOMIC_LOAD_OR: 1949 case ISD::ATOMIC_LOAD_XOR: 1950 case ISD::ATOMIC_LOAD_NAND: 1951 case ISD::ATOMIC_LOAD_MIN: 1952 case ISD::ATOMIC_LOAD_MAX: 1953 case ISD::ATOMIC_LOAD_UMIN: 1954 case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. 1955 if (DCI.isBeforeLegalize()) 1956 break; 1957 1958 MemSDNode *MemNode = cast<MemSDNode>(N); 1959 SDValue Ptr = MemNode->getBasePtr(); 1960 1961 // TODO: We could also do this for multiplies. 1962 unsigned AS = MemNode->getAddressSpace(); 1963 if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { 1964 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); 1965 if (NewPtr) { 1966 SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end()); 1967 1968 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr; 1969 return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); 1970 } 1971 } 1972 break; 1973 } 1974 case ISD::AND: 1975 return performAndCombine(N, DCI); 1976 case ISD::OR: 1977 return performOrCombine(N, DCI); 1978 case AMDGPUISD::FP_CLASS: 1979 return performClassCombine(N, DCI); 1980 } 1981 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 1982 } 1983 1984 /// \brief Analyze the possible immediate value Op 1985 /// 1986 /// Returns -1 if it isn't an immediate, 0 if it's and inline immediate 1987 /// and the immediate value if it's a literal immediate 1988 int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { 1989 1990 const SIInstrInfo *TII = 1991 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 1992 1993 if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { 1994 if (TII->isInlineConstant(Node->getAPIntValue())) 1995 return 0; 1996 1997 uint64_t Val = Node->getZExtValue(); 1998 return isUInt<32>(Val) ? Val : -1; 1999 } 2000 2001 if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) { 2002 if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt())) 2003 return 0; 2004 2005 if (Node->getValueType(0) == MVT::f32) 2006 return FloatToBits(Node->getValueAPF().convertToFloat()); 2007 2008 return -1; 2009 } 2010 2011 return -1; 2012 } 2013 2014 /// \brief Helper function for adjustWritemask 2015 static unsigned SubIdx2Lane(unsigned Idx) { 2016 switch (Idx) { 2017 default: return 0; 2018 case AMDGPU::sub0: return 0; 2019 case AMDGPU::sub1: return 1; 2020 case AMDGPU::sub2: return 2; 2021 case AMDGPU::sub3: return 3; 2022 } 2023 } 2024 2025 /// \brief Adjust the writemask of MIMG instructions 2026 void SITargetLowering::adjustWritemask(MachineSDNode *&Node, 2027 SelectionDAG &DAG) const { 2028 SDNode *Users[4] = { }; 2029 unsigned Lane = 0; 2030 unsigned OldDmask = Node->getConstantOperandVal(0); 2031 unsigned NewDmask = 0; 2032 2033 // Try to figure out the used register components 2034 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); 2035 I != E; ++I) { 2036 2037 // Abort if we can't understand the usage 2038 if (!I->isMachineOpcode() || 2039 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) 2040 return; 2041 2042 // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. 2043 // Note that subregs are packed, i.e. Lane==0 is the first bit set 2044 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit 2045 // set, etc. 2046 Lane = SubIdx2Lane(I->getConstantOperandVal(1)); 2047 2048 // Set which texture component corresponds to the lane. 2049 unsigned Comp; 2050 for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { 2051 assert(Dmask); 2052 Comp = countTrailingZeros(Dmask); 2053 Dmask &= ~(1 << Comp); 2054 } 2055 2056 // Abort if we have more than one user per component 2057 if (Users[Lane]) 2058 return; 2059 2060 Users[Lane] = *I; 2061 NewDmask |= 1 << Comp; 2062 } 2063 2064 // Abort if there's no change 2065 if (NewDmask == OldDmask) 2066 return; 2067 2068 // Adjust the writemask in the node 2069 std::vector<SDValue> Ops; 2070 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); 2071 Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end()); 2072 Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); 2073 2074 // If we only got one lane, replace it with a copy 2075 // (if NewDmask has only one bit set...) 2076 if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { 2077 SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), 2078 MVT::i32); 2079 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 2080 SDLoc(), Users[Lane]->getValueType(0), 2081 SDValue(Node, 0), RC); 2082 DAG.ReplaceAllUsesWith(Users[Lane], Copy); 2083 return; 2084 } 2085 2086 // Update the users of the node with the new indices 2087 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { 2088 2089 SDNode *User = Users[i]; 2090 if (!User) 2091 continue; 2092 2093 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); 2094 DAG.UpdateNodeOperands(User, User->getOperand(0), Op); 2095 2096 switch (Idx) { 2097 default: break; 2098 case AMDGPU::sub0: Idx = AMDGPU::sub1; break; 2099 case AMDGPU::sub1: Idx = AMDGPU::sub2; break; 2100 case AMDGPU::sub2: Idx = AMDGPU::sub3; break; 2101 } 2102 } 2103 } 2104 2105 static bool isFrameIndexOp(SDValue Op) { 2106 if (Op.getOpcode() == ISD::AssertZext) 2107 Op = Op.getOperand(0); 2108 2109 return isa<FrameIndexSDNode>(Op); 2110 } 2111 2112 /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) 2113 /// with frame index operands. 2114 /// LLVM assumes that inputs are to these instructions are registers. 2115 void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, 2116 SelectionDAG &DAG) const { 2117 2118 SmallVector<SDValue, 8> Ops; 2119 for (unsigned i = 0; i < Node->getNumOperands(); ++i) { 2120 if (!isFrameIndexOp(Node->getOperand(i))) { 2121 Ops.push_back(Node->getOperand(i)); 2122 continue; 2123 } 2124 2125 SDLoc DL(Node); 2126 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, 2127 Node->getOperand(i).getValueType(), 2128 Node->getOperand(i)), 0)); 2129 } 2130 2131 DAG.UpdateNodeOperands(Node, Ops); 2132 } 2133 2134 /// \brief Fold the instructions after selecting them. 2135 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, 2136 SelectionDAG &DAG) const { 2137 const SIInstrInfo *TII = 2138 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 2139 2140 if (TII->isMIMG(Node->getMachineOpcode())) 2141 adjustWritemask(Node, DAG); 2142 2143 if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || 2144 Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { 2145 legalizeTargetIndependentNode(Node, DAG); 2146 return Node; 2147 } 2148 return Node; 2149 } 2150 2151 /// \brief Assign the register class depending on the number of 2152 /// bits set in the writemask 2153 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, 2154 SDNode *Node) const { 2155 const SIInstrInfo *TII = 2156 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 2157 2158 MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 2159 2160 if (TII->isVOP3(MI->getOpcode())) { 2161 // Make sure constant bus requirements are respected. 2162 TII->legalizeOperandsVOP3(MRI, MI); 2163 return; 2164 } 2165 2166 if (TII->isMIMG(*MI)) { 2167 unsigned VReg = MI->getOperand(0).getReg(); 2168 unsigned Writemask = MI->getOperand(1).getImm(); 2169 unsigned BitsSet = 0; 2170 for (unsigned i = 0; i < 4; ++i) 2171 BitsSet += Writemask & (1 << i) ? 1 : 0; 2172 2173 const TargetRegisterClass *RC; 2174 switch (BitsSet) { 2175 default: return; 2176 case 1: RC = &AMDGPU::VGPR_32RegClass; break; 2177 case 2: RC = &AMDGPU::VReg_64RegClass; break; 2178 case 3: RC = &AMDGPU::VReg_96RegClass; break; 2179 } 2180 2181 unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); 2182 MI->setDesc(TII->get(NewOpcode)); 2183 MRI.setRegClass(VReg, RC); 2184 return; 2185 } 2186 2187 // Replace unused atomics with the no return version. 2188 int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode()); 2189 if (NoRetAtomicOp != -1) { 2190 if (!Node->hasAnyUseOfValue(0)) { 2191 MI->setDesc(TII->get(NoRetAtomicOp)); 2192 MI->RemoveOperand(0); 2193 } 2194 2195 return; 2196 } 2197 } 2198 2199 static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { 2200 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); 2201 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); 2202 } 2203 2204 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, 2205 SDLoc DL, 2206 SDValue Ptr) const { 2207 const SIInstrInfo *TII = 2208 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 2209 2210 // Build the half of the subregister with the constants before building the 2211 // full 128-bit register. If we are building multiple resource descriptors, 2212 // this will allow CSEing of the 2-component register. 2213 const SDValue Ops0[] = { 2214 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), 2215 buildSMovImm32(DAG, DL, 0), 2216 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 2217 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), 2218 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) 2219 }; 2220 2221 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, 2222 MVT::v2i32, Ops0), 0); 2223 2224 // Combine the constants and the pointer. 2225 const SDValue Ops1[] = { 2226 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), 2227 Ptr, 2228 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), 2229 SubRegHi, 2230 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) 2231 }; 2232 2233 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); 2234 } 2235 2236 /// \brief Return a resource descriptor with the 'Add TID' bit enabled 2237 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] 2238 /// of the resource descriptor) to create an offset, which is added to 2239 /// the resource pointer. 2240 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, 2241 SDLoc DL, 2242 SDValue Ptr, 2243 uint32_t RsrcDword1, 2244 uint64_t RsrcDword2And3) const { 2245 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); 2246 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); 2247 if (RsrcDword1) { 2248 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, 2249 DAG.getConstant(RsrcDword1, DL, MVT::i32)), 2250 0); 2251 } 2252 2253 SDValue DataLo = buildSMovImm32(DAG, DL, 2254 RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); 2255 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); 2256 2257 const SDValue Ops[] = { 2258 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), 2259 PtrLo, 2260 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 2261 PtrHi, 2262 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), 2263 DataLo, 2264 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), 2265 DataHi, 2266 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) 2267 }; 2268 2269 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); 2270 } 2271 2272 MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, 2273 SDLoc DL, 2274 SDValue Ptr) const { 2275 const SIInstrInfo *TII = 2276 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 2277 2278 return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23()); 2279 } 2280 2281 SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 2282 const TargetRegisterClass *RC, 2283 unsigned Reg, EVT VT) const { 2284 SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); 2285 2286 return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), 2287 cast<RegisterSDNode>(VReg)->getReg(), VT); 2288 } 2289 2290 //===----------------------------------------------------------------------===// 2291 // SI Inline Assembly Support 2292 //===----------------------------------------------------------------------===// 2293 2294 std::pair<unsigned, const TargetRegisterClass *> 2295 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 2296 StringRef Constraint, 2297 MVT VT) const { 2298 if (Constraint == "r") { 2299 switch(VT.SimpleTy) { 2300 default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); 2301 case MVT::i64: 2302 return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); 2303 case MVT::i32: 2304 return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); 2305 } 2306 } 2307 2308 if (Constraint.size() > 1) { 2309 const TargetRegisterClass *RC = nullptr; 2310 if (Constraint[1] == 'v') { 2311 RC = &AMDGPU::VGPR_32RegClass; 2312 } else if (Constraint[1] == 's') { 2313 RC = &AMDGPU::SGPR_32RegClass; 2314 } 2315 2316 if (RC) { 2317 uint32_t Idx; 2318 bool Failed = Constraint.substr(2).getAsInteger(10, Idx); 2319 if (!Failed && Idx < RC->getNumRegs()) 2320 return std::make_pair(RC->getRegister(Idx), RC); 2321 } 2322 } 2323 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 2324 } 2325