//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#ifdef _MSC_VER
// Provide M_PI.
#define _USE_MATH_DEFINES
#include <cmath>
#endif

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"

using namespace llvm;

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}

SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const SISubtarget &STI)
    : AMDGPUTargetLowering(TM, STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  computeRegisterProperties(STI.getRegisterInfo());

  // We need to custom lower vector stores from local memory.
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i1, Custom);

  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);

  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
                 MVT::v2i64, MVT::v2f64}) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::SCALAR_TO_VECTOR:
        break;
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo
  // that is expanded to avoid having two separate loops in case the index is
  // a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling
  // and output demarshalling.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  // We can't return success/failure, only the old value,
  // so let LLVM add the comparison.
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

  if (getSubtarget()->hasFlatAddressSpace()) {
    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
  }

  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // This is s_memtime on SI and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  setOperationAction(ISD::TRAP, MVT::Other, Custom);

  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);

  if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  }

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::SMIN);
  setTargetDAGCombine(ISD::SMAX);
  setTargetDAGCombine(ISD::UMIN);
  setTargetDAGCombine(ISD::UMAX);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::FCANONICALIZE);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ATOMIC_LOAD);
  setTargetDAGCombine(ISD::ATOMIC_STORE);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  setTargetDAGCombine(ISD::ATOMIC_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);

  setSchedulingPreference(Sched::RegPressure);
}

const SISubtarget *SITargetLowering::getSubtarget() const {
  return static_cast<const SISubtarget *>(Subtarget);
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          unsigned IntrID) const {
  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(CI.getType());
    Info.ptrVal = CI.getOperand(0);
    Info.align = 0;
    Info.vol = false;
    Info.readMem = true;
    Info.writeMem = true;
    return true;
  default:
    return false;
  }
}

bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
                                          EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
  // Flat instructions do not have offsets, and only have the register
  // address.
  return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
}

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
  // additionally can do r + r + i with addr64. 32-bit has more addressing
  // mode options. Depending on the resource constant, it can also do
  // (i64 r0) + (i32 r1) * (i14 i).
  //
  // Private arrays end up using a scratch buffer most of the time, so also
  // assume those use MUBUF instructions. Scratch loads / stores are currently
  // implemented as mubuf instructions with offen bit set, so slightly
  // different than the normal addr64.
  if (!isUInt<12>(AM.BaseOffs))
    return false;

  // FIXME: Since we can split immediate into soffset and immediate offset,
  // would it make sense to allow any immediate?

  switch (AM.Scale) {
  case 0: // r + i or just i, depending on HasBaseReg.
    return true;
  case 1:
    return true; // We have r + r or r + i.
  case 2:
    if (AM.HasBaseReg) {
      // Reject 2 * r + r.
      return false;
    }

    // Allow 2 * r as r + r.
    // Or 2 * r + i is allowed as r + r + i.
    return true;
  default: // Don't allow n * r
    return false;
  }
}

bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             unsigned AS) const {
  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  switch (AS) {
  case AMDGPUAS::GLOBAL_ADDRESS: {
    if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
      // Assume that we will use FLAT for all global memory accesses
      // on VI.
      // FIXME: This assumption is currently wrong. On VI we still use
      // MUBUF instructions for the r + i addressing mode. As currently
      // implemented, the MUBUF instructions only work on buffers < 4GB.
      // It may be possible to support > 4GB buffers with MUBUF instructions,
      // by setting the stride value in the resource descriptor which would
      // increase the size limit to (stride * 4GB). However, this is risky,
      // because it has never been validated.
      return isLegalFlatAddressingMode(AM);
    }

    return isLegalMUBUFAddressingMode(AM);
  }
  case AMDGPUAS::CONSTANT_ADDRESS: {
    // If the offset isn't a multiple of 4, it probably isn't going to be
    // correctly aligned.
    // FIXME: Can we get the real alignment here?
    if (AM.BaseOffs % 4 != 0)
      return isLegalMUBUFAddressingMode(AM);

    // There are no SMRD extloads, so if we have to do a small type access we
    // will use a MUBUF load.
    // FIXME?: We also need to do this if unaligned, but we don't know the
    // alignment here.
    if (DL.getTypeStoreSize(Ty) < 4)
      return isLegalMUBUFAddressingMode(AM);

    if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
      // SMRD instructions have an 8-bit, dword offset on SI.
      if (!isUInt<8>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
      // On CI+, this can also be a 32-bit literal constant offset. If it fits
      // in 8 bits, it can use a smaller encoding.
      if (!isUInt<32>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) {
      // On VI, these use the SMEM format and the offset is a 20-bit byte
      // offset.
      if (!isUInt<20>(AM.BaseOffs))
        return false;
    } else
      llvm_unreachable("unhandled generation");

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  }

  case AMDGPUAS::PRIVATE_ADDRESS:
    return isLegalMUBUFAddressingMode(AM);

  case AMDGPUAS::LOCAL_ADDRESS:
  case AMDGPUAS::REGION_ADDRESS: {
    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
    // field.
    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
    // an 8-bit dword offset but we don't know the alignment here.
    if (!isUInt<16>(AM.BaseOffs))
      return false;

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  }
  case AMDGPUAS::FLAT_ADDRESS:
  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
    // For an unknown address space, this usually means that this is for some
    // reason being used for pure arithmetic, and not based on some addressing
    // computation. We don't have instructions that compute pointers with any
    // addressing modes, so treat them as having no offset like flat
    // instructions.
    return isLegalFlatAddressingMode(AM);

  default:
    llvm_unreachable("unhandled address space");
  }
}

bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                      unsigned AddrSpace,
                                                      unsigned Align,
                                                      bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
  // which isn't a simple VT.
  // Until MVT is extended to handle this, simply check for the size and
  // rely on the condition below: allow accesses if the size is a multiple of 4.
  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
                           VT.getStoreSize() > 16)) {
    return false;
  }

  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
    // ds_read/write_b64 require 8-byte alignment, but we can do a 4-byte
    // aligned, 8-byte access in a single operation using ds_read2/write2_b32
    // with adjacent offsets.
    bool AlignedBy4 = (Align % 4 == 0);
    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4;
  }

  if (Subtarget->hasUnalignedBufferAccess()) {
    // If we have a uniform constant load, it still requires using a slow
    // buffer instruction if unaligned.
    if (IsFast) {
      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
        (Align % 4 == 0) : true;
    }

    return true;
  }

  // Smaller than dword values must be aligned.
  if (VT.bitsLT(MVT::i32))
    return false;

  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  // This applies to private, global, and constant memory.
  if (IsFast)
    *IsFast = true;

  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
}

EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
                                          unsigned SrcAlign, bool IsMemset,
                                          bool ZeroMemset,
                                          bool MemcpyStrSrc,
                                          MachineFunction &MF) const {
  // FIXME: Should account for address space here.

  // The default fallback uses the private pointer size as a guess for a type
  // to use. Make sure we switch these to 64-bit accesses.

  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
    return MVT::v4i32;

  if (Size >= 8 && DstAlign >= 4)
    return MVT::v2i32;

  // Use the default.
  return MVT::Other;
}

static bool isFlatGlobalAddrSpace(unsigned AS) {
  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
         AS == AMDGPUAS::FLAT_ADDRESS ||
         AS == AMDGPUAS::CONSTANT_ADDRESS;
}

bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {
  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
}

bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
  const MemSDNode *MemNode = cast<MemSDNode>(N);
  const Value *Ptr = MemNode->getMemOperand()->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(EVT VT) const {
  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
    return TypeSplitVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {
  // FIXME: Could be smarter if called for vector constants.
  return true;
}

bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
  // i16 is not desirable unless it is a load or a store.
  if (VT == MVT::i16 && Op != ISD::LOAD && Op != ISD::STORE)
    return false;

  // SimplifySetCC uses this function to determine whether or not it should
  // create setcc with i1 operands. We don't have instructions for i1 setcc.
  if (VT == MVT::i1 && Op == ISD::SETCC)
    return false;

  return TargetLowering::isTypeDesirableForOp(Op, VT);
}

SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
                                            const SDLoc &SL, SDValue Chain,
                                            unsigned Offset) const {
  const DataLayout &DL = DAG.getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  unsigned InputPtrReg =
    TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);

  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
                                       MRI.getLiveInVirtReg(InputPtrReg),
                                       PtrVT);
  return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                     DAG.getConstant(Offset, SL, PtrVT));
}

SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         const SDLoc &SL, SDValue Chain,
                                         unsigned Offset, bool Signed) const {
  const DataLayout &DL = DAG.getDataLayout();
  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  SDValue PtrOffset = DAG.getUNDEF(PtrVT);
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));

  unsigned Align = DL.getABITypeAlignment(Ty);

  ISD::LoadExtType ExtTy = Signed ?
    ISD::SEXTLOAD : ISD::ZEXTLOAD;
  if (MemVT.isFloatingPoint())
    ExtTy = ISD::EXTLOAD;

  SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
  return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset,
                     PtrInfo, MemVT, Align,
                     MachineMemOperand::MONonTemporal |
                     MachineMemOperand::MODereferenceable |
                     MachineMemOperand::MOInvariant);
}

SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FType = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
    const Function *Fn = MF.getFunction();
    DiagnosticInfoUnsupported NoGraphicsHSA(
        *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    DAG.getContext()->diagnose(NoGraphicsHSA);
    return DAG.getEntryNode();
  }

  // Create stack objects that are used for emitting debugger prologue if
  // "amdgpu-debugger-emit-prologue" attribute was specified.
  if (ST.debuggerEmitPrologue())
    createDebuggerPrologueStackObjects(MF);

  SmallVector<ISD::InputArg, 16> Splits;
  BitVector Skipped(Ins.size());

  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];

    // First check if it's a PS input addr.
    if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
        !Arg.Flags.isByVal() && PSInputNum <= 15) {

      if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
        // We can safely skip PS inputs.
        Skipped.set(i);
        ++PSInputNum;
        continue;
      }

      Info->markPSInputAllocated(PSInputNum);
      if (Arg.Used)
        Info->PSInputEna |= 1 << PSInputNum;

      ++PSInputNum;
    }

    if (AMDGPU::isShader(CallConv)) {
      // Second, split vertices into their elements.
      if (Arg.VT.isVector()) {
        ISD::InputArg NewArg = Arg;
        NewArg.Flags.setSplit();
        NewArg.VT = Arg.VT.getVectorElementType();

        // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
        // three or five element vertex only needs three or five registers,
        // NOT four or eight.
        Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
        unsigned NumElements = ParamType->getVectorNumElements();

        for (unsigned j = 0; j != NumElements; ++j) {
          Splits.push_back(NewArg);
          NewArg.PartOffset += NewArg.VT.getStoreSize();
        }
      } else {
        Splits.push_back(Arg);
      }
    }
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
  // PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CallConv == CallingConv::AMDGPU_PS &&
      ((Info->getPSInputAddr() & 0x7F) == 0 ||
       ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
    CCInfo.AllocateReg(AMDGPU::VGPR0);
    CCInfo.AllocateReg(AMDGPU::VGPR1);
    Info->markPSInputAllocated(0);
    Info->PSInputEna |= 1;
  }

  if (!AMDGPU::isShader(CallConv)) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
  } else {
    assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
           !Info->hasWorkItemIDZ());
  }

  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info->hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info->hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info->hasQueuePtr()) {
    unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info->hasKernargSegmentPtr()) {
    unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
    MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info->hasDispatchID()) {
    unsigned DispatchIDReg = Info->addDispatchID(*TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info->hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  if (!AMDGPU::isShader(CallConv))
    analyzeFormalArgumentsCompute(CCInfo, Ins);
  else
    AnalyzeFormalArguments(CCInfo, Splits);

  SmallVector<SDValue, 16> Chains;

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];
    if (Skipped[i]) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    MVT VT = VA.getLocVT();

    if (VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = VA.getLocVT();
      const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
                              VA.getLocMemOffset();
      // The first 36 bytes of the input buffer contain information about
      // thread group and global sizes.
      SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
                                   Offset, Ins[i].Flags.isSExt());
      Chains.push_back(Arg.getValue(1));

      auto *ParamTy =
        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
      if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
        // On SI local pointers are just offsets into LDS, so they are always
        // less than 16 bits. On CI and newer they could potentially be
        // real pointers, so we can't guarantee their size.
        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
                          DAG.getValueType(MVT::i16));
      }

      InVals.push_back(Arg);
      Info->setABIArgOffset(Offset + MemVT.getStoreSize());
      continue;
    }
    assert(VA.isRegLoc() && "Parameter must be in a register!");

    unsigned Reg = VA.getLocReg();

    if (VT == MVT::i64) {
      // For now assume it is a pointer.
      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
                                     &AMDGPU::SReg_64RegClass);
      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Copy);
      continue;
    }

    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.VT.isVector()) {
      // Build a vector from the registers.
      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
      unsigned NumElements = ParamType->getVectorNumElements();

      SmallVector<SDValue, 4> Regs;
      Regs.push_back(Val);
      for (unsigned j = 1; j != NumElements; ++j) {
        Reg = ArgLocs[ArgIdx++].getLocReg();
        Reg = MF.addLiveIn(Reg, RC);

        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
        Regs.push_back(Copy);
      }

      // Fill up the missing vector elements.
      NumElements = Arg.VT.getVectorNumElements() - NumElements;
      Regs.append(NumElements, DAG.getUNDEF(VT));

      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
      continue;
    }

    InVals.push_back(Val);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we
  // read these from the dispatch pointer.

  // Start adding system SGPRs.
  if (Info->hasWorkGroupIDX()) {
    unsigned Reg = Info->addWorkGroupIDX();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupIDY()) {
    unsigned Reg = Info->addWorkGroupIDY();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupIDZ()) {
    unsigned Reg = Info->addWorkGroupIDZ();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupInfo()) {
    unsigned Reg = Info->addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (AMDGPU::isShader(CallConv)) {
      PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
      Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
    } else
      PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  }

  // Now that we've figured out where the scratch register inputs are, see if
  // we should reserve the arguments and use them directly.
  bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
  // Record that we know we have non-spill stack objects so we don't need to
  // check all stack objects later.
  if (HasStackObjects)
    Info->setHasNonSpillStackObjects(true);

  // Everything live out of a block is spilled with fast regalloc, so it's
  // almost certain that spilling will be required.
  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
    HasStackObjects = true;

  if (ST.isAmdCodeObjectV2()) {
    if (HasStackObjects) {
      // If we have stack objects, we unquestionably need the private buffer
      // resource. For the Code Object V2 ABI, this will be the first 4 user
      // SGPR inputs. We can reserve those and use them directly.

      unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
      Info->setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
    } else {
      unsigned ReservedBufferReg
        = TRI->reservedPrivateSegmentBufferReg(MF);
      unsigned ReservedOffsetReg
        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);

      // We tentatively reserve the last registers (skipping the last two
      // which may contain VCC). After register allocation, we'll replace
      // these with the ones immediately after those which were really
      // allocated. In the prologue, copies will be inserted from the argument
      // to these reserved registers.
      Info->setScratchRSrcReg(ReservedBufferReg);
      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  } else {
    unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);

    // Without HSA, relocations are used for the scratch pointer and the
    // buffer resource setup is always inserted in the prologue. Scratch wave
    // offset is still in an input SGPR.
    Info->setScratchRSrcReg(ReservedBufferReg);

    if (HasStackObjects) {
      unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
    } else {
      unsigned ReservedOffsetReg
        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  }

  if (Info->hasWorkItemIDX()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkItemIDY()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkItemIDZ()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Chains.empty())
    return Chain;

  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (!AMDGPU::isShader(CallConv))
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);

  Info->setIfReturnsVoid(Outs.size() == 0);

  SmallVector<ISD::OutputArg, 48> Splits;
  SmallVector<SDValue, 48> SplitVals;

  // Split vectors into their elements.
  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
    const ISD::OutputArg &Out = Outs[i];

    if (Out.VT.isVector()) {
      MVT VT = Out.VT.getVectorElementType();
      ISD::OutputArg NewOut = Out;
      NewOut.Flags.setSplit();
      NewOut.VT = VT;

      // We want the original number of vector elements here, e.g.
      // three or five, not four or eight.
      unsigned NumElements = Out.ArgVT.getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
                                   DAG.getConstant(j, DL, MVT::i32));
        SplitVals.push_back(Elem);
        Splits.push_back(NewOut);
        NewOut.PartOffset += NewOut.VT.getStoreSize();
      }
    } else {
      SplitVals.push_back(OutVals[i]);
      Splits.push_back(Out);
    }
  }

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  AnalyzeReturn(CCInfo, Splits);

  SDValue Flag;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

  // Copy the result values into the output registers.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = SplitVals[realRVLocIdx];

    // Copied from other backends.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    }

    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}

unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                             SelectionDAG &DAG) const {
  unsigned Reg = StringSwitch<unsigned>(RegName)
    .Case("m0", AMDGPU::M0)
    .Case("exec", AMDGPU::EXEC)
    .Case("exec_lo", AMDGPU::EXEC_LO)
    .Case("exec_hi", AMDGPU::EXEC_HI)
    .Case("flat_scratch", AMDGPU::FLAT_SCR)
    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    .Default(AMDGPU::NoRegister);

  if (Reg == AMDGPU::NoRegister) {
    report_fatal_error(Twine("invalid register name \""
                             + StringRef(RegName) + "\"."));
  }

  if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    report_fatal_error(Twine("invalid register \""
                             + StringRef(RegName) + "\" for subtarget."));
  }

  switch (Reg) {
  case AMDGPU::M0:
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
    if (VT.getSizeInBits() == 32)
      return Reg;
    break;
  case AMDGPU::EXEC:
  case AMDGPU::FLAT_SCR:
    if (VT.getSizeInBits() == 64)
      return Reg;
    break;
  default:
    llvm_unreachable("missing register type checking");
  }

  report_fatal_error(Twine("invalid type for register \""
                           + StringRef(RegName) + "\"."));
}

// If kill is not the last instruction, split the block so kill is always a
// proper terminator.
MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
                                                    MachineBasicBlock *BB) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineBasicBlock::iterator SplitPoint(&MI);
  ++SplitPoint;

  if (SplitPoint == BB->end()) {
    // Don't bother with a new block.
    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
    return BB;
  }

  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *SplitBB
    = MF->CreateMachineBasicBlock(BB->getBasicBlock());

  MF->insert(++MachineFunction::iterator(BB), SplitBB);
  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());

  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(SplitBB);

  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
  return SplitBB;
}

// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in
// the wavefront. If the value is uniform and just happens to be in a VGPR,
// this will only do one iteration. In the worst case, this will loop 64 times.
//
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
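//
// The loop body emitted here reads one lane's index with v_readfirstlane_b32,
// masks EXEC down to the lanes that share that index (via v_cmp_eq_u32 and
// s_and_saveexec_b64), writes the index into M0 (or the GPR index register in
// GPR-indexing mode), and branches back while EXEC is still nonzero. The
// caller inserts the actual indexed access at the returned insertion point.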
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
    const SIInstrInfo *TII,
    MachineRegisterInfo &MRI,
    MachineBasicBlock &OrigBB,
    MachineBasicBlock &LoopBB,
    const DebugLoc &DL,
    const MachineOperand &IdxReg,
    unsigned InitReg,
    unsigned ResultReg,
    unsigned PhiReg,
    unsigned InitSaveExecReg,
    int Offset,
    bool UseGPRIdxMode) {
  MachineBasicBlock::iterator I = LoopBB.begin();

  unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
    .addReg(InitReg)
    .addMBB(&OrigBB)
    .addReg(ResultReg)
    .addMBB(&LoopBB);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&OrigBB)
    .addReg(NewExec)
    .addMBB(&LoopBB);

  // Read the next variant <- also loop target.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Compare the just read M0 value to all possible Idx values.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
    .addReg(CurrentIdxReg)
    .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());

  if (UseGPRIdxMode) {
    unsigned IdxReg;
    if (Offset == 0) {
      IdxReg = CurrentIdxReg;
    } else {
      IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
        .addReg(CurrentIdxReg, RegState::Kill)
        .addImm(Offset);
    }

    MachineInstr *SetIdx =
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
      .addReg(IdxReg, RegState::Kill);
    SetIdx->getOperand(2).setIsUndef();
  } else {
    // Move index from VCC into M0.
    if (Offset == 0) {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(CurrentIdxReg, RegState::Kill);
    } else {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(CurrentIdxReg, RegState::Kill)
        .addImm(Offset);
    }
  }

  // Update EXEC, save the original EXEC value to VCC.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  MachineInstr *InsertPt =
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);

  return InsertPt->getIterator();
}

// This has slightly sub-optimal regalloc when the source vector is killed by
// the read. The register allocator does not understand that the kill is
// per-workitem, so the vector is kept alive for the whole loop and we end up
// not reusing a subregister from it, using one more VGPR than necessary. This
// VGPR was saved when this was expanded after register allocation.
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
                                                  MachineBasicBlock &MBB,
                                                  MachineInstr &MI,
                                                  unsigned InitResultReg,
                                                  unsigned PhiReg,
                                                  int Offset,
                                                  bool UseGPRIdxMode) {
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  unsigned DstReg = MI.getOperand(0).getReg();
  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);

  // Save the EXEC mask.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
    .addReg(AMDGPU::EXEC);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  MBB.addSuccessor(LoopBB);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode);

  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    .addReg(SaveExec);

  MI.eraseFromParent();

  return InsPt;
}

// Returns subreg index, offset.
static std::pair<unsigned, int>
computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
                            const TargetRegisterClass *SuperRC,
                            unsigned VecReg,
                            int Offset) {
  int NumElts = SuperRC->getSize() / 4;

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (Offset >= NumElts || Offset < 0)
    return std::make_pair(AMDGPU::sub0, Offset);

  return std::make_pair(AMDGPU::sub0 + Offset, 0);
}

// Return true if the index is an SGPR and was set.
static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
                                 MachineRegisterInfo &MRI,
                                 MachineInstr &MI,
                                 int Offset,
                                 bool UseGPRIdxMode,
                                 bool IsIndirectSrc) {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());

  assert(Idx->getReg() != AMDGPU::NoRegister);

  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
    return false;

  if (UseGPRIdxMode) {
    unsigned IdxMode = IsIndirectSrc ?
      VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
    if (Offset == 0) {
      MachineInstr *SetOn =
        BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
        .addOperand(*Idx)
        .addImm(IdxMode);

      SetOn->getOperand(3).setIsUndef();
    } else {
      unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
        .addOperand(*Idx)
        .addImm(Offset);
      MachineInstr *SetOn =
        BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
        .addReg(Tmp, RegState::Kill)
        .addImm(IdxMode);

      SetOn->getOperand(3).setIsUndef();
    }

    return true;
  }

  if (Offset == 0) {
    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addOperand(*Idx);
  } else {
    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
      .addOperand(*Idx)
      .addImm(Offset);
  }

  return true;
}

// Control flow needs to be inserted if indexing with a VGPR.
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const SISubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned Dst = MI.getOperand(0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());

  unsigned SubReg;
  std::tie(SubReg, Offset)
    = computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);

  bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;

  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      // TODO: Look at the uses to avoid the copy. This may require
      // rescheduling to avoid interfering with other uses, so probably
      // requires a new optimization pass.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
        .addReg(SrcVec->getReg(), RegState::Undef, SubReg)
        .addReg(SrcVec->getReg(), RegState::Implicit)
        .addReg(AMDGPU::M0, RegState::Implicit);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
        .addReg(SrcVec->getReg(), RegState::Undef, SubReg)
        .addReg(SrcVec->getReg(), RegState::Implicit);
    }

    MI.eraseFromParent();

    return &MBB;
  }

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);

  if (UseGPRIdxMode) {
    MachineInstr *SetOn =
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
      .addImm(0) // Reset inside loop.
      .addImm(VGPRIndexMode::SRC0_ENABLE);
    SetOn->getOperand(3).setIsUndef();

    // Disable again after the loop.
    BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
  }

  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
                              UseGPRIdxMode);
  MachineBasicBlock *LoopBB = InsPt->getParent();

  if (UseGPRIdxMode) {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
      .addReg(SrcVec->getReg(), RegState::Undef, SubReg)
      .addReg(SrcVec->getReg(), RegState::Implicit)
      .addReg(AMDGPU::M0, RegState::Implicit);
  } else {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
      .addReg(SrcVec->getReg(), RegState::Undef, SubReg)
      .addReg(SrcVec->getReg(), RegState::Implicit);
  }

  return LoopBB;
}

static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const SISubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned Dst = MI.getOperand(0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());

  // This can be an immediate, but will be folded later.
  assert(Val->getReg());

  unsigned SubReg;
  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
                                                         SrcVec->getReg(),
                                                         Offset);
  bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;

  if (Idx->getReg() == AMDGPU::NoRegister) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    assert(Offset == 0);

    BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
      .addOperand(*SrcVec)
      .addOperand(*Val)
      .addImm(SubReg);

    MI.eraseFromParent();
    return &MBB;
  }

  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
        .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
        .addOperand(*Val)
        .addReg(Dst, RegState::ImplicitDefine)
        .addReg(SrcVec->getReg(), RegState::Implicit)
        .addReg(AMDGPU::M0, RegState::Implicit);

      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    } else {
      const MCInstrDesc &MovRelDesc = TII->get(AMDGPU::V_MOVRELD_B32_e32);

      MachineInstr *MovRel =
        BuildMI(MBB, I, DL, MovRelDesc)
        .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
        .addOperand(*Val)
        .addReg(Dst, RegState::ImplicitDefine)
        .addReg(SrcVec->getReg(), RegState::Implicit);

      const int ImpDefIdx = MovRelDesc.getNumOperands() +
                            MovRelDesc.getNumImplicitUses();
      const int ImpUseIdx = ImpDefIdx + 1;

      MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
    }

    MI.eraseFromParent();
    return &MBB;
  }

  if (Val->isReg())
    MRI.clearKillFlags(Val->getReg());

  const DebugLoc &DL = MI.getDebugLoc();

  if (UseGPRIdxMode) {
    MachineBasicBlock::iterator I(&MI);

    MachineInstr *SetOn =
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
      .addImm(0) // Reset inside loop.
      .addImm(VGPRIndexMode::DST_ENABLE);
    SetOn->getOperand(3).setIsUndef();

    // Disable again after the loop.
    BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
  }

  unsigned PhiReg = MRI.createVirtualRegister(VecRC);

  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
                              Offset, UseGPRIdxMode);
  MachineBasicBlock *LoopBB = InsPt->getParent();

  if (UseGPRIdxMode) {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
      .addReg(PhiReg, RegState::Undef, SubReg) // vdst
      .addOperand(*Val) // src0
      .addReg(Dst, RegState::ImplicitDefine)
      .addReg(PhiReg, RegState::Implicit)
      .addReg(AMDGPU::M0, RegState::Implicit);
  } else {
    const MCInstrDesc &MovRelDesc = TII->get(AMDGPU::V_MOVRELD_B32_e32);
    // vdst is not actually read and just provides the base register index.
    MachineInstr *MovRel =
      BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
      .addReg(PhiReg, RegState::Undef, SubReg) // vdst
      .addOperand(*Val)
      .addReg(Dst, RegState::ImplicitDefine)
      .addReg(PhiReg, RegState::Implicit);

    const int ImpDefIdx = MovRelDesc.getNumOperands() +
                          MovRelDesc.getNumImplicitUses();
    const int ImpUseIdx = ImpDefIdx + 1;

    MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
  }

  return LoopBB;
}

MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
  MachineInstr &MI, MachineBasicBlock *BB) const {
  switch (MI.getOpcode()) {
  case AMDGPU::SI_INIT_M0: {
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addOperand(MI.getOperand(0));
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::GET_GROUPSTATICSIZE: {
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    MachineFunction *MF = BB->getParent();
    SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    DebugLoc DL = MI.getDebugLoc();
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
      .addOperand(MI.getOperand(0))
      .addImm(MFI->getLDSSize());
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V16:
    return emitIndirectSrc(MI, *BB, *getSubtarget());
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V16:
    return emitIndirectDst(MI, *BB, *getSubtarget());
  case AMDGPU::SI_KILL:
    return splitKillBlock(MI, BB);
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    unsigned Dst = MI.getOperand(0).getReg();
    unsigned Src0 = MI.getOperand(1).getReg();
    unsigned Src1 = MI.getOperand(2).getReg();
    const DebugLoc &DL = MI.getDebugLoc();
    unsigned SrcCond = MI.getOperand(3).getReg();

    unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
.addReg(Src0, 0, AMDGPU::sub0) 1612 .addReg(Src1, 0, AMDGPU::sub0) 1613 .addReg(SrcCond); 1614 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi) 1615 .addReg(Src0, 0, AMDGPU::sub1) 1616 .addReg(Src1, 0, AMDGPU::sub1) 1617 .addReg(SrcCond); 1618 1619 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst) 1620 .addReg(DstLo) 1621 .addImm(AMDGPU::sub0) 1622 .addReg(DstHi) 1623 .addImm(AMDGPU::sub1); 1624 MI.eraseFromParent(); 1625 return BB; 1626 } 1627 default: 1628 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 1629 } 1630 } 1631 1632 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { 1633 // This currently forces unfolding various combinations of fsub into fma with 1634 // free fneg'd operands. As long as we have fast FMA (controlled by 1635 // isFMAFasterThanFMulAndFAdd), we should perform these. 1636 1637 // When fma is quarter rate, for f64 where add / sub are at best half rate, 1638 // most of these combines appear to be cycle neutral but save on instruction 1639 // count / code size. 1640 return true; 1641 } 1642 1643 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, 1644 EVT VT) const { 1645 if (!VT.isVector()) { 1646 return MVT::i1; 1647 } 1648 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); 1649 } 1650 1651 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const { 1652 return MVT::i32; 1653 } 1654 1655 // Answering this is somewhat tricky and depends on the specific device which 1656 // have different rates for fma or all f64 operations. 1657 // 1658 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other 1659 // regardless of which device (although the number of cycles differs between 1660 // devices), so it is always profitable for f64. 1661 // 1662 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable 1663 // only on full rate devices. Normally, we should prefer selecting v_mad_f32 1664 // which we can always do even without fused FP ops since it returns the same 1665 // result as the separate operations and since it is always full 1666 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 1667 // however does not support denormals, so we do report fma as faster if we have 1668 // a fast fma device and require denormals. 1669 // 1670 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 1671 VT = VT.getScalarType(); 1672 1673 if (!VT.isSimple()) 1674 return false; 1675 1676 switch (VT.getSimpleVT().SimpleTy) { 1677 case MVT::f32: 1678 // This is as fast on some subtargets. However, we always have full rate f32 1679 // mad available which returns the same result as the separate operations 1680 // which we should prefer over fma. We can't use this if we want to support 1681 // denormals, so only report this in these cases. 
1682 return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); 1683 case MVT::f64: 1684 return true; 1685 default: 1686 break; 1687 } 1688 1689 return false; 1690 } 1691 1692 //===----------------------------------------------------------------------===// 1693 // Custom DAG Lowering Operations 1694 //===----------------------------------------------------------------------===// 1695 1696 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 1697 switch (Op.getOpcode()) { 1698 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 1699 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 1700 case ISD::LOAD: { 1701 SDValue Result = LowerLOAD(Op, DAG); 1702 assert((!Result.getNode() || 1703 Result.getNode()->getNumValues() == 2) && 1704 "Load should return a value and a chain"); 1705 return Result; 1706 } 1707 1708 case ISD::FSIN: 1709 case ISD::FCOS: 1710 return LowerTrig(Op, DAG); 1711 case ISD::SELECT: return LowerSELECT(Op, DAG); 1712 case ISD::FDIV: return LowerFDIV(Op, DAG); 1713 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG); 1714 case ISD::STORE: return LowerSTORE(Op, DAG); 1715 case ISD::GlobalAddress: { 1716 MachineFunction &MF = DAG.getMachineFunction(); 1717 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1718 return LowerGlobalAddress(MFI, Op, DAG); 1719 } 1720 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 1721 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); 1722 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); 1723 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); 1724 case ISD::TRAP: return lowerTRAP(Op, DAG); 1725 } 1726 return SDValue(); 1727 } 1728 1729 /// \brief Helper function for LowerBRCOND 1730 static SDNode *findUser(SDValue Value, unsigned Opcode) { 1731 1732 SDNode *Parent = Value.getNode(); 1733 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); 1734 I != E; ++I) { 1735 1736 if (I.getUse().get() != Value) 1737 continue; 1738 1739 if (I->getOpcode() == Opcode) 1740 return *I; 1741 } 1742 return nullptr; 1743 } 1744 1745 bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { 1746 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) { 1747 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) { 1748 case AMDGPUIntrinsic::amdgcn_if: 1749 case AMDGPUIntrinsic::amdgcn_else: 1750 case AMDGPUIntrinsic::amdgcn_end_cf: 1751 case AMDGPUIntrinsic::amdgcn_loop: 1752 return true; 1753 default: 1754 return false; 1755 } 1756 } 1757 1758 if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) { 1759 switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) { 1760 case AMDGPUIntrinsic::amdgcn_break: 1761 case AMDGPUIntrinsic::amdgcn_if_break: 1762 case AMDGPUIntrinsic::amdgcn_else_break: 1763 return true; 1764 default: 1765 return false; 1766 } 1767 } 1768 1769 return false; 1770 } 1771 1772 void SITargetLowering::createDebuggerPrologueStackObjects( 1773 MachineFunction &MF) const { 1774 // Create stack objects that are used for emitting debugger prologue. 
1775 //
1776 // Debugger prologue writes work group IDs and work item IDs to scratch memory
1777 // at a fixed location in the following format:
1778 // offset 0: work group ID x
1779 // offset 4: work group ID y
1780 // offset 8: work group ID z
1781 // offset 16: work item ID x
1782 // offset 20: work item ID y
1783 // offset 24: work item ID z
1784 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1785 int ObjectIdx = 0;
1786
1787 // For each dimension:
1788 for (unsigned i = 0; i < 3; ++i) {
1789 // Create fixed stack object for work group ID.
1790 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
1791 Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
1792 // Create fixed stack object for work item ID.
1793 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
1794 Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
1795 }
1796 }
1797
1798 /// This transforms the control flow intrinsics to get the branch destination as
1799 /// the last parameter, and also switches the branch target with BR if the need arises.
1800 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
1801 SelectionDAG &DAG) const {
1802
1803 SDLoc DL(BRCOND);
1804
1805 SDNode *Intr = BRCOND.getOperand(1).getNode();
1806 SDValue Target = BRCOND.getOperand(2);
1807 SDNode *BR = nullptr;
1808 SDNode *SetCC = nullptr;
1809
1810 if (Intr->getOpcode() == ISD::SETCC) {
1811 // As long as we negate the condition everything is fine
1812 SetCC = Intr;
1813 Intr = SetCC->getOperand(0).getNode();
1814
1815 } else {
1816 // Get the target from BR if we don't negate the condition
1817 BR = findUser(BRCOND, ISD::BR);
1818 Target = BR->getOperand(1);
1819 }
1820
1821 // FIXME: This changes the types of the intrinsics instead of introducing new
1822 // nodes with the correct types.
1823 // e.g. llvm.amdgcn.loop
1824
1825 // e.g.: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
1826 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
1827
1828 if (!isCFIntrinsic(Intr)) {
1829 // This is a uniform branch so we don't need to legalize.
1830 return BRCOND;
1831 }
1832
1833 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
1834 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
1835
1836 assert(!SetCC ||
1837 (SetCC->getConstantOperandVal(1) == 1 &&
1838 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
1839 ISD::SETNE));
1840
1841 // operands of the new intrinsic call
1842 SmallVector<SDValue, 4> Ops;
1843 if (HaveChain)
1844 Ops.push_back(BRCOND.getOperand(0));
1845
1846 Ops.append(Intr->op_begin() + (HaveChain ? 1 : 0), Intr->op_end());
1847 Ops.push_back(Target);
1848
1849 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
1850
1851 // build the new intrinsic call
1852 SDNode *Result = DAG.getNode(
1853 Res.size() > 1 ?
ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, 1854 DAG.getVTList(Res), Ops).getNode(); 1855 1856 if (!HaveChain) { 1857 SDValue Ops[] = { 1858 SDValue(Result, 0), 1859 BRCOND.getOperand(0) 1860 }; 1861 1862 Result = DAG.getMergeValues(Ops, DL).getNode(); 1863 } 1864 1865 if (BR) { 1866 // Give the branch instruction our target 1867 SDValue Ops[] = { 1868 BR->getOperand(0), 1869 BRCOND.getOperand(2) 1870 }; 1871 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); 1872 DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); 1873 BR = NewBR.getNode(); 1874 } 1875 1876 SDValue Chain = SDValue(Result, Result->getNumValues() - 1); 1877 1878 // Copy the intrinsic results to registers 1879 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { 1880 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); 1881 if (!CopyToReg) 1882 continue; 1883 1884 Chain = DAG.getCopyToReg( 1885 Chain, DL, 1886 CopyToReg->getOperand(1), 1887 SDValue(Result, i - 1), 1888 SDValue()); 1889 1890 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); 1891 } 1892 1893 // Remove the old intrinsic from the chain 1894 DAG.ReplaceAllUsesOfValueWith( 1895 SDValue(Intr, Intr->getNumValues() - 1), 1896 Intr->getOperand(0)); 1897 1898 return Chain; 1899 } 1900 1901 SDValue SITargetLowering::getSegmentAperture(unsigned AS, 1902 SelectionDAG &DAG) const { 1903 SDLoc SL; 1904 MachineFunction &MF = DAG.getMachineFunction(); 1905 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1906 unsigned UserSGPR = Info->getQueuePtrUserSGPR(); 1907 assert(UserSGPR != AMDGPU::NoRegister); 1908 1909 SDValue QueuePtr = CreateLiveInRegister( 1910 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); 1911 1912 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1913 // private_segment_aperture_base_hi. 1914 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1915 1916 SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr, 1917 DAG.getConstant(StructOffset, SL, MVT::i64)); 1918 1919 // TODO: Use custom target PseudoSourceValue. 1920 // TODO: We should use the value from the IR intrinsic call, but it might not 1921 // be available and how do we get it? 1922 Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()), 1923 AMDGPUAS::CONSTANT_ADDRESS)); 1924 1925 MachinePointerInfo PtrInfo(V, StructOffset); 1926 return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo, 1927 MinAlign(64, StructOffset), 1928 MachineMemOperand::MODereferenceable | 1929 MachineMemOperand::MOInvariant); 1930 } 1931 1932 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, 1933 SelectionDAG &DAG) const { 1934 SDLoc SL(Op); 1935 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op); 1936 1937 SDValue Src = ASC->getOperand(0); 1938 1939 // FIXME: Really support non-0 null pointers. 
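// Note: the segment (local/private) null pointer is represented as all-ones
// in 32 bits, while the flat null pointer is zero, so the selects below
// translate a null value in one representation into the other explicitly.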
1940 SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32); 1941 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); 1942 1943 // flat -> local/private 1944 if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { 1945 if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 1946 ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { 1947 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); 1948 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); 1949 1950 return DAG.getNode(ISD::SELECT, SL, MVT::i32, 1951 NonNull, Ptr, SegmentNullPtr); 1952 } 1953 } 1954 1955 // local/private -> flat 1956 if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { 1957 if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 1958 ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { 1959 SDValue NonNull 1960 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); 1961 1962 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG); 1963 SDValue CvtPtr 1964 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); 1965 1966 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, 1967 DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr), 1968 FlatNullPtr); 1969 } 1970 } 1971 1972 // global <-> flat are no-ops and never emitted. 1973 1974 const MachineFunction &MF = DAG.getMachineFunction(); 1975 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 1976 *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); 1977 DAG.getContext()->diagnose(InvalidAddrSpaceCast); 1978 1979 return DAG.getUNDEF(ASC->getValueType(0)); 1980 } 1981 1982 static bool shouldEmitFixup(const GlobalValue *GV, 1983 const TargetMachine &TM) { 1984 // FIXME: We need to emit global variables in constant address space in a 1985 // separate section, and use relocations. 1986 return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; 1987 } 1988 1989 static bool shouldEmitGOTReloc(const GlobalValue *GV, 1990 const TargetMachine &TM) { 1991 return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && 1992 !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); 1993 } 1994 1995 static bool shouldEmitPCReloc(const GlobalValue *GV, 1996 const TargetMachine &TM) { 1997 return !shouldEmitFixup(GV, TM) && !shouldEmitGOTReloc(GV, TM); 1998 } 1999 2000 bool 2001 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 2002 // We can fold offsets for anything that doesn't require a GOT relocation. 2003 return GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && 2004 !shouldEmitGOTReloc(GA->getGlobal(), getTargetMachine()); 2005 } 2006 2007 static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, 2008 SDLoc DL, unsigned Offset, EVT PtrVT, 2009 unsigned GAFlags = SIInstrInfo::MO_NONE) { 2010 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is 2011 // lowered to the following code sequence: 2012 // 2013 // For constant address space: 2014 // s_getpc_b64 s[0:1] 2015 // s_add_u32 s0, s0, $symbol 2016 // s_addc_u32 s1, s1, 0 2017 // 2018 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2019 // a fixup or relocation is emitted to replace $symbol with a literal 2020 // constant, which is a pc-relative offset from the encoding of the $symbol 2021 // operand to the global variable. 
2022 //
2023 // For global address space:
2024 // s_getpc_b64 s[0:1]
2025 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2026 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2027 //
2028 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2029 // fixups or relocations are emitted to replace $symbol@*@lo and
2030 // $symbol@*@hi with the lower 32 bits and higher 32 bits of a literal constant,
2031 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2032 // operand to the global variable.
2033 //
2034 // What we want here is an offset from the value returned by s_getpc
2035 // (which is the address of the s_add_u32 instruction) to the global
2036 // variable, but since the encoding of $symbol starts 4 bytes after the start
2037 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2038 // small. This requires us to add 4 to the global variable offset in order to
2039 // compute the correct address.
2040 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
2041 GAFlags);
2042 SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
2043 GAFlags == SIInstrInfo::MO_NONE ?
2044 GAFlags : GAFlags + 1);
2045 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
2046 }
2047
2048 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
2049 SDValue Op,
2050 SelectionDAG &DAG) const {
2051 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
2052
2053 if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
2054 GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
2055 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
2056
2057 SDLoc DL(GSD);
2058 const GlobalValue *GV = GSD->getGlobal();
2059 EVT PtrVT = Op.getValueType();
2060
2061 if (shouldEmitFixup(GV, getTargetMachine()))
2062 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
2063 else if (shouldEmitPCReloc(GV, getTargetMachine()))
2064 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
2065 SIInstrInfo::MO_REL32);
2066
2067 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
2068 SIInstrInfo::MO_GOTPCREL32);
2069
2070 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
2071 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
2072 const DataLayout &DataLayout = DAG.getDataLayout();
2073 unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
2074 // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
2075 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
2076
2077 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
2078 MachineMemOperand::MODereferenceable |
2079 MachineMemOperand::MOInvariant);
2080 }
2081
2082 SDValue SITargetLowering::lowerTRAP(SDValue Op,
2083 SelectionDAG &DAG) const {
2084 const MachineFunction &MF = DAG.getMachineFunction();
2085 DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
2086 "trap handler not supported",
2087 Op.getDebugLoc(),
2088 DS_Warning);
2089 DAG.getContext()->diagnose(NoTrap);
2090
2091 // Emit s_endpgm.
2092
2093 // FIXME: This should really be selected to s_trap, but that requires
2094 // setting up the trap handler for it to do anything.
2095 return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other, 2096 Op.getOperand(0)); 2097 } 2098 2099 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, 2100 const SDLoc &DL, SDValue V) const { 2101 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as 2102 // the destination register. 2103 // 2104 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, 2105 // so we will end up with redundant moves to m0. 2106 // 2107 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result. 2108 2109 // A Null SDValue creates a glue result. 2110 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue, 2111 V, Chain); 2112 return SDValue(M0, 0); 2113 } 2114 2115 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, 2116 SDValue Op, 2117 MVT VT, 2118 unsigned Offset) const { 2119 SDLoc SL(Op); 2120 SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, 2121 DAG.getEntryNode(), Offset, false); 2122 // The local size values will have the hi 16-bits as zero. 2123 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, 2124 DAG.getValueType(VT)); 2125 } 2126 2127 static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { 2128 DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), 2129 "non-hsa intrinsic with hsa target", 2130 DL.getDebugLoc()); 2131 DAG.getContext()->diagnose(BadIntrin); 2132 return DAG.getUNDEF(VT); 2133 } 2134 2135 static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { 2136 DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), 2137 "intrinsic not supported on subtarget", 2138 DL.getDebugLoc()); 2139 DAG.getContext()->diagnose(BadIntrin); 2140 return DAG.getUNDEF(VT); 2141 } 2142 2143 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 2144 SelectionDAG &DAG) const { 2145 MachineFunction &MF = DAG.getMachineFunction(); 2146 auto MFI = MF.getInfo<SIMachineFunctionInfo>(); 2147 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 2148 2149 EVT VT = Op.getValueType(); 2150 SDLoc DL(Op); 2151 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 2152 2153 // TODO: Should this propagate fast-math-flags? 2154 2155 switch (IntrinsicID) { 2156 case Intrinsic::amdgcn_dispatch_ptr: 2157 case Intrinsic::amdgcn_queue_ptr: { 2158 if (!Subtarget->isAmdCodeObjectV2()) { 2159 DiagnosticInfoUnsupported BadIntrin( 2160 *MF.getFunction(), "unsupported hsa intrinsic without hsa target", 2161 DL.getDebugLoc()); 2162 DAG.getContext()->diagnose(BadIntrin); 2163 return DAG.getUNDEF(VT); 2164 } 2165 2166 auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? 
2167 SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR; 2168 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, 2169 TRI->getPreloadedValue(MF, Reg), VT); 2170 } 2171 case Intrinsic::amdgcn_implicitarg_ptr: { 2172 unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); 2173 return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset); 2174 } 2175 case Intrinsic::amdgcn_kernarg_segment_ptr: { 2176 unsigned Reg 2177 = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); 2178 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); 2179 } 2180 case Intrinsic::amdgcn_dispatch_id: { 2181 unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID); 2182 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); 2183 } 2184 case Intrinsic::amdgcn_rcp: 2185 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); 2186 case Intrinsic::amdgcn_rsq: 2187 case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name 2188 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 2189 case Intrinsic::amdgcn_rsq_legacy: { 2190 if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 2191 return emitRemovedIntrinsicError(DAG, DL, VT); 2192 2193 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); 2194 } 2195 case Intrinsic::amdgcn_rcp_legacy: { 2196 if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 2197 return emitRemovedIntrinsicError(DAG, DL, VT); 2198 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1)); 2199 } 2200 case Intrinsic::amdgcn_rsq_clamp: { 2201 if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) 2202 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); 2203 2204 Type *Type = VT.getTypeForEVT(*DAG.getContext()); 2205 APFloat Max = APFloat::getLargest(Type->getFltSemantics()); 2206 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); 2207 2208 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 2209 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, 2210 DAG.getConstantFP(Max, DL, VT)); 2211 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, 2212 DAG.getConstantFP(Min, DL, VT)); 2213 } 2214 case Intrinsic::r600_read_ngroups_x: 2215 if (Subtarget->isAmdHsaOS()) 2216 return emitNonHSAIntrinsicError(DAG, DL, VT); 2217 2218 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2219 SI::KernelInputOffsets::NGROUPS_X, false); 2220 case Intrinsic::r600_read_ngroups_y: 2221 if (Subtarget->isAmdHsaOS()) 2222 return emitNonHSAIntrinsicError(DAG, DL, VT); 2223 2224 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2225 SI::KernelInputOffsets::NGROUPS_Y, false); 2226 case Intrinsic::r600_read_ngroups_z: 2227 if (Subtarget->isAmdHsaOS()) 2228 return emitNonHSAIntrinsicError(DAG, DL, VT); 2229 2230 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2231 SI::KernelInputOffsets::NGROUPS_Z, false); 2232 case Intrinsic::r600_read_global_size_x: 2233 if (Subtarget->isAmdHsaOS()) 2234 return emitNonHSAIntrinsicError(DAG, DL, VT); 2235 2236 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2237 SI::KernelInputOffsets::GLOBAL_SIZE_X, false); 2238 case Intrinsic::r600_read_global_size_y: 2239 if (Subtarget->isAmdHsaOS()) 2240 return emitNonHSAIntrinsicError(DAG, DL, VT); 2241 2242 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2243 SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); 2244 case Intrinsic::r600_read_global_size_z: 2245 if (Subtarget->isAmdHsaOS()) 2246 return emitNonHSAIntrinsicError(DAG, DL, 
VT); 2247 2248 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2249 SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); 2250 case Intrinsic::r600_read_local_size_x: 2251 if (Subtarget->isAmdHsaOS()) 2252 return emitNonHSAIntrinsicError(DAG, DL, VT); 2253 2254 return lowerImplicitZextParam(DAG, Op, MVT::i16, 2255 SI::KernelInputOffsets::LOCAL_SIZE_X); 2256 case Intrinsic::r600_read_local_size_y: 2257 if (Subtarget->isAmdHsaOS()) 2258 return emitNonHSAIntrinsicError(DAG, DL, VT); 2259 2260 return lowerImplicitZextParam(DAG, Op, MVT::i16, 2261 SI::KernelInputOffsets::LOCAL_SIZE_Y); 2262 case Intrinsic::r600_read_local_size_z: 2263 if (Subtarget->isAmdHsaOS()) 2264 return emitNonHSAIntrinsicError(DAG, DL, VT); 2265 2266 return lowerImplicitZextParam(DAG, Op, MVT::i16, 2267 SI::KernelInputOffsets::LOCAL_SIZE_Z); 2268 case Intrinsic::amdgcn_workgroup_id_x: 2269 case Intrinsic::r600_read_tgid_x: 2270 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 2271 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); 2272 case Intrinsic::amdgcn_workgroup_id_y: 2273 case Intrinsic::r600_read_tgid_y: 2274 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 2275 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); 2276 case Intrinsic::amdgcn_workgroup_id_z: 2277 case Intrinsic::r600_read_tgid_z: 2278 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 2279 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); 2280 case Intrinsic::amdgcn_workitem_id_x: 2281 case Intrinsic::r600_read_tidig_x: 2282 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 2283 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); 2284 case Intrinsic::amdgcn_workitem_id_y: 2285 case Intrinsic::r600_read_tidig_y: 2286 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 2287 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); 2288 case Intrinsic::amdgcn_workitem_id_z: 2289 case Intrinsic::r600_read_tidig_z: 2290 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 2291 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); 2292 case AMDGPUIntrinsic::SI_load_const: { 2293 SDValue Ops[] = { 2294 Op.getOperand(1), 2295 Op.getOperand(2) 2296 }; 2297 2298 MachineMemOperand *MMO = MF.getMachineMemOperand( 2299 MachinePointerInfo(), 2300 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2301 MachineMemOperand::MOInvariant, 2302 VT.getStoreSize(), 4); 2303 return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, 2304 Op->getVTList(), Ops, VT, MMO); 2305 } 2306 case AMDGPUIntrinsic::amdgcn_fdiv_fast: { 2307 return lowerFDIV_FAST(Op, DAG); 2308 } 2309 case AMDGPUIntrinsic::SI_vs_load_input: 2310 return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, 2311 Op.getOperand(1), 2312 Op.getOperand(2), 2313 Op.getOperand(3)); 2314 2315 case AMDGPUIntrinsic::SI_fs_constant: { 2316 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); 2317 SDValue Glue = M0.getValue(1); 2318 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, 2319 DAG.getConstant(2, DL, MVT::i32), // P0 2320 Op.getOperand(1), Op.getOperand(2), Glue); 2321 } 2322 case AMDGPUIntrinsic::SI_packf16: 2323 if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) 2324 return DAG.getUNDEF(MVT::i32); 2325 return Op; 2326 case AMDGPUIntrinsic::SI_fs_interp: { 2327 SDValue IJ = Op.getOperand(4); 2328 SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, 2329 DAG.getConstant(0, DL, MVT::i32)); 2330 SDValue J = 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, 2331 DAG.getConstant(1, DL, MVT::i32)); 2332 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); 2333 SDValue Glue = M0.getValue(1); 2334 SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, 2335 DAG.getVTList(MVT::f32, MVT::Glue), 2336 I, Op.getOperand(1), Op.getOperand(2), Glue); 2337 Glue = SDValue(P1.getNode(), 1); 2338 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, 2339 Op.getOperand(1), Op.getOperand(2), Glue); 2340 } 2341 case Intrinsic::amdgcn_interp_p1: { 2342 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); 2343 SDValue Glue = M0.getValue(1); 2344 return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), 2345 Op.getOperand(2), Op.getOperand(3), Glue); 2346 } 2347 case Intrinsic::amdgcn_interp_p2: { 2348 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); 2349 SDValue Glue = SDValue(M0.getNode(), 1); 2350 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), 2351 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), 2352 Glue); 2353 } 2354 case Intrinsic::amdgcn_sin: 2355 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); 2356 2357 case Intrinsic::amdgcn_cos: 2358 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); 2359 2360 case Intrinsic::amdgcn_log_clamp: { 2361 if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) 2362 return SDValue(); 2363 2364 DiagnosticInfoUnsupported BadIntrin( 2365 *MF.getFunction(), "intrinsic not supported on subtarget", 2366 DL.getDebugLoc()); 2367 DAG.getContext()->diagnose(BadIntrin); 2368 return DAG.getUNDEF(VT); 2369 } 2370 case Intrinsic::amdgcn_ldexp: 2371 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, 2372 Op.getOperand(1), Op.getOperand(2)); 2373 2374 case Intrinsic::amdgcn_fract: 2375 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); 2376 2377 case Intrinsic::amdgcn_class: 2378 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, 2379 Op.getOperand(1), Op.getOperand(2)); 2380 case Intrinsic::amdgcn_div_fmas: 2381 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, 2382 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), 2383 Op.getOperand(4)); 2384 2385 case Intrinsic::amdgcn_div_fixup: 2386 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, 2387 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 2388 2389 case Intrinsic::amdgcn_trig_preop: 2390 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, 2391 Op.getOperand(1), Op.getOperand(2)); 2392 case Intrinsic::amdgcn_div_scale: { 2393 // 3rd parameter required to be a constant. 2394 const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 2395 if (!Param) 2396 return DAG.getUNDEF(VT); 2397 2398 // Translate to the operands expected by the machine instruction. The 2399 // first parameter must be the same as the first instruction. 2400 SDValue Numerator = Op.getOperand(1); 2401 SDValue Denominator = Op.getOperand(2); 2402 2403 // Note this order is opposite of the machine instruction's operations, 2404 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The 2405 // intrinsic has the numerator as the first operand to match a normal 2406 // division operation. 2407 2408 SDValue Src0 = Param->isAllOnesValue() ? 
Numerator : Denominator; 2409 2410 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, 2411 Denominator, Numerator); 2412 } 2413 case Intrinsic::amdgcn_icmp: { 2414 const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 2415 int CondCode = CD->getSExtValue(); 2416 2417 if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE || 2418 CondCode >= ICmpInst::Predicate::BAD_ICMP_PREDICATE) 2419 return DAG.getUNDEF(VT); 2420 2421 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); 2422 ISD::CondCode CCOpcode = getICmpCondCode(IcInput); 2423 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1), 2424 Op.getOperand(2), DAG.getCondCode(CCOpcode)); 2425 } 2426 case Intrinsic::amdgcn_fcmp: { 2427 const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 2428 int CondCode = CD->getSExtValue(); 2429 2430 if (CondCode <= FCmpInst::Predicate::FCMP_FALSE || 2431 CondCode >= FCmpInst::Predicate::FCMP_TRUE) 2432 return DAG.getUNDEF(VT); 2433 2434 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode); 2435 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput); 2436 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1), 2437 Op.getOperand(2), DAG.getCondCode(CCOpcode)); 2438 } 2439 case Intrinsic::amdgcn_fmul_legacy: 2440 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, 2441 Op.getOperand(1), Op.getOperand(2)); 2442 case Intrinsic::amdgcn_sffbh: 2443 case AMDGPUIntrinsic::AMDGPU_flbit_i32: // Legacy name. 2444 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1)); 2445 default: 2446 return AMDGPUTargetLowering::LowerOperation(Op, DAG); 2447 } 2448 } 2449 2450 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, 2451 SelectionDAG &DAG) const { 2452 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 2453 switch (IntrID) { 2454 case Intrinsic::amdgcn_atomic_inc: 2455 case Intrinsic::amdgcn_atomic_dec: { 2456 MemSDNode *M = cast<MemSDNode>(Op); 2457 unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ? 
2458 AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC; 2459 SDValue Ops[] = { 2460 M->getOperand(0), // Chain 2461 M->getOperand(2), // Ptr 2462 M->getOperand(3) // Value 2463 }; 2464 2465 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops, 2466 M->getMemoryVT(), M->getMemOperand()); 2467 } 2468 default: 2469 return SDValue(); 2470 } 2471 } 2472 2473 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, 2474 SelectionDAG &DAG) const { 2475 MachineFunction &MF = DAG.getMachineFunction(); 2476 SDLoc DL(Op); 2477 SDValue Chain = Op.getOperand(0); 2478 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 2479 2480 switch (IntrinsicID) { 2481 case AMDGPUIntrinsic::SI_sendmsg: { 2482 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); 2483 SDValue Glue = Chain.getValue(1); 2484 return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, 2485 Op.getOperand(2), Glue); 2486 } 2487 case AMDGPUIntrinsic::SI_tbuffer_store: { 2488 SDValue Ops[] = { 2489 Chain, 2490 Op.getOperand(2), 2491 Op.getOperand(3), 2492 Op.getOperand(4), 2493 Op.getOperand(5), 2494 Op.getOperand(6), 2495 Op.getOperand(7), 2496 Op.getOperand(8), 2497 Op.getOperand(9), 2498 Op.getOperand(10), 2499 Op.getOperand(11), 2500 Op.getOperand(12), 2501 Op.getOperand(13), 2502 Op.getOperand(14) 2503 }; 2504 2505 EVT VT = Op.getOperand(3).getValueType(); 2506 2507 MachineMemOperand *MMO = MF.getMachineMemOperand( 2508 MachinePointerInfo(), 2509 MachineMemOperand::MOStore, 2510 VT.getStoreSize(), 4); 2511 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, 2512 Op->getVTList(), Ops, VT, MMO); 2513 } 2514 case AMDGPUIntrinsic::AMDGPU_kill: { 2515 SDValue Src = Op.getOperand(2); 2516 if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) { 2517 if (!K->isNegative()) 2518 return Chain; 2519 2520 SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32); 2521 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne); 2522 } 2523 2524 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src); 2525 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast); 2526 } 2527 default: 2528 return SDValue(); 2529 } 2530 } 2531 2532 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 2533 SDLoc DL(Op); 2534 LoadSDNode *Load = cast<LoadSDNode>(Op); 2535 ISD::LoadExtType ExtType = Load->getExtensionType(); 2536 EVT MemVT = Load->getMemoryVT(); 2537 2538 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { 2539 assert(MemVT == MVT::i1 && "Only i1 non-extloads expected"); 2540 // FIXME: Copied from PPC 2541 // First, load into 32 bits, then truncate to 1 bit. 
2542
2543 SDValue Chain = Load->getChain();
2544 SDValue BasePtr = Load->getBasePtr();
2545 MachineMemOperand *MMO = Load->getMemOperand();
2546
2547 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
2548 BasePtr, MVT::i8, MMO);
2549
2550 SDValue Ops[] = {
2551 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
2552 NewLD.getValue(1)
2553 };
2554
2555 return DAG.getMergeValues(Ops, DL);
2556 }
2557
2558 if (!MemVT.isVector())
2559 return SDValue();
2560
2561 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
2562 "Custom lowering for non-i32 vectors hasn't been implemented.");
2563
2564 unsigned AS = Load->getAddressSpace();
2565 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
2566 AS, Load->getAlignment())) {
2567 SDValue Ops[2];
2568 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2569 return DAG.getMergeValues(Ops, DL);
2570 }
2571
2572 unsigned NumElements = MemVT.getVectorNumElements();
2573 switch (AS) {
2574 case AMDGPUAS::CONSTANT_ADDRESS:
2575 if (isMemOpUniform(Load))
2576 return SDValue();
2577 // Non-uniform loads will be selected to MUBUF instructions, so they
2578 // have the same legalization requirements as global and private
2579 // loads.
2580 //
2581 LLVM_FALLTHROUGH;
2582 case AMDGPUAS::GLOBAL_ADDRESS:
2583 case AMDGPUAS::FLAT_ADDRESS:
2584 if (NumElements > 4)
2585 return SplitVectorLoad(Op, DAG);
2586 // v4 loads are supported for private and global memory.
2587 return SDValue();
2588 case AMDGPUAS::PRIVATE_ADDRESS: {
2589 // Depending on the setting of the private_element_size field in the
2590 // resource descriptor, we can only make private accesses up to a certain
2591 // size.
2592 switch (Subtarget->getMaxPrivateElementSize()) {
2593 case 4:
2594 return scalarizeVectorLoad(Load, DAG);
2595 case 8:
2596 if (NumElements > 2)
2597 return SplitVectorLoad(Op, DAG);
2598 return SDValue();
2599 case 16:
2600 // Same as global/flat
2601 if (NumElements > 4)
2602 return SplitVectorLoad(Op, DAG);
2603 return SDValue();
2604 default:
2605 llvm_unreachable("unsupported private_element_size");
2606 }
2607 }
2608 case AMDGPUAS::LOCAL_ADDRESS: {
2609 if (NumElements > 2)
2610 return SplitVectorLoad(Op, DAG);
2611
2612 if (NumElements == 2)
2613 return SDValue();
2614
2615 // If properly aligned, splitting might let us use ds_read_b64.
2616 return SplitVectorLoad(Op, DAG); 2617 } 2618 default: 2619 return SDValue(); 2620 } 2621 } 2622 2623 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 2624 if (Op.getValueType() != MVT::i64) 2625 return SDValue(); 2626 2627 SDLoc DL(Op); 2628 SDValue Cond = Op.getOperand(0); 2629 2630 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 2631 SDValue One = DAG.getConstant(1, DL, MVT::i32); 2632 2633 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); 2634 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); 2635 2636 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); 2637 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); 2638 2639 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); 2640 2641 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); 2642 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); 2643 2644 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); 2645 2646 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); 2647 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); 2648 } 2649 2650 // Catch division cases where we can use shortcuts with rcp and rsq 2651 // instructions. 2652 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, 2653 SelectionDAG &DAG) const { 2654 SDLoc SL(Op); 2655 SDValue LHS = Op.getOperand(0); 2656 SDValue RHS = Op.getOperand(1); 2657 EVT VT = Op.getValueType(); 2658 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; 2659 2660 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { 2661 if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()))) { 2662 2663 if (CLHS->isExactlyValue(1.0)) { 2664 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 2665 // the CI documentation has a worst case error of 1 ulp. 2666 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 2667 // use it as long as we aren't trying to use denormals. 2668 2669 // 1.0 / sqrt(x) -> rsq(x) 2670 // 2671 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP 2672 // error seems really high at 2^29 ULP. 2673 if (RHS.getOpcode() == ISD::FSQRT) 2674 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); 2675 2676 // 1.0 / x -> rcp(x) 2677 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 2678 } 2679 2680 // Same as for 1.0, but expand the sign out of the constant. 2681 if (CLHS->isExactlyValue(-1.0)) { 2682 // -1.0 / x -> rcp (fneg x) 2683 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 2684 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS); 2685 } 2686 } 2687 } 2688 2689 const SDNodeFlags *Flags = Op->getFlags(); 2690 2691 if (Unsafe || Flags->hasAllowReciprocal()) { 2692 // Turn into multiply by the reciprocal. 2693 // x / y -> x * (1.0 / y) 2694 SDNodeFlags Flags; 2695 Flags.setUnsafeAlgebra(true); 2696 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 2697 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags); 2698 } 2699 2700 return SDValue(); 2701 } 2702 2703 // Faster 2.5 ULP division that does not support denormals. 
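// The sequence below guards rcp against flushing to zero on huge denominators:
// when |RHS| exceeds 0x6f800000 (2^+96), RHS is pre-scaled by 0x2f800000
// (2^-32) before the reciprocal so the rcp result stays out of the denormal
// range, and the quotient is multiplied by the same 2^-32 factor afterwards
// to compensate.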
2704 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { 2705 SDLoc SL(Op); 2706 SDValue LHS = Op.getOperand(1); 2707 SDValue RHS = Op.getOperand(2); 2708 2709 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); 2710 2711 const APFloat K0Val(BitsToFloat(0x6f800000)); 2712 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); 2713 2714 const APFloat K1Val(BitsToFloat(0x2f800000)); 2715 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); 2716 2717 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 2718 2719 EVT SetCCVT = 2720 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); 2721 2722 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); 2723 2724 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); 2725 2726 // TODO: Should this propagate fast-math-flags? 2727 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); 2728 2729 // rcp does not support denormals. 2730 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); 2731 2732 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); 2733 2734 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); 2735 } 2736 2737 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { 2738 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) 2739 return FastLowered; 2740 2741 SDLoc SL(Op); 2742 SDValue LHS = Op.getOperand(0); 2743 SDValue RHS = Op.getOperand(1); 2744 2745 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 2746 2747 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); 2748 2749 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS); 2750 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS); 2751 2752 // Denominator is scaled to not be denormal, so using rcp is ok. 
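// The code below is an FMA-based Newton-Raphson style refinement on the
// scaled operands (d = scaled denominator, n = scaled numerator):
//   e  = 1 - d * rcp(d)          (Fma0)
//   r  = rcp(d) + e * rcp(d)     (Fma1, refined reciprocal)
//   q  = n * r                   (Mul)
//   q' = q + (n - d * q) * r     (Fma2 / Fma3, one quotient correction)
// The final remainder (Fma4) is handed to div_fmas / div_fixup, which apply
// the div_scale result and handle the special cases.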
2753 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled);
2754
2755 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);
2756
2757 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One);
2758 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp);
2759
2760 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1);
2761
2762 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled);
2763 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul);
2764 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled);
2765
2766 SDValue Scale = NumeratorScaled.getValue(1);
2767 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale);
2768
2769 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
2770 }
2771
2772 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
2773 if (DAG.getTarget().Options.UnsafeFPMath)
2774 return lowerFastUnsafeFDIV(Op, DAG);
2775
2776 SDLoc SL(Op);
2777 SDValue X = Op.getOperand(0);
2778 SDValue Y = Op.getOperand(1);
2779
2780 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2781
2782 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
2783
2784 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
2785
2786 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
2787
2788 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
2789
2790 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
2791
2792 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
2793
2794 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
2795
2796 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
2797
2798 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
2799 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
2800
2801 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
2802 NegDivScale0, Mul, DivScale1);
2803
2804 SDValue Scale;
2805
2806 if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
2807 // Work around a hardware bug on SI where the condition output from div_scale
2808 // is not usable.
2809
2810 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
2811
2812 // Figure out which scale to use for div_fmas.
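// Since the div_scale condition bit cannot be trusted here, recover it by
// checking whether the high 32 bits of each scaled value still match the
// corresponding original operand, and combine the two tests with xor.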
2813 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); 2814 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); 2815 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); 2816 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); 2817 2818 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); 2819 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); 2820 2821 SDValue Scale0Hi 2822 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); 2823 SDValue Scale1Hi 2824 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); 2825 2826 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); 2827 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); 2828 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); 2829 } else { 2830 Scale = DivScale1.getValue(1); 2831 } 2832 2833 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, 2834 Fma4, Fma3, Mul, Scale); 2835 2836 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); 2837 } 2838 2839 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { 2840 EVT VT = Op.getValueType(); 2841 2842 if (VT == MVT::f32) 2843 return LowerFDIV32(Op, DAG); 2844 2845 if (VT == MVT::f64) 2846 return LowerFDIV64(Op, DAG); 2847 2848 llvm_unreachable("Unexpected type for fdiv"); 2849 } 2850 2851 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 2852 SDLoc DL(Op); 2853 StoreSDNode *Store = cast<StoreSDNode>(Op); 2854 EVT VT = Store->getMemoryVT(); 2855 2856 if (VT == MVT::i1) { 2857 return DAG.getTruncStore(Store->getChain(), DL, 2858 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), 2859 Store->getBasePtr(), MVT::i1, Store->getMemOperand()); 2860 } 2861 2862 assert(VT.isVector() && 2863 Store->getValue().getValueType().getScalarType() == MVT::i32); 2864 2865 unsigned AS = Store->getAddressSpace(); 2866 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, 2867 AS, Store->getAlignment())) { 2868 return expandUnalignedStore(Store, DAG); 2869 } 2870 2871 unsigned NumElements = VT.getVectorNumElements(); 2872 switch (AS) { 2873 case AMDGPUAS::GLOBAL_ADDRESS: 2874 case AMDGPUAS::FLAT_ADDRESS: 2875 if (NumElements > 4) 2876 return SplitVectorStore(Op, DAG); 2877 return SDValue(); 2878 case AMDGPUAS::PRIVATE_ADDRESS: { 2879 switch (Subtarget->getMaxPrivateElementSize()) { 2880 case 4: 2881 return scalarizeVectorStore(Store, DAG); 2882 case 8: 2883 if (NumElements > 2) 2884 return SplitVectorStore(Op, DAG); 2885 return SDValue(); 2886 case 16: 2887 if (NumElements > 4) 2888 return SplitVectorStore(Op, DAG); 2889 return SDValue(); 2890 default: 2891 llvm_unreachable("unsupported private_element_size"); 2892 } 2893 } 2894 case AMDGPUAS::LOCAL_ADDRESS: { 2895 if (NumElements > 2) 2896 return SplitVectorStore(Op, DAG); 2897 2898 if (NumElements == 2) 2899 return Op; 2900 2901 // If properly aligned, if we split we might be able to use ds_write_b64. 2902 return SplitVectorStore(Op, DAG); 2903 } 2904 default: 2905 llvm_unreachable("unhandled address space"); 2906 } 2907 } 2908 2909 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 2910 SDLoc DL(Op); 2911 EVT VT = Op.getValueType(); 2912 SDValue Arg = Op.getOperand(0); 2913 // TODO: Should this propagate fast-math-flags? 
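// SIN_HW / COS_HW operate on an argument already divided by 2*pi (one full
// period maps onto [0, 1)), so scale by 0.5/PI and take the fractional part
// as the range reduction before emitting the hardware opcode.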
2914 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
2915 DAG.getNode(ISD::FMUL, DL, VT, Arg,
2916 DAG.getConstantFP(0.5/M_PI, DL,
2917 VT)));
2918
2919 switch (Op.getOpcode()) {
2920 case ISD::FCOS:
2921 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
2922 case ISD::FSIN:
2923 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
2924 default:
2925 llvm_unreachable("Wrong trig opcode");
2926 }
2927 }
2928
2929 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
2930 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
2931 assert(AtomicNode->isCompareAndSwap());
2932 unsigned AS = AtomicNode->getAddressSpace();
2933
2934 // No custom lowering required for local address space
2935 if (!isFlatGlobalAddrSpace(AS))
2936 return Op;
2937
2938 // Non-local address spaces require custom lowering for atomic compare and
2939 // swap; the cmp and swap values go in a v2i32, or a v2i64 for the _X2 variants.
2940 SDLoc DL(Op);
2941 SDValue ChainIn = Op.getOperand(0);
2942 SDValue Addr = Op.getOperand(1);
2943 SDValue Old = Op.getOperand(2);
2944 SDValue New = Op.getOperand(3);
2945 EVT VT = Op.getValueType();
2946 MVT SimpleVT = VT.getSimpleVT();
2947 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
2948
2949 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
2950 SDValue Ops[] = { ChainIn, Addr, NewOld };
2951
2952 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
2953 Ops, VT, AtomicNode->getMemOperand());
2954 }
2955
2956 //===----------------------------------------------------------------------===//
2957 // Custom DAG optimizations
2958 //===----------------------------------------------------------------------===//
2959
2960 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
2961 DAGCombinerInfo &DCI) const {
2962 EVT VT = N->getValueType(0);
2963 EVT ScalarVT = VT.getScalarType();
2964 if (ScalarVT != MVT::f32)
2965 return SDValue();
2966
2967 SelectionDAG &DAG = DCI.DAG;
2968 SDLoc DL(N);
2969
2970 SDValue Src = N->getOperand(0);
2971 EVT SrcVT = Src.getValueType();
2972
2973 // TODO: We could try to match extracting the higher bytes, which would be
2974 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
2975 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
2976 // about in practice.
2977 if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
2978 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
2979 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
2980 DCI.AddToWorklist(Cvt.getNode());
2981 return Cvt;
2982 }
2983 }
2984
2985 return SDValue();
2986 }
2987
2988 /// \brief Return true if the given offset Size in bytes can be folded into
2989 /// the immediate offsets of a memory instruction for the given address space.
2990 static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
2991 const SISubtarget &STI) {
2992 switch (AS) {
2993 case AMDGPUAS::GLOBAL_ADDRESS: {
2994 // MUBUF instructions have a 12-bit offset in bytes.
2995 return isUInt<12>(OffsetSize);
2996 }
2997 case AMDGPUAS::CONSTANT_ADDRESS: {
2998 // SMRD instructions have an 8-bit offset in dwords on SI and
2999 // a 20-bit offset in bytes on VI.
3000 if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 3001 return isUInt<20>(OffsetSize); 3002 else 3003 return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); 3004 } 3005 case AMDGPUAS::LOCAL_ADDRESS: 3006 case AMDGPUAS::REGION_ADDRESS: { 3007 // The single offset versions have a 16-bit offset in bytes. 3008 return isUInt<16>(OffsetSize); 3009 } 3010 case AMDGPUAS::PRIVATE_ADDRESS: 3011 // Indirect register addressing does not use any offsets. 3012 default: 3013 return 0; 3014 } 3015 } 3016 3017 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) 3018 3019 // This is a variant of 3020 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), 3021 // 3022 // The normal DAG combiner will do this, but only if the add has one use since 3023 // that would increase the number of instructions. 3024 // 3025 // This prevents us from seeing a constant offset that can be folded into a 3026 // memory instruction's addressing mode. If we know the resulting add offset of 3027 // a pointer can be folded into an addressing offset, we can replace the pointer 3028 // operand with the add of new constant offset. This eliminates one of the uses, 3029 // and may allow the remaining use to also be simplified. 3030 // 3031 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, 3032 unsigned AddrSpace, 3033 DAGCombinerInfo &DCI) const { 3034 SDValue N0 = N->getOperand(0); 3035 SDValue N1 = N->getOperand(1); 3036 3037 if (N0.getOpcode() != ISD::ADD) 3038 return SDValue(); 3039 3040 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1); 3041 if (!CN1) 3042 return SDValue(); 3043 3044 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 3045 if (!CAdd) 3046 return SDValue(); 3047 3048 // If the resulting offset is too large, we can't fold it into the addressing 3049 // mode offset. 3050 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); 3051 if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget())) 3052 return SDValue(); 3053 3054 SelectionDAG &DAG = DCI.DAG; 3055 SDLoc SL(N); 3056 EVT VT = N->getValueType(0); 3057 3058 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); 3059 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); 3060 3061 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); 3062 } 3063 3064 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) { 3065 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) || 3066 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) || 3067 (Opc == ISD::XOR && Val == 0); 3068 } 3069 3070 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This 3071 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit 3072 // integer combine opportunities since most 64-bit operations are decomposed 3073 // this way. TODO: We won't want this for SALU especially if it is an inline 3074 // immediate. 3075 SDValue SITargetLowering::splitBinaryBitConstantOp( 3076 DAGCombinerInfo &DCI, 3077 const SDLoc &SL, 3078 unsigned Opc, SDValue LHS, 3079 const ConstantSDNode *CRHS) const { 3080 uint64_t Val = CRHS->getZExtValue(); 3081 uint32_t ValLo = Lo_32(Val); 3082 uint32_t ValHi = Hi_32(Val); 3083 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 3084 3085 if ((bitOpWithConstantIsReducible(Opc, ValLo) || 3086 bitOpWithConstantIsReducible(Opc, ValHi)) || 3087 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) { 3088 // If we need to materialize a 64-bit immediate, it will be split up later 3089 // anyway. 
Avoid creating the harder to understand 64-bit immediate 3090 // materialization. 3091 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi); 3092 } 3093 3094 return SDValue(); 3095 } 3096 3097 SDValue SITargetLowering::performAndCombine(SDNode *N, 3098 DAGCombinerInfo &DCI) const { 3099 if (DCI.isBeforeLegalize()) 3100 return SDValue(); 3101 3102 SelectionDAG &DAG = DCI.DAG; 3103 EVT VT = N->getValueType(0); 3104 SDValue LHS = N->getOperand(0); 3105 SDValue RHS = N->getOperand(1); 3106 3107 3108 if (VT == MVT::i64) { 3109 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); 3110 if (CRHS) { 3111 if (SDValue Split 3112 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) 3113 return Split; 3114 } 3115 } 3116 3117 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> 3118 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) 3119 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) { 3120 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); 3121 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); 3122 3123 SDValue X = LHS.getOperand(0); 3124 SDValue Y = RHS.getOperand(0); 3125 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) 3126 return SDValue(); 3127 3128 if (LCC == ISD::SETO) { 3129 if (X != LHS.getOperand(1)) 3130 return SDValue(); 3131 3132 if (RCC == ISD::SETUNE) { 3133 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); 3134 if (!C1 || !C1->isInfinity() || C1->isNegative()) 3135 return SDValue(); 3136 3137 const uint32_t Mask = SIInstrFlags::N_NORMAL | 3138 SIInstrFlags::N_SUBNORMAL | 3139 SIInstrFlags::N_ZERO | 3140 SIInstrFlags::P_ZERO | 3141 SIInstrFlags::P_SUBNORMAL | 3142 SIInstrFlags::P_NORMAL; 3143 3144 static_assert(((~(SIInstrFlags::S_NAN | 3145 SIInstrFlags::Q_NAN | 3146 SIInstrFlags::N_INFINITY | 3147 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, 3148 "mask not equal"); 3149 3150 SDLoc DL(N); 3151 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 3152 X, DAG.getConstant(Mask, DL, MVT::i32)); 3153 } 3154 } 3155 } 3156 3157 return SDValue(); 3158 } 3159 3160 SDValue SITargetLowering::performOrCombine(SDNode *N, 3161 DAGCombinerInfo &DCI) const { 3162 SelectionDAG &DAG = DCI.DAG; 3163 SDValue LHS = N->getOperand(0); 3164 SDValue RHS = N->getOperand(1); 3165 3166 EVT VT = N->getValueType(0); 3167 if (VT == MVT::i1) { 3168 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) 3169 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && 3170 RHS.getOpcode() == AMDGPUISD::FP_CLASS) { 3171 SDValue Src = LHS.getOperand(0); 3172 if (Src != RHS.getOperand(0)) 3173 return SDValue(); 3174 3175 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); 3176 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); 3177 if (!CLHS || !CRHS) 3178 return SDValue(); 3179 3180 // Only 10 bits are used. 3181 static const uint32_t MaxMask = 0x3ff; 3182 3183 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; 3184 SDLoc DL(N); 3185 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 3186 Src, DAG.getConstant(NewMask, DL, MVT::i32)); 3187 } 3188 3189 return SDValue(); 3190 } 3191 3192 if (VT != MVT::i64) 3193 return SDValue(); 3194 3195 // TODO: This could be a generic combine with a predicate for extracting the 3196 // high half of an integer being free. 
3197 3198 // (or i64:x, (zero_extend i32:y)) -> 3199 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x))) 3200 if (LHS.getOpcode() == ISD::ZERO_EXTEND && 3201 RHS.getOpcode() != ISD::ZERO_EXTEND) 3202 std::swap(LHS, RHS); 3203 3204 if (RHS.getOpcode() == ISD::ZERO_EXTEND) { 3205 SDValue ExtSrc = RHS.getOperand(0); 3206 EVT SrcVT = ExtSrc.getValueType(); 3207 if (SrcVT == MVT::i32) { 3208 SDLoc SL(N); 3209 SDValue LowLHS, HiBits; 3210 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG); 3211 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc); 3212 3213 DCI.AddToWorklist(LowOr.getNode()); 3214 DCI.AddToWorklist(HiBits.getNode()); 3215 3216 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 3217 LowOr, HiBits); 3218 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 3219 } 3220 } 3221 3222 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 3223 if (CRHS) { 3224 if (SDValue Split 3225 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS)) 3226 return Split; 3227 } 3228 3229 return SDValue(); 3230 } 3231 3232 SDValue SITargetLowering::performXorCombine(SDNode *N, 3233 DAGCombinerInfo &DCI) const { 3234 EVT VT = N->getValueType(0); 3235 if (VT != MVT::i64) 3236 return SDValue(); 3237 3238 SDValue LHS = N->getOperand(0); 3239 SDValue RHS = N->getOperand(1); 3240 3241 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); 3242 if (CRHS) { 3243 if (SDValue Split 3244 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS)) 3245 return Split; 3246 } 3247 3248 return SDValue(); 3249 } 3250 3251 SDValue SITargetLowering::performClassCombine(SDNode *N, 3252 DAGCombinerInfo &DCI) const { 3253 SelectionDAG &DAG = DCI.DAG; 3254 SDValue Mask = N->getOperand(1); 3255 3256 // fp_class x, 0 -> false 3257 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { 3258 if (CMask->isNullValue()) 3259 return DAG.getConstant(0, SDLoc(N), MVT::i1); 3260 } 3261 3262 if (N->getOperand(0).isUndef()) 3263 return DAG.getUNDEF(MVT::i1); 3264 3265 return SDValue(); 3266 } 3267 3268 // Constant fold canonicalize. 3269 SDValue SITargetLowering::performFCanonicalizeCombine( 3270 SDNode *N, 3271 DAGCombinerInfo &DCI) const { 3272 ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); 3273 if (!CFP) 3274 return SDValue(); 3275 3276 SelectionDAG &DAG = DCI.DAG; 3277 const APFloat &C = CFP->getValueAPF(); 3278 3279 // Flush denormals to 0 if not enabled. 3280 if (C.isDenormal()) { 3281 EVT VT = N->getValueType(0); 3282 if (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) 3283 return DAG.getConstantFP(0.0, SDLoc(N), VT); 3284 3285 if (VT == MVT::f64 && !Subtarget->hasFP64Denormals()) 3286 return DAG.getConstantFP(0.0, SDLoc(N), VT); 3287 } 3288 3289 if (C.isNaN()) { 3290 EVT VT = N->getValueType(0); 3291 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics()); 3292 if (C.isSignaling()) { 3293 // Quiet a signaling NaN. 3294 return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); 3295 } 3296 3297 // Make sure it is the canonical NaN bitpattern. 3298 // 3299 // TODO: Can we use -1 as the canonical NaN value since it's an inline 3300 // immediate? 
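
    // For reference, APFloat::getQNaN() for f32 is the bit pattern 0x7fc00000
    // (quiet bit set, zero payload) and 0x7ff8000000000000 for f64. An
    // all-ones value is also a quiet (negative) NaN encoding, which is what
    // the TODO above is alluding to.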
    if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
      return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
  }

  return SDValue(CFP, 0);
}

static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
  switch (Opc) {
  case ISD::FMAXNUM:
    return AMDGPUISD::FMAX3;
  case ISD::SMAX:
    return AMDGPUISD::SMAX3;
  case ISD::UMAX:
    return AMDGPUISD::UMAX3;
  case ISD::FMINNUM:
    return AMDGPUISD::FMIN3;
  case ISD::SMIN:
    return AMDGPUISD::SMIN3;
  case ISD::UMIN:
    return AMDGPUISD::UMIN3;
  default:
    llvm_unreachable("Not a min/max opcode");
  }
}

static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
                                        SDValue Op0, SDValue Op1, bool Signed) {
  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
  if (!K1)
    return SDValue();

  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  if (Signed) {
    if (K0->getAPIntValue().sge(K1->getAPIntValue()))
      return SDValue();
  } else {
    if (K0->getAPIntValue().uge(K1->getAPIntValue()))
      return SDValue();
  }

  EVT VT = K0->getValueType(0);
  return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
                     Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
}

static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
  if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
    return true;

  return DAG.isKnownNeverNaN(Op);
}

static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
                                       SDValue Op0, SDValue Op1) {
  ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
  if (!K1)
    return SDValue();

  ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  // Ordered >= (although NaN inputs should have folded away by now).
  APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
  if (Cmp == APFloat::cmpGreaterThan)
    return SDValue();

  // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
  // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
  // give the other result, which is different from med3 with a NaN input.
  SDValue Var = Op0.getOperand(0);
  if (!isKnownNeverSNan(DAG, Var))
    return SDValue();

  return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
                     Var, SDValue(K0, 0), SDValue(K1, 0));
}

SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Only do this if the inner op has one use since this would just increase
  // register pressure for no benefit.

  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
    // max(max(a, b), c) -> max3(a, b, c)
    // min(min(a, b), c) -> min3(a, b, c)
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
                         DL,
                         N->getValueType(0),
                         Op0.getOperand(0),
                         Op0.getOperand(1),
                         Op1);
    }

    // Try commuted.
3408 // max(a, max(b, c)) -> max3(a, b, c) 3409 // min(a, min(b, c)) -> min3(a, b, c) 3410 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { 3411 SDLoc DL(N); 3412 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 3413 DL, 3414 N->getValueType(0), 3415 Op0, 3416 Op1.getOperand(0), 3417 Op1.getOperand(1)); 3418 } 3419 } 3420 3421 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) 3422 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { 3423 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true)) 3424 return Med3; 3425 } 3426 3427 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { 3428 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false)) 3429 return Med3; 3430 } 3431 3432 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) 3433 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || 3434 (Opc == AMDGPUISD::FMIN_LEGACY && 3435 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && 3436 N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) { 3437 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) 3438 return Res; 3439 } 3440 3441 return SDValue(); 3442 } 3443 3444 SDValue SITargetLowering::performSetCCCombine(SDNode *N, 3445 DAGCombinerInfo &DCI) const { 3446 SelectionDAG &DAG = DCI.DAG; 3447 SDLoc SL(N); 3448 3449 SDValue LHS = N->getOperand(0); 3450 SDValue RHS = N->getOperand(1); 3451 EVT VT = LHS.getValueType(); 3452 3453 if (VT != MVT::f32 && VT != MVT::f64) 3454 return SDValue(); 3455 3456 // Match isinf pattern 3457 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) 3458 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 3459 if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { 3460 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 3461 if (!CRHS) 3462 return SDValue(); 3463 3464 const APFloat &APF = CRHS->getValueAPF(); 3465 if (APF.isInfinity() && !APF.isNegative()) { 3466 unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; 3467 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), 3468 DAG.getConstant(Mask, SL, MVT::i32)); 3469 } 3470 } 3471 3472 return SDValue(); 3473 } 3474 3475 SDValue SITargetLowering::PerformDAGCombine(SDNode *N, 3476 DAGCombinerInfo &DCI) const { 3477 SelectionDAG &DAG = DCI.DAG; 3478 SDLoc DL(N); 3479 3480 switch (N->getOpcode()) { 3481 default: 3482 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 3483 case ISD::SETCC: 3484 return performSetCCCombine(N, DCI); 3485 case ISD::FMAXNUM: 3486 case ISD::FMINNUM: 3487 case ISD::SMAX: 3488 case ISD::SMIN: 3489 case ISD::UMAX: 3490 case ISD::UMIN: 3491 case AMDGPUISD::FMIN_LEGACY: 3492 case AMDGPUISD::FMAX_LEGACY: { 3493 if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && 3494 N->getValueType(0) != MVT::f64 && 3495 getTargetMachine().getOptLevel() > CodeGenOpt::None) 3496 return performMinMaxCombine(N, DCI); 3497 break; 3498 } 3499 3500 case AMDGPUISD::CVT_F32_UBYTE0: 3501 case AMDGPUISD::CVT_F32_UBYTE1: 3502 case AMDGPUISD::CVT_F32_UBYTE2: 3503 case AMDGPUISD::CVT_F32_UBYTE3: { 3504 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; 3505 SDValue Src = N->getOperand(0); 3506 3507 // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero. 
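
    // Worked instance of the fold below (values made up): for cvt_f32_ubyte1
    // (Offset == 1) of (srl x, 16), SrcOffset = 16 + 8 * 1 = 24, so the node
    // becomes cvt_f32_ubyte3 x, reading bits [31:24] of x directly.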
    if (Src.getOpcode() == ISD::SRL) {
      // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
      // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
      // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x

      if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
        unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
        if (SrcOffset < 32 && SrcOffset % 8 == 0) {
          return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, DL,
                             MVT::f32, Src.getOperand(0));
        }
      }
    }

    APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);

    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
        TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
      DCI.CommitTargetLoweringOpt(TLO);
    }

    break;
  }

  case ISD::UINT_TO_FP: {
    return performUCharToFloatCombine(N, DCI);
  }
  case ISD::FADD: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    EVT VT = N->getValueType(0);
    if (VT != MVT::f32)
      break;

    // Only do this if we are not trying to support denormals. v_mad_f32 does
    // not support denormals ever.
    if (Subtarget->hasFP32Denormals())
      break;

    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);

    // These should really be instruction patterns, but writing patterns with
    // source modifiers is a pain.

    // fadd (fadd (a, a), b) -> mad 2.0, a, b
    if (LHS.getOpcode() == ISD::FADD) {
      SDValue A = LHS.getOperand(0);
      if (A == LHS.getOperand(1)) {
        const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
        return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS);
      }
    }

    // fadd (b, fadd (a, a)) -> mad 2.0, a, b
    if (RHS.getOpcode() == ISD::FADD) {
      SDValue A = RHS.getOperand(0);
      if (A == RHS.getOperand(1)) {
        const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
        return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS);
      }
    }

    return SDValue();
  }
  case ISD::FSUB: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    EVT VT = N->getValueType(0);

    // Try to get the fneg to fold into the source modifier. This undoes generic
    // DAG combines and folds them into the mad.
    //
    // Only do this if we are not trying to support denormals. v_mad_f32 does
    // not support denormals ever.
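
    // Numeric sanity check of the folds below (made-up values a = 3.0,
    // c = 1.0): (fsub (fadd a, a), c) is 5.0 and mad(2.0, a, -c) = 6.0 - 1.0
    // is also 5.0; (fsub c, (fadd a, a)) is -5.0, matching
    // mad(-2.0, a, c) = -6.0 + 1.0.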
3589 if (VT == MVT::f32 && 3590 !Subtarget->hasFP32Denormals()) { 3591 SDValue LHS = N->getOperand(0); 3592 SDValue RHS = N->getOperand(1); 3593 if (LHS.getOpcode() == ISD::FADD) { 3594 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) 3595 3596 SDValue A = LHS.getOperand(0); 3597 if (A == LHS.getOperand(1)) { 3598 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 3599 SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); 3600 3601 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); 3602 } 3603 } 3604 3605 if (RHS.getOpcode() == ISD::FADD) { 3606 // (fsub c, (fadd a, a)) -> mad -2.0, a, c 3607 3608 SDValue A = RHS.getOperand(0); 3609 if (A == RHS.getOperand(1)) { 3610 const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32); 3611 return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); 3612 } 3613 } 3614 3615 return SDValue(); 3616 } 3617 3618 break; 3619 } 3620 case ISD::LOAD: 3621 case ISD::STORE: 3622 case ISD::ATOMIC_LOAD: 3623 case ISD::ATOMIC_STORE: 3624 case ISD::ATOMIC_CMP_SWAP: 3625 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: 3626 case ISD::ATOMIC_SWAP: 3627 case ISD::ATOMIC_LOAD_ADD: 3628 case ISD::ATOMIC_LOAD_SUB: 3629 case ISD::ATOMIC_LOAD_AND: 3630 case ISD::ATOMIC_LOAD_OR: 3631 case ISD::ATOMIC_LOAD_XOR: 3632 case ISD::ATOMIC_LOAD_NAND: 3633 case ISD::ATOMIC_LOAD_MIN: 3634 case ISD::ATOMIC_LOAD_MAX: 3635 case ISD::ATOMIC_LOAD_UMIN: 3636 case ISD::ATOMIC_LOAD_UMAX: 3637 case AMDGPUISD::ATOMIC_INC: 3638 case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics. 3639 if (DCI.isBeforeLegalize()) 3640 break; 3641 3642 MemSDNode *MemNode = cast<MemSDNode>(N); 3643 SDValue Ptr = MemNode->getBasePtr(); 3644 3645 // TODO: We could also do this for multiplies. 3646 unsigned AS = MemNode->getAddressSpace(); 3647 if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { 3648 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); 3649 if (NewPtr) { 3650 SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end()); 3651 3652 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr; 3653 return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); 3654 } 3655 } 3656 break; 3657 } 3658 case ISD::AND: 3659 return performAndCombine(N, DCI); 3660 case ISD::OR: 3661 return performOrCombine(N, DCI); 3662 case ISD::XOR: 3663 return performXorCombine(N, DCI); 3664 case AMDGPUISD::FP_CLASS: 3665 return performClassCombine(N, DCI); 3666 case ISD::FCANONICALIZE: 3667 return performFCanonicalizeCombine(N, DCI); 3668 case AMDGPUISD::FRACT: 3669 case AMDGPUISD::RCP: 3670 case AMDGPUISD::RSQ: 3671 case AMDGPUISD::RCP_LEGACY: 3672 case AMDGPUISD::RSQ_LEGACY: 3673 case AMDGPUISD::RSQ_CLAMP: 3674 case AMDGPUISD::LDEXP: { 3675 SDValue Src = N->getOperand(0); 3676 if (Src.isUndef()) 3677 return Src; 3678 break; 3679 } 3680 } 3681 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 3682 } 3683 3684 /// \brief Helper function for adjustWritemask 3685 static unsigned SubIdx2Lane(unsigned Idx) { 3686 switch (Idx) { 3687 default: return 0; 3688 case AMDGPU::sub0: return 0; 3689 case AMDGPU::sub1: return 1; 3690 case AMDGPU::sub2: return 2; 3691 case AMDGPU::sub3: return 3; 3692 } 3693 } 3694 3695 /// \brief Adjust the writemask of MIMG instructions 3696 void SITargetLowering::adjustWritemask(MachineSDNode *&Node, 3697 SelectionDAG &DAG) const { 3698 SDNode *Users[4] = { }; 3699 unsigned Lane = 0; 3700 unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 
      2 : 3;
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return;

    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Set which texture component corresponds to the lane.
    unsigned Comp;
    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
      assert(Dmask);
      Comp = countTrailingZeros(Dmask);
      Dmask &= ~(1 << Comp);
    }

    // Abort if we have more than one user per component
    if (Users[Lane])
      return;

    Users[Lane] = *I;
    NewDmask |= 1 << Comp;
  }

  // Abort if there's no change
  if (NewDmask == OldDmask)
    return;

  // Adjust the writemask in the node
  std::vector<SDValue> Ops;
  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);

  // If we only got one lane, replace it with a copy
  // (if NewDmask has only one bit set...)
  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
    SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
                                       MVT::i32);
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                      SDLoc(), Users[Lane]->getValueType(0),
                                      SDValue(Node, 0), RC);
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {

    SDNode *User = Users[i];
    if (!User)
      continue;

    SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    }
  }
}

static bool isFrameIndexOp(SDValue Op) {
  if (Op.getOpcode() == ISD::AssertZext)
    Op = Op.getOperand(0);

  return isa<FrameIndexSDNode>(Op);
}

/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                     SelectionDAG &DAG) const {

  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isFrameIndexOp(Node->getOperand(i))) {
      Ops.push_back(Node->getOperand(i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                                             Node->getOperand(i).getValueType(),
                                             Node->getOperand(i)), 0));
  }

  DAG.UpdateNodeOperands(Node, Ops);
}

/// \brief Fold the instructions after selecting them.
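/// Currently this adjusts the dmask of MIMG loads (but not stores or gather4)
/// based on which result components are actually used, and legalizes frame
/// index operands of INSERT_SUBREG and REG_SEQUENCE into registers.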
3807 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, 3808 SelectionDAG &DAG) const { 3809 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 3810 unsigned Opcode = Node->getMachineOpcode(); 3811 3812 if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && 3813 !TII->isGather4(Opcode)) 3814 adjustWritemask(Node, DAG); 3815 3816 if (Opcode == AMDGPU::INSERT_SUBREG || 3817 Opcode == AMDGPU::REG_SEQUENCE) { 3818 legalizeTargetIndependentNode(Node, DAG); 3819 return Node; 3820 } 3821 return Node; 3822 } 3823 3824 /// \brief Assign the register class depending on the number of 3825 /// bits set in the writemask 3826 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 3827 SDNode *Node) const { 3828 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 3829 3830 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 3831 3832 if (TII->isVOP3(MI.getOpcode())) { 3833 // Make sure constant bus requirements are respected. 3834 TII->legalizeOperandsVOP3(MRI, MI); 3835 return; 3836 } 3837 3838 if (TII->isMIMG(MI)) { 3839 unsigned VReg = MI.getOperand(0).getReg(); 3840 unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4; 3841 unsigned Writemask = MI.getOperand(DmaskIdx).getImm(); 3842 unsigned BitsSet = 0; 3843 for (unsigned i = 0; i < 4; ++i) 3844 BitsSet += Writemask & (1 << i) ? 1 : 0; 3845 3846 const TargetRegisterClass *RC; 3847 switch (BitsSet) { 3848 default: return; 3849 case 1: RC = &AMDGPU::VGPR_32RegClass; break; 3850 case 2: RC = &AMDGPU::VReg_64RegClass; break; 3851 case 3: RC = &AMDGPU::VReg_96RegClass; break; 3852 } 3853 3854 unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet); 3855 MI.setDesc(TII->get(NewOpcode)); 3856 MRI.setRegClass(VReg, RC); 3857 return; 3858 } 3859 3860 // Replace unused atomics with the no return version. 3861 int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode()); 3862 if (NoRetAtomicOp != -1) { 3863 if (!Node->hasAnyUseOfValue(0)) { 3864 MI.setDesc(TII->get(NoRetAtomicOp)); 3865 MI.RemoveOperand(0); 3866 return; 3867 } 3868 3869 // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg 3870 // instruction, because the return type of these instructions is a vec2 of 3871 // the memory type, so it can be tied to the input operand. 3872 // This means these instructions always have a use, so we need to add a 3873 // special case to check if the atomic has only one extract_subreg use, 3874 // which itself has no uses. 3875 if ((Node->hasNUsesOfValue(1, 0) && 3876 Node->use_begin()->isMachineOpcode() && 3877 Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG && 3878 !Node->use_begin()->hasAnyUseOfValue(0))) { 3879 unsigned Def = MI.getOperand(0).getReg(); 3880 3881 // Change this into a noret atomic. 3882 MI.setDesc(TII->get(NoRetAtomicOp)); 3883 MI.RemoveOperand(0); 3884 3885 // If we only remove the def operand from the atomic instruction, the 3886 // extract_subreg will be left with a use of a vreg without a def. 3887 // So we need to insert an implicit_def to avoid machine verifier 3888 // errors. 
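
      // Illustrative sketch (operands simplified): after the def is removed,
      // a user like
      //   %val = EXTRACT_SUBREG %atomic_result, sub0
      // would read %atomic_result with no remaining def, so the IMPLICIT_DEF
      // built below supplies one for the verifier.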
3889 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), 3890 TII->get(AMDGPU::IMPLICIT_DEF), Def); 3891 } 3892 return; 3893 } 3894 } 3895 3896 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, 3897 uint64_t Val) { 3898 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); 3899 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); 3900 } 3901 3902 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, 3903 const SDLoc &DL, 3904 SDValue Ptr) const { 3905 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 3906 3907 // Build the half of the subregister with the constants before building the 3908 // full 128-bit register. If we are building multiple resource descriptors, 3909 // this will allow CSEing of the 2-component register. 3910 const SDValue Ops0[] = { 3911 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), 3912 buildSMovImm32(DAG, DL, 0), 3913 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 3914 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), 3915 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) 3916 }; 3917 3918 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, 3919 MVT::v2i32, Ops0), 0); 3920 3921 // Combine the constants and the pointer. 3922 const SDValue Ops1[] = { 3923 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), 3924 Ptr, 3925 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), 3926 SubRegHi, 3927 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) 3928 }; 3929 3930 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); 3931 } 3932 3933 /// \brief Return a resource descriptor with the 'Add TID' bit enabled 3934 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] 3935 /// of the resource descriptor) to create an offset, which is added to 3936 /// the resource pointer. 
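/// In the sequence built below, \p RsrcDword1 is OR'd into the high half of
/// \p Ptr (dword 1 of the descriptor), while \p RsrcDword2And3 provides
/// dwords 2 and 3 (its low and high 32 bits, respectively).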
3937 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, 3938 SDValue Ptr, uint32_t RsrcDword1, 3939 uint64_t RsrcDword2And3) const { 3940 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); 3941 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); 3942 if (RsrcDword1) { 3943 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, 3944 DAG.getConstant(RsrcDword1, DL, MVT::i32)), 3945 0); 3946 } 3947 3948 SDValue DataLo = buildSMovImm32(DAG, DL, 3949 RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); 3950 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); 3951 3952 const SDValue Ops[] = { 3953 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), 3954 PtrLo, 3955 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 3956 PtrHi, 3957 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), 3958 DataLo, 3959 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), 3960 DataHi, 3961 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) 3962 }; 3963 3964 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); 3965 } 3966 3967 SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 3968 const TargetRegisterClass *RC, 3969 unsigned Reg, EVT VT) const { 3970 SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); 3971 3972 return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), 3973 cast<RegisterSDNode>(VReg)->getReg(), VT); 3974 } 3975 3976 //===----------------------------------------------------------------------===// 3977 // SI Inline Assembly Support 3978 //===----------------------------------------------------------------------===// 3979 3980 std::pair<unsigned, const TargetRegisterClass *> 3981 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 3982 StringRef Constraint, 3983 MVT VT) const { 3984 3985 if (Constraint.size() == 1) { 3986 switch (Constraint[0]) { 3987 case 's': 3988 case 'r': 3989 switch (VT.getSizeInBits()) { 3990 default: 3991 return std::make_pair(0U, nullptr); 3992 case 32: 3993 return std::make_pair(0U, &AMDGPU::SReg_32RegClass); 3994 case 64: 3995 return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); 3996 case 128: 3997 return std::make_pair(0U, &AMDGPU::SReg_128RegClass); 3998 case 256: 3999 return std::make_pair(0U, &AMDGPU::SReg_256RegClass); 4000 } 4001 4002 case 'v': 4003 switch (VT.getSizeInBits()) { 4004 default: 4005 return std::make_pair(0U, nullptr); 4006 case 32: 4007 return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); 4008 case 64: 4009 return std::make_pair(0U, &AMDGPU::VReg_64RegClass); 4010 case 96: 4011 return std::make_pair(0U, &AMDGPU::VReg_96RegClass); 4012 case 128: 4013 return std::make_pair(0U, &AMDGPU::VReg_128RegClass); 4014 case 256: 4015 return std::make_pair(0U, &AMDGPU::VReg_256RegClass); 4016 case 512: 4017 return std::make_pair(0U, &AMDGPU::VReg_512RegClass); 4018 } 4019 } 4020 } 4021 4022 if (Constraint.size() > 1) { 4023 const TargetRegisterClass *RC = nullptr; 4024 if (Constraint[1] == 'v') { 4025 RC = &AMDGPU::VGPR_32RegClass; 4026 } else if (Constraint[1] == 's') { 4027 RC = &AMDGPU::SGPR_32RegClass; 4028 } 4029 4030 if (RC) { 4031 uint32_t Idx; 4032 bool Failed = Constraint.substr(2).getAsInteger(10, Idx); 4033 if (!Failed && Idx < RC->getNumRegs()) 4034 return std::make_pair(RC->getRegister(Idx), RC); 4035 } 4036 } 4037 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 4038 } 4039 4040 SITargetLowering::ConstraintType 4041 
SITargetLowering::getConstraintType(StringRef Constraint) const { 4042 if (Constraint.size() == 1) { 4043 switch (Constraint[0]) { 4044 default: break; 4045 case 's': 4046 case 'v': 4047 return C_RegisterClass; 4048 } 4049 } 4050 return TargetLowering::getConstraintType(Constraint); 4051 } 4052
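
// Example (illustrative only): inline assembly such as
//   asm volatile("v_mov_b32 %0, %1" : "=v"(out) : "s"(in));
// resolves the "v" output to a VGPR class and the "s" input to an SGPR class
// through getRegForInlineAsmConstraint above, and getConstraintType reports
// both constraints as C_RegisterClass.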