1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Custom DAG lowering for SI 12 // 13 //===----------------------------------------------------------------------===// 14 15 #ifdef _MSC_VER 16 // Provide M_PI. 17 #define _USE_MATH_DEFINES 18 #include <cmath> 19 #endif 20 21 #include "SIISelLowering.h" 22 #include "AMDGPU.h" 23 #include "AMDGPUIntrinsicInfo.h" 24 #include "AMDGPUSubtarget.h" 25 #include "SIInstrInfo.h" 26 #include "SIMachineFunctionInfo.h" 27 #include "SIRegisterInfo.h" 28 #include "llvm/ADT/BitVector.h" 29 #include "llvm/ADT/StringSwitch.h" 30 #include "llvm/CodeGen/CallingConvLower.h" 31 #include "llvm/CodeGen/MachineInstrBuilder.h" 32 #include "llvm/CodeGen/MachineRegisterInfo.h" 33 #include "llvm/CodeGen/SelectionDAG.h" 34 #include "llvm/IR/DiagnosticInfo.h" 35 #include "llvm/IR/Function.h" 36 #include "llvm/ADT/SmallString.h" 37 38 using namespace llvm; 39 40 SITargetLowering::SITargetLowering(TargetMachine &TM, 41 const AMDGPUSubtarget &STI) 42 : AMDGPUTargetLowering(TM, STI) { 43 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); 44 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); 45 46 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); 47 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); 48 49 addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); 50 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); 51 addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); 52 53 addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass); 54 addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass); 55 56 addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); 57 addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); 58 59 
addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); 60 addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); 61 62 addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); 63 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); 64 65 computeRegisterProperties(STI.getRegisterInfo()); 66 67 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); 68 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); 69 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); 70 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); 71 72 setOperationAction(ISD::ADD, MVT::i32, Legal); 73 setOperationAction(ISD::ADDC, MVT::i32, Legal); 74 setOperationAction(ISD::ADDE, MVT::i32, Legal); 75 setOperationAction(ISD::SUBC, MVT::i32, Legal); 76 setOperationAction(ISD::SUBE, MVT::i32, Legal); 77 78 setOperationAction(ISD::FSIN, MVT::f32, Custom); 79 setOperationAction(ISD::FCOS, MVT::f32, Custom); 80 81 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 82 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 83 84 // We need to custom lower vector stores from local memory 85 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 86 setOperationAction(ISD::LOAD, MVT::v8i32, Custom); 87 setOperationAction(ISD::LOAD, MVT::v16i32, Custom); 88 89 setOperationAction(ISD::STORE, MVT::v8i32, Custom); 90 setOperationAction(ISD::STORE, MVT::v16i32, Custom); 91 92 setOperationAction(ISD::STORE, MVT::i1, Custom); 93 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 94 95 setOperationAction(ISD::SELECT, MVT::i64, Custom); 96 setOperationAction(ISD::SELECT, MVT::f64, Promote); 97 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); 98 99 setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); 100 setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); 101 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); 102 setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); 103 104 setOperationAction(ISD::SETCC, MVT::i1, Promote); 105 setOperationAction(ISD::SETCC, MVT::v2i1, Expand); 
106 setOperationAction(ISD::SETCC, MVT::v4i1, Expand); 107 108 setOperationAction(ISD::BSWAP, MVT::i32, Legal); 109 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 110 111 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); 112 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); 113 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); 114 115 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); 116 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); 117 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); 118 119 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); 120 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); 121 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); 122 123 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 124 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); 125 126 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 127 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); 128 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); 129 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); 130 131 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 132 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 133 134 for (MVT VT : MVT::integer_valuetypes()) { 135 if (VT == MVT::i64) 136 continue; 137 138 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 139 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); 140 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); 141 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); 142 143 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 144 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); 145 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); 146 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); 147 148 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); 149 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, 
Legal); 150 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); 151 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); 152 } 153 154 for (MVT VT : MVT::integer_vector_valuetypes()) { 155 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); 156 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); 157 } 158 159 for (MVT VT : MVT::fp_valuetypes()) 160 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 161 162 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); 163 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); 164 165 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 166 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); 167 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); 168 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); 169 170 171 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); 172 173 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); 174 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); 175 176 setOperationAction(ISD::LOAD, MVT::i1, Custom); 177 178 setOperationAction(ISD::LOAD, MVT::v2i64, Promote); 179 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); 180 181 setOperationAction(ISD::STORE, MVT::v2i64, Promote); 182 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); 183 184 setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); 185 186 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 187 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 188 setOperationAction(ISD::FrameIndex, MVT::i32, Custom); 189 190 // These should use UDIVREM, so set them to expand 191 setOperationAction(ISD::UDIV, MVT::i64, Expand); 192 setOperationAction(ISD::UREM, MVT::i64, Expand); 193 194 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); 195 setOperationAction(ISD::SELECT, MVT::i1, Promote); 196 197 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); 198 199 200 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); 201 202 // We only support LOAD/STORE and vector manipulation ops 
for vectors 203 // with > 4 elements. 204 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { 205 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { 206 switch(Op) { 207 case ISD::LOAD: 208 case ISD::STORE: 209 case ISD::BUILD_VECTOR: 210 case ISD::BITCAST: 211 case ISD::EXTRACT_VECTOR_ELT: 212 case ISD::INSERT_VECTOR_ELT: 213 case ISD::INSERT_SUBVECTOR: 214 case ISD::EXTRACT_SUBVECTOR: 215 case ISD::SCALAR_TO_VECTOR: 216 break; 217 case ISD::CONCAT_VECTORS: 218 setOperationAction(Op, VT, Custom); 219 break; 220 default: 221 setOperationAction(Op, VT, Expand); 222 break; 223 } 224 } 225 } 226 227 // Most operations are naturally 32-bit vector operations. We only support 228 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. 229 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { 230 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); 231 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); 232 233 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); 234 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32); 235 236 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); 237 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32); 238 239 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); 240 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); 241 } 242 243 if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 244 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 245 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 246 setOperationAction(ISD::FRINT, MVT::f64, Legal); 247 } 248 249 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 250 setOperationAction(ISD::FDIV, MVT::f32, Custom); 251 setOperationAction(ISD::FDIV, MVT::f64, Custom); 252 253 setTargetDAGCombine(ISD::FADD); 254 setTargetDAGCombine(ISD::FSUB); 255 setTargetDAGCombine(ISD::FMINNUM); 256 setTargetDAGCombine(ISD::FMAXNUM); 257 setTargetDAGCombine(ISD::SMIN); 258 
setTargetDAGCombine(ISD::SMAX); 259 setTargetDAGCombine(ISD::UMIN); 260 setTargetDAGCombine(ISD::UMAX); 261 setTargetDAGCombine(ISD::SETCC); 262 setTargetDAGCombine(ISD::AND); 263 setTargetDAGCombine(ISD::OR); 264 setTargetDAGCombine(ISD::UINT_TO_FP); 265 266 // All memory operations. Some folding on the pointer operand is done to help 267 // matching the constant offsets in the addressing modes. 268 setTargetDAGCombine(ISD::LOAD); 269 setTargetDAGCombine(ISD::STORE); 270 setTargetDAGCombine(ISD::ATOMIC_LOAD); 271 setTargetDAGCombine(ISD::ATOMIC_STORE); 272 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); 273 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); 274 setTargetDAGCombine(ISD::ATOMIC_SWAP); 275 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); 276 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); 277 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); 278 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); 279 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); 280 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); 281 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); 282 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); 283 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); 284 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); 285 286 setSchedulingPreference(Sched::RegPressure); 287 } 288 289 //===----------------------------------------------------------------------===// 290 // TargetLowering queries 291 //===----------------------------------------------------------------------===// 292 293 bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, 294 EVT) const { 295 // SI has some legal vector types, but no legal vector operations. Say no 296 // shuffles are legal in order to prefer scalarizing some vector operations. 297 return false; 298 } 299 300 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { 301 // Flat instructions do not have offsets, and only have the register 302 // address. 
303 return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); 304 } 305 306 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { 307 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and 308 // additionally can do r + r + i with addr64. 32-bit has more addressing 309 // mode options. Depending on the resource constant, it can also do 310 // (i64 r0) + (i32 r1) * (i14 i). 311 // 312 // Private arrays end up using a scratch buffer most of the time, so also 313 // assume those use MUBUF instructions. Scratch loads / stores are currently 314 // implemented as mubuf instructions with offen bit set, so slightly 315 // different than the normal addr64. 316 if (!isUInt<12>(AM.BaseOffs)) 317 return false; 318 319 // FIXME: Since we can split immediate into soffset and immediate offset, 320 // would it make sense to allow any immediate? 321 322 switch (AM.Scale) { 323 case 0: // r + i or just i, depending on HasBaseReg. 324 return true; 325 case 1: 326 return true; // We have r + r or r + i. 327 case 2: 328 if (AM.HasBaseReg) { 329 // Reject 2 * r + r. 330 return false; 331 } 332 333 // Allow 2 * r as r + r 334 // Or 2 * r + i is allowed as r + r + i. 335 return true; 336 default: // Don't allow n * r 337 return false; 338 } 339 } 340 341 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, 342 const AddrMode &AM, Type *Ty, 343 unsigned AS) const { 344 // No global is ever allowed as a base. 345 if (AM.BaseGV) 346 return false; 347 348 switch (AS) { 349 case AMDGPUAS::GLOBAL_ADDRESS: { 350 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 351 // Assume the we will use FLAT for all global memory accesses 352 // on VI. 353 // FIXME: This assumption is currently wrong. On VI we still use 354 // MUBUF instructions for the r + i addressing mode. As currently 355 // implemented, the MUBUF instructions only work on buffer < 4GB. 
356 // It may be possible to support > 4GB buffers with MUBUF instructions, 357 // by setting the stride value in the resource descriptor which would 358 // increase the size limit to (stride * 4GB). However, this is risky, 359 // because it has never been validated. 360 return isLegalFlatAddressingMode(AM); 361 } 362 363 return isLegalMUBUFAddressingMode(AM); 364 } 365 case AMDGPUAS::CONSTANT_ADDRESS: { 366 // If the offset isn't a multiple of 4, it probably isn't going to be 367 // correctly aligned. 368 if (AM.BaseOffs % 4 != 0) 369 return isLegalMUBUFAddressingMode(AM); 370 371 // There are no SMRD extloads, so if we have to do a small type access we 372 // will use a MUBUF load. 373 // FIXME?: We also need to do this if unaligned, but we don't know the 374 // alignment here. 375 if (DL.getTypeStoreSize(Ty) < 4) 376 return isLegalMUBUFAddressingMode(AM); 377 378 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { 379 // SMRD instructions have an 8-bit, dword offset on SI. 380 if (!isUInt<8>(AM.BaseOffs / 4)) 381 return false; 382 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { 383 // On CI+, this can also be a 32-bit literal constant offset. If it fits 384 // in 8-bits, it can use a smaller encoding. 385 if (!isUInt<32>(AM.BaseOffs / 4)) 386 return false; 387 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { 388 // On VI, these use the SMEM format and the offset is 20-bit in bytes. 389 if (!isUInt<20>(AM.BaseOffs)) 390 return false; 391 } else 392 llvm_unreachable("unhandled generation"); 393 394 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 
395 return true; 396 397 if (AM.Scale == 1 && AM.HasBaseReg) 398 return true; 399 400 return false; 401 } 402 403 case AMDGPUAS::PRIVATE_ADDRESS: 404 case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: 405 return isLegalMUBUFAddressingMode(AM); 406 407 case AMDGPUAS::LOCAL_ADDRESS: 408 case AMDGPUAS::REGION_ADDRESS: { 409 // Basic, single offset DS instructions allow a 16-bit unsigned immediate 410 // field. 411 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have 412 // an 8-bit dword offset but we don't know the alignment here. 413 if (!isUInt<16>(AM.BaseOffs)) 414 return false; 415 416 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 417 return true; 418 419 if (AM.Scale == 1 && AM.HasBaseReg) 420 return true; 421 422 return false; 423 } 424 case AMDGPUAS::FLAT_ADDRESS: 425 return isLegalFlatAddressingMode(AM); 426 427 default: 428 llvm_unreachable("unhandled address space"); 429 } 430 } 431 432 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 433 unsigned AddrSpace, 434 unsigned Align, 435 bool *IsFast) const { 436 if (IsFast) 437 *IsFast = false; 438 439 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, 440 // which isn't a simple VT. 441 if (!VT.isSimple() || VT == MVT::Other) 442 return false; 443 444 // TODO - CI+ supports unaligned memory accesses, but this requires driver 445 // support. 446 447 // XXX - The only mention I see of this in the ISA manual is for LDS direct 448 // reads the "byte address and must be dword aligned". Is it also true for the 449 // normal loads and stores? 450 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { 451 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte 452 // aligned, 8 byte access in a single operation using ds_read2/write2_b32 453 // with adjacent offsets. 454 bool AlignedBy4 = (Align % 4 == 0); 455 if (IsFast) 456 *IsFast = AlignedBy4; 457 return AlignedBy4; 458 } 459 460 // Smaller than dword value must be aligned. 
461 // FIXME: This should be allowed on CI+ 462 if (VT.bitsLT(MVT::i32)) 463 return false; 464 465 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the 466 // byte-address are ignored, thus forcing Dword alignment. 467 // This applies to private, global, and constant memory. 468 if (IsFast) 469 *IsFast = true; 470 471 return VT.bitsGT(MVT::i32) && Align % 4 == 0; 472 } 473 474 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, 475 unsigned SrcAlign, bool IsMemset, 476 bool ZeroMemset, 477 bool MemcpyStrSrc, 478 MachineFunction &MF) const { 479 // FIXME: Should account for address space here. 480 481 // The default fallback uses the private pointer size as a guess for a type to 482 // use. Make sure we switch these to 64-bit accesses. 483 484 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global 485 return MVT::v4i32; 486 487 if (Size >= 8 && DstAlign >= 4) 488 return MVT::v2i32; 489 490 // Use the default. 491 return MVT::Other; 492 } 493 494 static bool isFlatGlobalAddrSpace(unsigned AS) { 495 return AS == AMDGPUAS::GLOBAL_ADDRESS || 496 AS == AMDGPUAS::FLAT_ADDRESS || 497 AS == AMDGPUAS::CONSTANT_ADDRESS; 498 } 499 500 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, 501 unsigned DestAS) const { 502 return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); 503 } 504 505 506 bool SITargetLowering::isMemOpUniform(const SDNode *N) const { 507 const MemSDNode *MemNode = cast<MemSDNode>(N); 508 const Value *Ptr = MemNode->getMemOperand()->getValue(); 509 510 // UndefValue means this is a load of a kernel input. These are uniform. 
511 // Sometimes LDS instructions have constant pointers 512 if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) || 513 isa<GlobalValue>(Ptr)) 514 return true; 515 516 const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); 517 return I && I->getMetadata("amdgpu.uniform"); 518 } 519 520 TargetLoweringBase::LegalizeTypeAction 521 SITargetLowering::getPreferredVectorAction(EVT VT) const { 522 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) 523 return TypeSplitVector; 524 525 return TargetLoweringBase::getPreferredVectorAction(VT); 526 } 527 528 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 529 Type *Ty) const { 530 const SIInstrInfo *TII = 531 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 532 return TII->isInlineConstant(Imm); 533 } 534 535 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { 536 537 // SimplifySetCC uses this function to determine whether or not it should 538 // create setcc with i1 operands. We don't have instructions for i1 setcc. 
539 if (VT == MVT::i1 && Op == ISD::SETCC) 540 return false; 541 542 return TargetLowering::isTypeDesirableForOp(Op, VT); 543 } 544 545 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, 546 SDLoc SL, SDValue Chain, 547 unsigned Offset, bool Signed) const { 548 const DataLayout &DL = DAG.getDataLayout(); 549 MachineFunction &MF = DAG.getMachineFunction(); 550 const SIRegisterInfo *TRI = 551 static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); 552 unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); 553 554 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 555 556 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 557 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); 558 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); 559 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, 560 MRI.getLiveInVirtReg(InputPtrReg), PtrVT); 561 SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, 562 DAG.getConstant(Offset, SL, PtrVT)); 563 SDValue PtrOffset = DAG.getUNDEF(PtrVT); 564 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); 565 566 unsigned Align = DL.getABITypeAlignment(Ty); 567 568 ISD::LoadExtType ExtTy = Signed ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; 569 if (MemVT.isFloatingPoint()) 570 ExtTy = ISD::EXTLOAD; 571 572 return DAG.getLoad(ISD::UNINDEXED, ExtTy, 573 VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, 574 false, // isVolatile 575 true, // isNonTemporal 576 true, // isInvariant 577 Align); // Alignment 578 } 579 580 SDValue SITargetLowering::LowerFormalArguments( 581 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 582 const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, 583 SmallVectorImpl<SDValue> &InVals) const { 584 const SIRegisterInfo *TRI = 585 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); 586 587 MachineFunction &MF = DAG.getMachineFunction(); 588 FunctionType *FType = MF.getFunction()->getFunctionType(); 589 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 590 const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); 591 592 if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { 593 const Function *Fn = MF.getFunction(); 594 DiagnosticInfoUnsupported NoGraphicsHSA( 595 *Fn, "unsupported non-compute shaders with HSA", DL); 596 DAG.getContext()->diagnose(NoGraphicsHSA); 597 return SDValue(); 598 } 599 600 // FIXME: We currently assume all calling conventions are kernels. 
601 602 SmallVector<ISD::InputArg, 16> Splits; 603 BitVector Skipped(Ins.size()); 604 605 for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { 606 const ISD::InputArg &Arg = Ins[i]; 607 608 // First check if it's a PS input addr 609 if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && 610 !Arg.Flags.isByVal() && PSInputNum <= 15) { 611 612 if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { 613 // We can safely skip PS inputs 614 Skipped.set(i); 615 ++PSInputNum; 616 continue; 617 } 618 619 Info->markPSInputAllocated(PSInputNum); 620 if (Arg.Used) 621 Info->PSInputEna |= 1 << PSInputNum; 622 623 ++PSInputNum; 624 } 625 626 // Second split vertices into their elements 627 if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { 628 ISD::InputArg NewArg = Arg; 629 NewArg.Flags.setSplit(); 630 NewArg.VT = Arg.VT.getVectorElementType(); 631 632 // We REALLY want the ORIGINAL number of vertex elements here, e.g. a 633 // three or five element vertex only needs three or five registers, 634 // NOT four or eight. 635 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); 636 unsigned NumElements = ParamType->getVectorNumElements(); 637 638 for (unsigned j = 0; j != NumElements; ++j) { 639 Splits.push_back(NewArg); 640 NewArg.PartOffset += NewArg.VT.getStoreSize(); 641 } 642 643 } else if (Info->getShaderType() != ShaderType::COMPUTE) { 644 Splits.push_back(Arg); 645 } 646 } 647 648 SmallVector<CCValAssign, 16> ArgLocs; 649 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 650 *DAG.getContext()); 651 652 // At least one interpolation mode must be enabled or else the GPU will hang. 653 // 654 // Check PSInputAddr instead of PSInputEna. The idea is that if the user set 655 // PSInputAddr, the user wants to enable some bits after the compilation 656 // based on run-time states. 
Since we can't know what the final PSInputEna 657 // will look like, so we shouldn't do anything here and the user should take 658 // responsibility for the correct programming. 659 // 660 // Otherwise, the following restrictions apply: 661 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. 662 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be 663 // enabled too. 664 if (Info->getShaderType() == ShaderType::PIXEL && 665 ((Info->getPSInputAddr() & 0x7F) == 0 || 666 ((Info->getPSInputAddr() & 0xF) == 0 && 667 Info->isPSInputAllocated(11)))) { 668 CCInfo.AllocateReg(AMDGPU::VGPR0); 669 CCInfo.AllocateReg(AMDGPU::VGPR1); 670 Info->markPSInputAllocated(0); 671 Info->PSInputEna |= 1; 672 } 673 674 if (Info->getShaderType() == ShaderType::COMPUTE) { 675 getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, 676 Splits); 677 } 678 679 // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 680 if (Info->hasPrivateSegmentBuffer()) { 681 unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); 682 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); 683 CCInfo.AllocateReg(PrivateSegmentBufferReg); 684 } 685 686 if (Info->hasDispatchPtr()) { 687 unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); 688 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); 689 CCInfo.AllocateReg(DispatchPtrReg); 690 } 691 692 if (Info->hasKernargSegmentPtr()) { 693 unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); 694 MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); 695 CCInfo.AllocateReg(InputPtrReg); 696 } 697 698 AnalyzeFormalArguments(CCInfo, Splits); 699 700 SmallVector<SDValue, 16> Chains; 701 702 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { 703 704 const ISD::InputArg &Arg = Ins[i]; 705 if (Skipped[i]) { 706 InVals.push_back(DAG.getUNDEF(Arg.VT)); 707 continue; 708 } 709 710 CCValAssign &VA = ArgLocs[ArgIdx++]; 711 MVT VT = VA.getLocVT(); 712 713 if 
(VA.isMemLoc()) { 714 VT = Ins[i].VT; 715 EVT MemVT = Splits[i].VT; 716 const unsigned Offset = Subtarget->getExplicitKernelArgOffset() + 717 VA.getLocMemOffset(); 718 // The first 36 bytes of the input buffer contains information about 719 // thread group and global sizes. 720 SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain, 721 Offset, Ins[i].Flags.isSExt()); 722 Chains.push_back(Arg.getValue(1)); 723 724 auto *ParamTy = 725 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); 726 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && 727 ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { 728 // On SI local pointers are just offsets into LDS, so they are always 729 // less than 16-bits. On CI and newer they could potentially be 730 // real pointers, so we can't guarantee their size. 731 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, 732 DAG.getValueType(MVT::i16)); 733 } 734 735 InVals.push_back(Arg); 736 Info->ABIArgOffset = Offset + MemVT.getStoreSize(); 737 continue; 738 } 739 assert(VA.isRegLoc() && "Parameter must be in a register!"); 740 741 unsigned Reg = VA.getLocReg(); 742 743 if (VT == MVT::i64) { 744 // For now assume it is a pointer 745 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, 746 &AMDGPU::SReg_64RegClass); 747 Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); 748 SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); 749 InVals.push_back(Copy); 750 continue; 751 } 752 753 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); 754 755 Reg = MF.addLiveIn(Reg, RC); 756 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); 757 758 if (Arg.VT.isVector()) { 759 760 // Build a vector from the registers 761 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); 762 unsigned NumElements = ParamType->getVectorNumElements(); 763 764 SmallVector<SDValue, 4> Regs; 765 Regs.push_back(Val); 766 for (unsigned j = 1; j != NumElements; ++j) { 767 Reg = 
ArgLocs[ArgIdx++].getLocReg();
      Reg = MF.addLiveIn(Reg, RC);

      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      Regs.push_back(Copy);
    }

    // Fill up the missing vector elements
    NumElements = Arg.VT.getVectorNumElements() - NumElements;
    Regs.append(NumElements, DAG.getUNDEF(VT));

    InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
    continue;
  }

    InVals.push_back(Val);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.

  // Start adding system SGPRs. These are allocated in the fixed order
  // the checks below appear in; X is mandatory, Y/Z/info are optional.
  if (Info->hasWorkGroupIDX()) {
    unsigned Reg = Info->addWorkGroupIDX();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  } else
    llvm_unreachable("work group id x is always enabled");

  if (Info->hasWorkGroupIDY()) {
    unsigned Reg = Info->addWorkGroupIDY();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupIDZ()) {
    unsigned Reg = Info->addWorkGroupIDZ();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupInfo()) {
    unsigned Reg = Info->addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg
      = Info->addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  }

  // Now that we've figured out where the scratch register inputs are, see if
  // we should reserve the arguments and use them directly.

  bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();

  if (ST.isAmdHsaOS()) {
    // TODO: Assume we will spill without optimizations.
    if (HasStackObjects) {
      // If we have stack objects, we unquestionably need the private buffer
      // resource. For the HSA ABI, this will be the first 4 user SGPR
      // inputs. We can reserve those and use them directly.

      unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
      Info->setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
    } else {
      unsigned ReservedBufferReg
        = TRI->reservedPrivateSegmentBufferReg(MF);
      unsigned ReservedOffsetReg
        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);

      // We tentatively reserve the last registers (skipping the last two
      // which may contain VCC). After register allocation, we'll replace
      // these with the ones immediately after those which were really
      // allocated. In the prologue copies will be inserted from the argument
      // to these reserved registers.
      Info->setScratchRSrcReg(ReservedBufferReg);
      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  } else {
    unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);

    // Without HSA, relocations are used for the scratch pointer and the
    // buffer resource setup is always inserted in the prologue. Scratch wave
    // offset is still in an input SGPR.
    Info->setScratchRSrcReg(ReservedBufferReg);

    if (HasStackObjects) {
      unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
    } else {
      unsigned ReservedOffsetReg
        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  }

  // Work item IDs arrive in VGPRs (unlike the SGPR system values above).
  if (Info->hasWorkItemIDX()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  } else
    llvm_unreachable("workitem id x should always be enabled");

  if (Info->hasWorkItemIDY()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkItemIDZ()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Chains.empty())
    return Chain;

  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

/// \brief Lower outgoing return values. Compute shaders defer to the AMDGPU
/// base class; other shader types split vector outputs into scalar elements,
/// assign them to registers via AnalyzeReturn, and emit a RET_FLAG node.
SDValue SITargetLowering::LowerReturn(SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::OutputArg> &Outs,
                                      const SmallVectorImpl<SDValue> &OutVals,
                                      SDLoc DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (Info->getShaderType() == ShaderType::COMPUTE)
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);

  Info->setIfReturnsVoid(Outs.size() == 0);

  SmallVector<ISD::OutputArg, 48> Splits;
  SmallVector<SDValue, 48> SplitVals;

  // Split vectors into their elements.
  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
    const ISD::OutputArg &Out = Outs[i];

    if (Out.VT.isVector()) {
      MVT VT = Out.VT.getVectorElementType();
      ISD::OutputArg NewOut = Out;
      NewOut.Flags.setSplit();
      NewOut.VT = VT;

      // We want the original number of vector elements here, e.g.
      // three or five, not four or eight.
      unsigned NumElements = Out.ArgVT.getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
                                   DAG.getConstant(j, DL, MVT::i32));
        SplitVals.push_back(Elem);
        Splits.push_back(NewOut);
        NewOut.PartOffset += NewOut.VT.getStoreSize();
      }
    } else {
      SplitVals.push_back(OutVals[i]);
      Splits.push_back(Out);
    }
  }

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  AnalyzeReturn(CCInfo, Splits);

  SDValue Flag;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

  // Copy the result values into the output registers.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = SplitVals[realRVLocIdx];

    // Copied from other backends.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    }

    // Glue the copies together so they are emitted as one sequence.
    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps);
}

/// \brief Map a register name string to a physical register, reporting a
/// fatal error for unknown names, registers unavailable on the subtarget
/// (FLAT_SCR aliases on SI), or a requested type whose width does not match
/// the register (32-bit halves vs. 64-bit pairs).
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                             SelectionDAG &DAG) const {
  unsigned Reg = StringSwitch<unsigned>(RegName)
    .Case("m0", AMDGPU::M0)
    .Case("exec", AMDGPU::EXEC)
    .Case("exec_lo", AMDGPU::EXEC_LO)
    .Case("exec_hi", AMDGPU::EXEC_HI)
    .Case("flat_scratch", AMDGPU::FLAT_SCR)
    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    .Default(AMDGPU::NoRegister);

  if (Reg == AMDGPU::NoRegister) {
    report_fatal_error(Twine("invalid register name \""
                             + StringRef(RegName) + "\"."));

  }

  // SI has no dedicated flat-scratch registers; reject names overlapping it.
  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    report_fatal_error(Twine("invalid register \""
                             + StringRef(RegName) + "\" for subtarget."));
  }

  switch (Reg) {
  case AMDGPU::M0:
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
    if (VT.getSizeInBits() == 32)
      return Reg;
    break;
  case AMDGPU::EXEC:
  case AMDGPU::FLAT_SCR:
    if (VT.getSizeInBits() == 64)
      return Reg;
    break;
  default:
    llvm_unreachable("missing register type checking");
  }

  report_fatal_error(Twine("invalid type for register \""
                           + StringRef(RegName)
                           + "\"."));
}

MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {

  switch (MI->getOpcode()) {
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::BRANCH:
    return BB;
  }
  // Unreachable: both switch cases above return.
  return BB;
}

bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  // This currently forces unfolding various combinations of fsub into fma with
  // free fneg'd operands. As long as we have fast FMA (controlled by
  // isFMAFasterThanFMulAndFAdd), we should perform these.

  // When fma is quarter rate, for f64 where add / sub are at best half rate,
  // most of these combines appear to be cycle neutral but save on instruction
  // count / code size.
  return true;
}

// Scalar compares produce i1; vector compares produce a vector of i1 with one
// element per input lane.
EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                                         EVT VT) const {
  if (!VT.isVector()) {
    return MVT::i1;
  }
  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
}

MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const {
  return MVT::i32;
}

// Answering this is somewhat tricky and depends on the specific device which
// have different rates for fma or all f64 operations.
//
// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
// regardless of which device (although the number of cycles differs between
// devices), so it is always profitable for f64.
//
// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
// only on full rate devices. Normally, we should prefer selecting v_mad_f32
// which we can always do even without fused FP ops since it returns the same
// result as the separate operations and since it is always full
// rate. Therefore, we lie and report that it is not faster for f32.
// v_mad_f32 however does not support denormals, so we do report fma as faster
// if we have a fast fma device and require denormals.
//
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
    // This is as fast on some subtargets. However, we always have full rate f32
    // mad available which returns the same result as the separate operations
    // which we should prefer over fma. We can't use this if we want to support
    // denormals, so only report this in these cases.
    return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }

  case ISD::FSIN:
  case ISD::FCOS:
    return LowerTrig(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::FDIV: return LowerFDIV(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::GlobalAddress: {
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    return LowerGlobalAddress(MFI, Op, DAG);
  }
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
  }
  return SDValue();
}

/// \brief Helper function for LowerBRCOND: find the first user of \p Value
/// with the given \p Opcode, or nullptr if there is none.
static SDNode *findUser(SDValue Value, unsigned Opcode) {

  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {

    // Skip users of other result values of the same node.
    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
      return *I;
  }
  return nullptr;
}

SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {

  SDLoc SL(Op);
  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
  unsigned FrameIndex = FINode->getIndex();

  // A FrameIndex node represents a 32-bit offset into scratch memory. If
  // the high bit of a frame index offset were to be set, this would mean
  // that it represented an offset of ~2GB * 64 = ~128GB from the start of the
  // scratch buffer, with 64 being the number of threads per wave.
  //
  // If we know the machine uses less than 128GB of scratch, then we can
  // mark the high bit of the FrameIndex node as known zero,
  // which is important, because it means in most situations we can
  // prove that values derived from FrameIndex nodes are non-negative.
  // This enables us to take advantage of more addressing modes when
  // accessing scratch buffers, since for scratch reads/writes, the register
  // offset must always be positive.
1177 1178 SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); 1179 if (Subtarget->enableHugeScratchBuffer()) 1180 return TFI; 1181 1182 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI, 1183 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31))); 1184 } 1185 1186 /// This transforms the control flow intrinsics to get the branch destination as 1187 /// last parameter, also switches branch target with BR if the need arise 1188 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, 1189 SelectionDAG &DAG) const { 1190 1191 SDLoc DL(BRCOND); 1192 1193 SDNode *Intr = BRCOND.getOperand(1).getNode(); 1194 SDValue Target = BRCOND.getOperand(2); 1195 SDNode *BR = nullptr; 1196 1197 if (Intr->getOpcode() == ISD::SETCC) { 1198 // As long as we negate the condition everything is fine 1199 SDNode *SetCC = Intr; 1200 assert(SetCC->getConstantOperandVal(1) == 1); 1201 assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == 1202 ISD::SETNE); 1203 Intr = SetCC->getOperand(0).getNode(); 1204 1205 } else { 1206 // Get the target from BR if we don't negate the condition 1207 BR = findUser(BRCOND, ISD::BR); 1208 Target = BR->getOperand(1); 1209 } 1210 1211 assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); 1212 1213 // Build the result and 1214 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); 1215 1216 // operands of the new intrinsic call 1217 SmallVector<SDValue, 4> Ops; 1218 Ops.push_back(BRCOND.getOperand(0)); 1219 Ops.append(Intr->op_begin() + 1, Intr->op_end()); 1220 Ops.push_back(Target); 1221 1222 // build the new intrinsic call 1223 SDNode *Result = DAG.getNode( 1224 Res.size() > 1 ? 
ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, 1225 DAG.getVTList(Res), Ops).getNode(); 1226 1227 if (BR) { 1228 // Give the branch instruction our target 1229 SDValue Ops[] = { 1230 BR->getOperand(0), 1231 BRCOND.getOperand(2) 1232 }; 1233 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); 1234 DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); 1235 BR = NewBR.getNode(); 1236 } 1237 1238 SDValue Chain = SDValue(Result, Result->getNumValues() - 1); 1239 1240 // Copy the intrinsic results to registers 1241 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { 1242 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); 1243 if (!CopyToReg) 1244 continue; 1245 1246 Chain = DAG.getCopyToReg( 1247 Chain, DL, 1248 CopyToReg->getOperand(1), 1249 SDValue(Result, i - 1), 1250 SDValue()); 1251 1252 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); 1253 } 1254 1255 // Remove the old intrinsic from the chain 1256 DAG.ReplaceAllUsesOfValueWith( 1257 SDValue(Intr, Intr->getNumValues() - 1), 1258 Intr->getOperand(0)); 1259 1260 return Chain; 1261 } 1262 1263 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, 1264 SDValue Op, 1265 SelectionDAG &DAG) const { 1266 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); 1267 1268 if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) 1269 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); 1270 1271 SDLoc DL(GSD); 1272 const GlobalValue *GV = GSD->getGlobal(); 1273 MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); 1274 1275 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); 1276 return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA); 1277 } 1278 1279 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, 1280 SDValue V) const { 1281 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, 1282 // so we will end up with redundant moves to m0. 
1283 // 1284 // We can't use S_MOV_B32, because there is no way to specify m0 as the 1285 // destination register. 1286 // 1287 // We have to use them both. Machine cse will combine all the S_MOV_B32 1288 // instructions and the register coalescer eliminate the extra copies. 1289 SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V); 1290 return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32), 1291 SDValue(M0, 0), SDValue()); // Glue 1292 // A Null SDValue creates 1293 // a glue result. 1294 } 1295 1296 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, 1297 SDValue Op, 1298 MVT VT, 1299 unsigned Offset) const { 1300 SDLoc SL(Op); 1301 SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, 1302 DAG.getEntryNode(), Offset, false); 1303 // The local size values will have the hi 16-bits as zero. 1304 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, 1305 DAG.getValueType(VT)); 1306 } 1307 1308 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 1309 SelectionDAG &DAG) const { 1310 MachineFunction &MF = DAG.getMachineFunction(); 1311 auto MFI = MF.getInfo<SIMachineFunctionInfo>(); 1312 const SIRegisterInfo *TRI = 1313 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); 1314 1315 EVT VT = Op.getValueType(); 1316 SDLoc DL(Op); 1317 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 1318 1319 // TODO: Should this propagate fast-math-flags? 
1320 1321 switch (IntrinsicID) { 1322 case Intrinsic::amdgcn_dispatch_ptr: 1323 if (!Subtarget->isAmdHsaOS()) { 1324 DiagnosticInfoUnsupported BadIntrin( 1325 *MF.getFunction(), "unsupported hsa intrinsic without hsa target", 1326 DL); 1327 DAG.getContext()->diagnose(BadIntrin); 1328 return DAG.getUNDEF(VT); 1329 } 1330 1331 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, 1332 TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT); 1333 case Intrinsic::amdgcn_rcp: 1334 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); 1335 case Intrinsic::amdgcn_rsq: 1336 case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name 1337 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 1338 case Intrinsic::amdgcn_rsq_clamped: 1339 case AMDGPUIntrinsic::AMDGPU_rsq_clamped: { // Legacy name 1340 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 1341 return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); 1342 1343 Type *Type = VT.getTypeForEVT(*DAG.getContext()); 1344 APFloat Max = APFloat::getLargest(Type->getFltSemantics()); 1345 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); 1346 1347 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 1348 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, 1349 DAG.getConstantFP(Max, DL, VT)); 1350 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, 1351 DAG.getConstantFP(Min, DL, VT)); 1352 } 1353 case Intrinsic::r600_read_ngroups_x: 1354 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1355 SI::KernelInputOffsets::NGROUPS_X, false); 1356 case Intrinsic::r600_read_ngroups_y: 1357 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1358 SI::KernelInputOffsets::NGROUPS_Y, false); 1359 case Intrinsic::r600_read_ngroups_z: 1360 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1361 SI::KernelInputOffsets::NGROUPS_Z, false); 1362 case Intrinsic::r600_read_global_size_x: 1363 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 
1364 SI::KernelInputOffsets::GLOBAL_SIZE_X, false); 1365 case Intrinsic::r600_read_global_size_y: 1366 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1367 SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); 1368 case Intrinsic::r600_read_global_size_z: 1369 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 1370 SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); 1371 case Intrinsic::r600_read_local_size_x: 1372 return lowerImplicitZextParam(DAG, Op, MVT::i16, 1373 SI::KernelInputOffsets::LOCAL_SIZE_X); 1374 case Intrinsic::r600_read_local_size_y: 1375 return lowerImplicitZextParam(DAG, Op, MVT::i16, 1376 SI::KernelInputOffsets::LOCAL_SIZE_Y); 1377 case Intrinsic::r600_read_local_size_z: 1378 return lowerImplicitZextParam(DAG, Op, MVT::i16, 1379 SI::KernelInputOffsets::LOCAL_SIZE_Z); 1380 case Intrinsic::amdgcn_read_workdim: 1381 case AMDGPUIntrinsic::AMDGPU_read_workdim: // Legacy name. 1382 // Really only 2 bits. 1383 return lowerImplicitZextParam(DAG, Op, MVT::i8, 1384 getImplicitParameterOffset(MFI, GRID_DIM)); 1385 case Intrinsic::r600_read_tgid_x: 1386 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 1387 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); 1388 case Intrinsic::r600_read_tgid_y: 1389 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 1390 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); 1391 case Intrinsic::r600_read_tgid_z: 1392 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 1393 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); 1394 case Intrinsic::r600_read_tidig_x: 1395 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 1396 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); 1397 case Intrinsic::r600_read_tidig_y: 1398 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 1399 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); 1400 case Intrinsic::r600_read_tidig_z: 1401 return CreateLiveInRegister(DAG, 
      &AMDGPU::VGPR_32RegClass,
      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
  case AMDGPUIntrinsic::SI_load_const: {
    SDValue Ops[] = {
      Op.getOperand(1),
      Op.getOperand(2)
    };

    // Model as an invariant 4-byte-aligned load so it can be CSE'd/hoisted.
    MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
      VT.getStoreSize(), 4);
    return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
                                   Op->getVTList(), Ops, VT, MMO);
  }
  case AMDGPUIntrinsic::SI_vs_load_input:
    return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
                       Op.getOperand(1),
                       Op.getOperand(2),
                       Op.getOperand(3));

  case AMDGPUIntrinsic::SI_fs_constant: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
    SDValue Glue = M0.getValue(1);
    return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
                       DAG.getConstant(2, DL, MVT::i32), // P0
                       Op.getOperand(1), Op.getOperand(2), Glue);
  }
  case AMDGPUIntrinsic::SI_packf16:
    if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
      return DAG.getUNDEF(MVT::i32);
    return Op;
  case AMDGPUIntrinsic::SI_fs_interp: {
    // Split the <2 x i32> barycentric coordinates into I and J.
    SDValue IJ = Op.getOperand(4);
    SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
                            DAG.getConstant(0, DL, MVT::i32));
    SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
                            DAG.getConstant(1, DL, MVT::i32));
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
    SDValue Glue = M0.getValue(1);
    SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL,
                             DAG.getVTList(MVT::f32, MVT::Glue),
                             I, Op.getOperand(1), Op.getOperand(2), Glue);
    Glue = SDValue(P1.getNode(), 1);
    return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
                       Op.getOperand(1), Op.getOperand(2), Glue);
  }
  case Intrinsic::amdgcn_interp_p1: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    SDValue Glue = M0.getValue(1);
    return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Glue);
  }
  case Intrinsic::amdgcn_interp_p2: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
    SDValue Glue = SDValue(M0.getNode(), 1);
    return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
                       Glue);
  }
  case Intrinsic::amdgcn_ldexp:
    return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_class:
    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_div_fmas:
    return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(4));

  case Intrinsic::amdgcn_div_fixup:
    return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

  case Intrinsic::amdgcn_trig_preop:
    return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_div_scale: {
    // 3rd parameter required to be a constant.
    const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    if (!Param)
      return DAG.getUNDEF(VT);

    // Translate to the operands expected by the machine instruction. The
    // first parameter must be the same as the first instruction.
    SDValue Numerator = Op.getOperand(1);
    SDValue Denominator = Op.getOperand(2);

    // Note this order is opposite of the machine instruction's operations,
    // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
    // intrinsic has the numerator as the first operand to match a normal
    // division operation.

    SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;

    return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
                       Denominator, Numerator);
  }
  case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0:
    return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1));
  case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1:
    return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1));
  case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2:
    return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1));
  case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3:
    return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1));
  default:
    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  }
}

SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                              SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

  switch (IntrinsicID) {
  case AMDGPUIntrinsic::SI_sendmsg: {
    Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
    SDValue Glue = Chain.getValue(1);
    return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain,
                       Op.getOperand(2), Glue);
  }
  case AMDGPUIntrinsic::SI_tbuffer_store: {
    SDValue Ops[] = {
      Chain,
      Op.getOperand(2),
      Op.getOperand(3),
      Op.getOperand(4),
      Op.getOperand(5),
      Op.getOperand(6),
      Op.getOperand(7),
      Op.getOperand(8),
      Op.getOperand(9),
      Op.getOperand(10),
      Op.getOperand(11),
      Op.getOperand(12),
      Op.getOperand(13),
      Op.getOperand(14)
    };

    EVT VT = Op.getOperand(3).getValueType();

    MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOStore,
      VT.getStoreSize(), 4);
    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
                                   Op->getVTList(),
                                   Ops, VT, MMO);
  }
  default:
    return SDValue();
  }
}

SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);

  if (Op.getValueType().isVector()) {
    assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
           "Custom lowering for non-i32 vectors hasn't been implemented.");
    unsigned NumElements = Op.getValueType().getVectorNumElements();
    assert(NumElements != 2 && "v2 loads are supported for all address spaces.");

    switch (Load->getAddressSpace()) {
    default: break;
    case AMDGPUAS::CONSTANT_ADDRESS:
      if (isMemOpUniform(Load))
        break;
      // Non-uniform loads will be selected to MUBUF instructions, so they
      // have the same legalization requirements as global and private
      // loads.
      //
      // Fall-through
    case AMDGPUAS::GLOBAL_ADDRESS:
    case AMDGPUAS::PRIVATE_ADDRESS:
      if (NumElements >= 8)
        return SplitVectorLoad(Op, DAG);

      // v4 loads are supported for private and global memory.
      if (NumElements <= 4)
        break;
      // fall-through
    case AMDGPUAS::LOCAL_ADDRESS:
      // If properly aligned, if we split we might be able to use ds_read_b64.
      return SplitVectorLoad(Op, DAG);
    }
  }

  return AMDGPUTargetLowering::LowerLOAD(Op, DAG);
}

// Lower a 64-bit select by splitting it into two 32-bit selects on the low
// and high halves; other types are left to the default expansion.
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() != MVT::i64)
    return SDValue();

  SDLoc DL(Op);
  SDValue Cond = Op.getOperand(0);

  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
  SDValue One = DAG.getConstant(1, DL, MVT::i32);

  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));

  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);

  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);

  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);

  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);

  SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi);
  return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
}

// Catch division cases where we can use shortcuts with rcp and rsq
// instructions.
SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT VT = Op.getValueType();
  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) &&
        CLHS->isExactlyValue(1.0)) {
      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
      // the CI documentation has a worst case error of 1 ulp.
      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
      // use it as long as we aren't trying to use denormals.

      // 1.0 / sqrt(x) -> rsq(x)
      //
      // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
      // error seems really high at 2^29 ULP.
      if (RHS.getOpcode() == ISD::FSQRT)
        return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));

      // 1.0 / x -> rcp(x)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    }
  }

  if (Unsafe) {
    // Turn into multiply by the reciprocal.
    // x / y -> x * (1.0 / y)
    SDNodeFlags Flags;
    Flags.setUnsafeAlgebra(true);
    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
  }

  // No shortcut applies; caller falls back to the full expansion.
  return SDValue();
}

SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  SDValue FastLowered = LowerFastFDIV(Op, DAG);
  if (FastLowered.getNode())
    return FastLowered;

  // This uses v_rcp_f32 which does not handle denormals. Let this hit a
  // selection error for now rather than do something incorrect.
  if (Subtarget->hasFP32Denormals())
    return SDValue();

  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  // Scale the denominator into rcp's accurate range when it is very large
  // (|RHS| > K0 selects scale factor K1, otherwise 1.0), take the
  // reciprocal, then undo the scaling on the result.
  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);

  const APFloat K0Val(BitsToFloat(0x6f800000));
  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);

  const APFloat K1Val(BitsToFloat(0x2f800000));
  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

  EVT SetCCVT =
    getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);

  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);

  // TODO: Should this propagate fast-math-flags?

  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);

  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);

  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);

  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
}

// f64 division via div_scale/div_fmas/div_fixup with Newton-Raphson
// refinement of the reciprocal. The statement order below is the refinement
// sequence and must not be reordered.
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getTarget().Options.UnsafeFPMath)
    return LowerFastFDIV(Op, DAG);

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);

  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);

  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);

  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1,
                             One);

  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);

  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);

  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
                             NegDivScale0, Mul, DivScale1);

  SDValue Scale;

  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);

    // Figure out if the scale to use for div_fmas.
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

    SDValue Scale0Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    SDValue Scale1Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

    // Recompute the condition by comparing the high words of the operands
    // against the div_scale results.
    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getValue(1);
  }

  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
                             Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}

SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);

  if
(VT == MVT::f64) 1781 return LowerFDIV64(Op, DAG); 1782 1783 llvm_unreachable("Unexpected type for fdiv"); 1784 } 1785 1786 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 1787 SDLoc DL(Op); 1788 StoreSDNode *Store = cast<StoreSDNode>(Op); 1789 EVT VT = Store->getMemoryVT(); 1790 1791 // These stores are legal. 1792 if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { 1793 if (VT.isVector() && VT.getVectorNumElements() > 4) 1794 return ScalarizeVectorStore(Op, DAG); 1795 return SDValue(); 1796 } 1797 1798 SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); 1799 if (Ret.getNode()) 1800 return Ret; 1801 1802 if (VT.isVector() && VT.getVectorNumElements() >= 8) 1803 return SplitVectorStore(Op, DAG); 1804 1805 if (VT == MVT::i1) 1806 return DAG.getTruncStore(Store->getChain(), DL, 1807 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), 1808 Store->getBasePtr(), MVT::i1, Store->getMemOperand()); 1809 1810 return SDValue(); 1811 } 1812 1813 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 1814 SDLoc DL(Op); 1815 EVT VT = Op.getValueType(); 1816 SDValue Arg = Op.getOperand(0); 1817 // TODO: Should this propagate fast-math-flags? 
1818 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, 1819 DAG.getNode(ISD::FMUL, DL, VT, Arg, 1820 DAG.getConstantFP(0.5/M_PI, DL, 1821 VT))); 1822 1823 switch (Op.getOpcode()) { 1824 case ISD::FCOS: 1825 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); 1826 case ISD::FSIN: 1827 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); 1828 default: 1829 llvm_unreachable("Wrong trig opcode"); 1830 } 1831 } 1832 1833 //===----------------------------------------------------------------------===// 1834 // Custom DAG optimizations 1835 //===----------------------------------------------------------------------===// 1836 1837 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, 1838 DAGCombinerInfo &DCI) const { 1839 EVT VT = N->getValueType(0); 1840 EVT ScalarVT = VT.getScalarType(); 1841 if (ScalarVT != MVT::f32) 1842 return SDValue(); 1843 1844 SelectionDAG &DAG = DCI.DAG; 1845 SDLoc DL(N); 1846 1847 SDValue Src = N->getOperand(0); 1848 EVT SrcVT = Src.getValueType(); 1849 1850 // TODO: We could try to match extracting the higher bytes, which would be 1851 // easier if i8 vectors weren't promoted to i32 vectors, particularly after 1852 // types are legalized. v4i8 -> v4f32 is probably the only case to worry 1853 // about in practice. 1854 if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { 1855 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { 1856 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); 1857 DCI.AddToWorklist(Cvt.getNode()); 1858 return Cvt; 1859 } 1860 } 1861 1862 // We are primarily trying to catch operations on illegal vector types 1863 // before they are expanded. 1864 // For scalars, we can use the more flexible method of checking masked bits 1865 // after legalization. 
1866 if (!DCI.isBeforeLegalize() || 1867 !SrcVT.isVector() || 1868 SrcVT.getVectorElementType() != MVT::i8) { 1869 return SDValue(); 1870 } 1871 1872 assert(DCI.isBeforeLegalize() && "Unexpected legal type"); 1873 1874 // Weird sized vectors are a pain to handle, but we know 3 is really the same 1875 // size as 4. 1876 unsigned NElts = SrcVT.getVectorNumElements(); 1877 if (!SrcVT.isSimple() && NElts != 3) 1878 return SDValue(); 1879 1880 // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to 1881 // prevent a mess from expanding to v4i32 and repacking. 1882 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { 1883 EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); 1884 EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); 1885 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); 1886 LoadSDNode *Load = cast<LoadSDNode>(Src); 1887 1888 unsigned AS = Load->getAddressSpace(); 1889 unsigned Align = Load->getAlignment(); 1890 Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); 1891 unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); 1892 1893 // Don't try to replace the load if we have to expand it due to alignment 1894 // problems. Otherwise we will end up scalarizing the load, and trying to 1895 // repack into the vector for no real reason. 1896 if (Align < ABIAlignment && 1897 !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { 1898 return SDValue(); 1899 } 1900 1901 SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, 1902 Load->getChain(), 1903 Load->getBasePtr(), 1904 LoadVT, 1905 Load->getMemOperand()); 1906 1907 // Make sure successors of the original load stay after it by updating 1908 // them to use the new Chain. 
1909 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); 1910 1911 SmallVector<SDValue, 4> Elts; 1912 if (RegVT.isVector()) 1913 DAG.ExtractVectorElements(NewLoad, Elts); 1914 else 1915 Elts.push_back(NewLoad); 1916 1917 SmallVector<SDValue, 4> Ops; 1918 1919 unsigned EltIdx = 0; 1920 for (SDValue Elt : Elts) { 1921 unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); 1922 for (unsigned I = 0; I < ComponentsInElt; ++I) { 1923 unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; 1924 SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); 1925 DCI.AddToWorklist(Cvt.getNode()); 1926 Ops.push_back(Cvt); 1927 } 1928 1929 ++EltIdx; 1930 } 1931 1932 assert(Ops.size() == NElts); 1933 1934 return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); 1935 } 1936 1937 return SDValue(); 1938 } 1939 1940 /// \brief Return true if the given offset Size in bytes can be folded into 1941 /// the immediate offsets of a memory instruction for the given address space. 1942 static bool canFoldOffset(unsigned OffsetSize, unsigned AS, 1943 const AMDGPUSubtarget &STI) { 1944 switch (AS) { 1945 case AMDGPUAS::GLOBAL_ADDRESS: { 1946 // MUBUF instructions a 12-bit offset in bytes. 1947 return isUInt<12>(OffsetSize); 1948 } 1949 case AMDGPUAS::CONSTANT_ADDRESS: { 1950 // SMRD instructions have an 8-bit offset in dwords on SI and 1951 // a 20-bit offset in bytes on VI. 1952 if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 1953 return isUInt<20>(OffsetSize); 1954 else 1955 return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); 1956 } 1957 case AMDGPUAS::LOCAL_ADDRESS: 1958 case AMDGPUAS::REGION_ADDRESS: { 1959 // The single offset versions have a 16-bit offset in bytes. 1960 return isUInt<16>(OffsetSize); 1961 } 1962 case AMDGPUAS::PRIVATE_ADDRESS: 1963 // Indirect register addressing does not use any offsets. 
1964 default: 1965 return 0; 1966 } 1967 } 1968 1969 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) 1970 1971 // This is a variant of 1972 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), 1973 // 1974 // The normal DAG combiner will do this, but only if the add has one use since 1975 // that would increase the number of instructions. 1976 // 1977 // This prevents us from seeing a constant offset that can be folded into a 1978 // memory instruction's addressing mode. If we know the resulting add offset of 1979 // a pointer can be folded into an addressing offset, we can replace the pointer 1980 // operand with the add of new constant offset. This eliminates one of the uses, 1981 // and may allow the remaining use to also be simplified. 1982 // 1983 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, 1984 unsigned AddrSpace, 1985 DAGCombinerInfo &DCI) const { 1986 SDValue N0 = N->getOperand(0); 1987 SDValue N1 = N->getOperand(1); 1988 1989 if (N0.getOpcode() != ISD::ADD) 1990 return SDValue(); 1991 1992 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1); 1993 if (!CN1) 1994 return SDValue(); 1995 1996 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 1997 if (!CAdd) 1998 return SDValue(); 1999 2000 // If the resulting offset is too large, we can't fold it into the addressing 2001 // mode offset. 
2002 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); 2003 if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) 2004 return SDValue(); 2005 2006 SelectionDAG &DAG = DCI.DAG; 2007 SDLoc SL(N); 2008 EVT VT = N->getValueType(0); 2009 2010 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); 2011 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); 2012 2013 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); 2014 } 2015 2016 SDValue SITargetLowering::performAndCombine(SDNode *N, 2017 DAGCombinerInfo &DCI) const { 2018 if (DCI.isBeforeLegalize()) 2019 return SDValue(); 2020 2021 if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI)) 2022 return Base; 2023 2024 SelectionDAG &DAG = DCI.DAG; 2025 2026 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> 2027 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) 2028 SDValue LHS = N->getOperand(0); 2029 SDValue RHS = N->getOperand(1); 2030 2031 if (LHS.getOpcode() == ISD::SETCC && 2032 RHS.getOpcode() == ISD::SETCC) { 2033 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); 2034 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); 2035 2036 SDValue X = LHS.getOperand(0); 2037 SDValue Y = RHS.getOperand(0); 2038 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) 2039 return SDValue(); 2040 2041 if (LCC == ISD::SETO) { 2042 if (X != LHS.getOperand(1)) 2043 return SDValue(); 2044 2045 if (RCC == ISD::SETUNE) { 2046 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); 2047 if (!C1 || !C1->isInfinity() || C1->isNegative()) 2048 return SDValue(); 2049 2050 const uint32_t Mask = SIInstrFlags::N_NORMAL | 2051 SIInstrFlags::N_SUBNORMAL | 2052 SIInstrFlags::N_ZERO | 2053 SIInstrFlags::P_ZERO | 2054 SIInstrFlags::P_SUBNORMAL | 2055 SIInstrFlags::P_NORMAL; 2056 2057 static_assert(((~(SIInstrFlags::S_NAN | 2058 SIInstrFlags::Q_NAN | 2059 SIInstrFlags::N_INFINITY | 2060 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, 
2061 "mask not equal"); 2062 2063 SDLoc DL(N); 2064 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 2065 X, DAG.getConstant(Mask, DL, MVT::i32)); 2066 } 2067 } 2068 } 2069 2070 return SDValue(); 2071 } 2072 2073 SDValue SITargetLowering::performOrCombine(SDNode *N, 2074 DAGCombinerInfo &DCI) const { 2075 SelectionDAG &DAG = DCI.DAG; 2076 SDValue LHS = N->getOperand(0); 2077 SDValue RHS = N->getOperand(1); 2078 2079 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) 2080 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && 2081 RHS.getOpcode() == AMDGPUISD::FP_CLASS) { 2082 SDValue Src = LHS.getOperand(0); 2083 if (Src != RHS.getOperand(0)) 2084 return SDValue(); 2085 2086 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); 2087 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); 2088 if (!CLHS || !CRHS) 2089 return SDValue(); 2090 2091 // Only 10 bits are used. 2092 static const uint32_t MaxMask = 0x3ff; 2093 2094 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; 2095 SDLoc DL(N); 2096 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 2097 Src, DAG.getConstant(NewMask, DL, MVT::i32)); 2098 } 2099 2100 return SDValue(); 2101 } 2102 2103 SDValue SITargetLowering::performClassCombine(SDNode *N, 2104 DAGCombinerInfo &DCI) const { 2105 SelectionDAG &DAG = DCI.DAG; 2106 SDValue Mask = N->getOperand(1); 2107 2108 // fp_class x, 0 -> false 2109 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { 2110 if (CMask->isNullValue()) 2111 return DAG.getConstant(0, SDLoc(N), MVT::i1); 2112 } 2113 2114 return SDValue(); 2115 } 2116 2117 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { 2118 switch (Opc) { 2119 case ISD::FMAXNUM: 2120 return AMDGPUISD::FMAX3; 2121 case ISD::SMAX: 2122 return AMDGPUISD::SMAX3; 2123 case ISD::UMAX: 2124 return AMDGPUISD::UMAX3; 2125 case ISD::FMINNUM: 2126 return AMDGPUISD::FMIN3; 2127 case ISD::SMIN: 2128 return AMDGPUISD::SMIN3; 2129 case 
ISD::UMIN: 2130 return AMDGPUISD::UMIN3; 2131 default: 2132 llvm_unreachable("Not a min/max opcode"); 2133 } 2134 } 2135 2136 SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, 2137 DAGCombinerInfo &DCI) const { 2138 SelectionDAG &DAG = DCI.DAG; 2139 2140 unsigned Opc = N->getOpcode(); 2141 SDValue Op0 = N->getOperand(0); 2142 SDValue Op1 = N->getOperand(1); 2143 2144 // Only do this if the inner op has one use since this will just increases 2145 // register pressure for no benefit. 2146 2147 // max(max(a, b), c) 2148 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { 2149 SDLoc DL(N); 2150 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 2151 DL, 2152 N->getValueType(0), 2153 Op0.getOperand(0), 2154 Op0.getOperand(1), 2155 Op1); 2156 } 2157 2158 // max(a, max(b, c)) 2159 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { 2160 SDLoc DL(N); 2161 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 2162 DL, 2163 N->getValueType(0), 2164 Op0, 2165 Op1.getOperand(0), 2166 Op1.getOperand(1)); 2167 } 2168 2169 return SDValue(); 2170 } 2171 2172 SDValue SITargetLowering::performSetCCCombine(SDNode *N, 2173 DAGCombinerInfo &DCI) const { 2174 SelectionDAG &DAG = DCI.DAG; 2175 SDLoc SL(N); 2176 2177 SDValue LHS = N->getOperand(0); 2178 SDValue RHS = N->getOperand(1); 2179 EVT VT = LHS.getValueType(); 2180 2181 if (VT != MVT::f32 && VT != MVT::f64) 2182 return SDValue(); 2183 2184 // Match isinf pattern 2185 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) 2186 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 2187 if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { 2188 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 2189 if (!CRHS) 2190 return SDValue(); 2191 2192 const APFloat &APF = CRHS->getValueAPF(); 2193 if (APF.isInfinity() && !APF.isNegative()) { 2194 unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; 2195 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), 2196 
DAG.getConstant(Mask, SL, MVT::i32)); 2197 } 2198 } 2199 2200 return SDValue(); 2201 } 2202 2203 SDValue SITargetLowering::PerformDAGCombine(SDNode *N, 2204 DAGCombinerInfo &DCI) const { 2205 SelectionDAG &DAG = DCI.DAG; 2206 SDLoc DL(N); 2207 2208 switch (N->getOpcode()) { 2209 default: 2210 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 2211 case ISD::SETCC: 2212 return performSetCCCombine(N, DCI); 2213 case ISD::FMAXNUM: // TODO: What about fmax_legacy? 2214 case ISD::FMINNUM: 2215 case ISD::SMAX: 2216 case ISD::SMIN: 2217 case ISD::UMAX: 2218 case ISD::UMIN: { 2219 if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && 2220 N->getValueType(0) != MVT::f64 && 2221 getTargetMachine().getOptLevel() > CodeGenOpt::None) 2222 return performMin3Max3Combine(N, DCI); 2223 break; 2224 } 2225 2226 case AMDGPUISD::CVT_F32_UBYTE0: 2227 case AMDGPUISD::CVT_F32_UBYTE1: 2228 case AMDGPUISD::CVT_F32_UBYTE2: 2229 case AMDGPUISD::CVT_F32_UBYTE3: { 2230 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; 2231 2232 SDValue Src = N->getOperand(0); 2233 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); 2234 2235 APInt KnownZero, KnownOne; 2236 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 2237 !DCI.isBeforeLegalizeOps()); 2238 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 2239 if (TLO.ShrinkDemandedConstant(Src, Demanded) || 2240 TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { 2241 DCI.CommitTargetLoweringOpt(TLO); 2242 } 2243 2244 break; 2245 } 2246 2247 case ISD::UINT_TO_FP: { 2248 return performUCharToFloatCombine(N, DCI); 2249 } 2250 case ISD::FADD: { 2251 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 2252 break; 2253 2254 EVT VT = N->getValueType(0); 2255 if (VT != MVT::f32) 2256 break; 2257 2258 // Only do this if we are not trying to support denormals. v_mad_f32 does 2259 // not support denormals ever. 
2260 if (Subtarget->hasFP32Denormals()) 2261 break; 2262 2263 SDValue LHS = N->getOperand(0); 2264 SDValue RHS = N->getOperand(1); 2265 2266 // These should really be instruction patterns, but writing patterns with 2267 // source modiifiers is a pain. 2268 2269 // fadd (fadd (a, a), b) -> mad 2.0, a, b 2270 if (LHS.getOpcode() == ISD::FADD) { 2271 SDValue A = LHS.getOperand(0); 2272 if (A == LHS.getOperand(1)) { 2273 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 2274 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); 2275 } 2276 } 2277 2278 // fadd (b, fadd (a, a)) -> mad 2.0, a, b 2279 if (RHS.getOpcode() == ISD::FADD) { 2280 SDValue A = RHS.getOperand(0); 2281 if (A == RHS.getOperand(1)) { 2282 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 2283 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); 2284 } 2285 } 2286 2287 return SDValue(); 2288 } 2289 case ISD::FSUB: { 2290 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 2291 break; 2292 2293 EVT VT = N->getValueType(0); 2294 2295 // Try to get the fneg to fold into the source modifier. This undoes generic 2296 // DAG combines and folds them into the mad. 2297 // 2298 // Only do this if we are not trying to support denormals. v_mad_f32 does 2299 // not support denormals ever. 
2300 if (VT == MVT::f32 && 2301 !Subtarget->hasFP32Denormals()) { 2302 SDValue LHS = N->getOperand(0); 2303 SDValue RHS = N->getOperand(1); 2304 if (LHS.getOpcode() == ISD::FADD) { 2305 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) 2306 2307 SDValue A = LHS.getOperand(0); 2308 if (A == LHS.getOperand(1)) { 2309 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 2310 SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); 2311 2312 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); 2313 } 2314 } 2315 2316 if (RHS.getOpcode() == ISD::FADD) { 2317 // (fsub c, (fadd a, a)) -> mad -2.0, a, c 2318 2319 SDValue A = RHS.getOperand(0); 2320 if (A == RHS.getOperand(1)) { 2321 const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32); 2322 return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); 2323 } 2324 } 2325 2326 return SDValue(); 2327 } 2328 2329 break; 2330 } 2331 case ISD::LOAD: 2332 case ISD::STORE: 2333 case ISD::ATOMIC_LOAD: 2334 case ISD::ATOMIC_STORE: 2335 case ISD::ATOMIC_CMP_SWAP: 2336 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: 2337 case ISD::ATOMIC_SWAP: 2338 case ISD::ATOMIC_LOAD_ADD: 2339 case ISD::ATOMIC_LOAD_SUB: 2340 case ISD::ATOMIC_LOAD_AND: 2341 case ISD::ATOMIC_LOAD_OR: 2342 case ISD::ATOMIC_LOAD_XOR: 2343 case ISD::ATOMIC_LOAD_NAND: 2344 case ISD::ATOMIC_LOAD_MIN: 2345 case ISD::ATOMIC_LOAD_MAX: 2346 case ISD::ATOMIC_LOAD_UMIN: 2347 case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. 2348 if (DCI.isBeforeLegalize()) 2349 break; 2350 2351 MemSDNode *MemNode = cast<MemSDNode>(N); 2352 SDValue Ptr = MemNode->getBasePtr(); 2353 2354 // TODO: We could also do this for multiplies. 2355 unsigned AS = MemNode->getAddressSpace(); 2356 if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { 2357 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); 2358 if (NewPtr) { 2359 SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end()); 2360 2361 NewOps[N->getOpcode() == ISD::STORE ? 
2 : 1] = NewPtr; 2362 return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); 2363 } 2364 } 2365 break; 2366 } 2367 case ISD::AND: 2368 return performAndCombine(N, DCI); 2369 case ISD::OR: 2370 return performOrCombine(N, DCI); 2371 case AMDGPUISD::FP_CLASS: 2372 return performClassCombine(N, DCI); 2373 } 2374 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 2375 } 2376 2377 /// \brief Analyze the possible immediate value Op 2378 /// 2379 /// Returns -1 if it isn't an immediate, 0 if it's and inline immediate 2380 /// and the immediate value if it's a literal immediate 2381 int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { 2382 2383 const SIInstrInfo *TII = 2384 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); 2385 2386 if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { 2387 if (TII->isInlineConstant(Node->getAPIntValue())) 2388 return 0; 2389 2390 uint64_t Val = Node->getZExtValue(); 2391 return isUInt<32>(Val) ? Val : -1; 2392 } 2393 2394 if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) { 2395 if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt())) 2396 return 0; 2397 2398 if (Node->getValueType(0) == MVT::f32) 2399 return FloatToBits(Node->getValueAPF().convertToFloat()); 2400 2401 return -1; 2402 } 2403 2404 return -1; 2405 } 2406 2407 /// \brief Helper function for adjustWritemask 2408 static unsigned SubIdx2Lane(unsigned Idx) { 2409 switch (Idx) { 2410 default: return 0; 2411 case AMDGPU::sub0: return 0; 2412 case AMDGPU::sub1: return 1; 2413 case AMDGPU::sub2: return 2; 2414 case AMDGPU::sub3: return 3; 2415 } 2416 } 2417 2418 /// \brief Adjust the writemask of MIMG instructions 2419 void SITargetLowering::adjustWritemask(MachineSDNode *&Node, 2420 SelectionDAG &DAG) const { 2421 SDNode *Users[4] = { }; 2422 unsigned Lane = 0; 2423 unsigned OldDmask = Node->getConstantOperandVal(0); 2424 unsigned NewDmask = 0; 2425 2426 // Try to figure out the used register components 2427 for 
(SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); 2428 I != E; ++I) { 2429 2430 // Abort if we can't understand the usage 2431 if (!I->isMachineOpcode() || 2432 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) 2433 return; 2434 2435 // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. 2436 // Note that subregs are packed, i.e. Lane==0 is the first bit set 2437 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit 2438 // set, etc. 2439 Lane = SubIdx2Lane(I->getConstantOperandVal(1)); 2440 2441 // Set which texture component corresponds to the lane. 2442 unsigned Comp; 2443 for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { 2444 assert(Dmask); 2445 Comp = countTrailingZeros(Dmask); 2446 Dmask &= ~(1 << Comp); 2447 } 2448 2449 // Abort if we have more than one user per component 2450 if (Users[Lane]) 2451 return; 2452 2453 Users[Lane] = *I; 2454 NewDmask |= 1 << Comp; 2455 } 2456 2457 // Abort if there's no change 2458 if (NewDmask == OldDmask) 2459 return; 2460 2461 // Adjust the writemask in the node 2462 std::vector<SDValue> Ops; 2463 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); 2464 Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end()); 2465 Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); 2466 2467 // If we only got one lane, replace it with a copy 2468 // (if NewDmask has only one bit set...) 
2469 if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { 2470 SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), 2471 MVT::i32); 2472 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 2473 SDLoc(), Users[Lane]->getValueType(0), 2474 SDValue(Node, 0), RC); 2475 DAG.ReplaceAllUsesWith(Users[Lane], Copy); 2476 return; 2477 } 2478 2479 // Update the users of the node with the new indices 2480 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { 2481 2482 SDNode *User = Users[i]; 2483 if (!User) 2484 continue; 2485 2486 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); 2487 DAG.UpdateNodeOperands(User, User->getOperand(0), Op); 2488 2489 switch (Idx) { 2490 default: break; 2491 case AMDGPU::sub0: Idx = AMDGPU::sub1; break; 2492 case AMDGPU::sub1: Idx = AMDGPU::sub2; break; 2493 case AMDGPU::sub2: Idx = AMDGPU::sub3; break; 2494 } 2495 } 2496 } 2497 2498 static bool isFrameIndexOp(SDValue Op) { 2499 if (Op.getOpcode() == ISD::AssertZext) 2500 Op = Op.getOperand(0); 2501 2502 return isa<FrameIndexSDNode>(Op); 2503 } 2504 2505 /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) 2506 /// with frame index operands. 2507 /// LLVM assumes that inputs are to these instructions are registers. 2508 void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, 2509 SelectionDAG &DAG) const { 2510 2511 SmallVector<SDValue, 8> Ops; 2512 for (unsigned i = 0; i < Node->getNumOperands(); ++i) { 2513 if (!isFrameIndexOp(Node->getOperand(i))) { 2514 Ops.push_back(Node->getOperand(i)); 2515 continue; 2516 } 2517 2518 SDLoc DL(Node); 2519 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, 2520 Node->getOperand(i).getValueType(), 2521 Node->getOperand(i)), 0)); 2522 } 2523 2524 DAG.UpdateNodeOperands(Node, Ops); 2525 } 2526 2527 /// \brief Fold the instructions after selecting them. 
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  // Shrink MIMG writemasks (and the destination register class) to the
  // components that are actually used.
  if (TII->isMIMG(Node->getMachineOpcode()))
    adjustWritemask(Node, DAG);

  // Target-independent nodes may still carry frame index operands; replace
  // them with registers.
  if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG ||
      Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) {
    legalizeTargetIndependentNode(Node, DAG);
    return Node;
  }
  return Node;
}

/// \brief Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

  if (TII->isVOP3(MI->getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);
    return;
  }

  if (TII->isMIMG(*MI)) {
    unsigned VReg = MI->getOperand(0).getReg();
    unsigned Writemask = MI->getOperand(1).getImm();
    // Count the enabled components in the 4-bit writemask.
    unsigned BitsSet = 0;
    for (unsigned i = 0; i < 4; ++i)
      BitsSet += Writemask & (1 << i) ? 1 : 0;

    // Pick the smallest VGPR tuple class that covers the written components;
    // the 4-component case needs no change, hence the default return.
    const TargetRegisterClass *RC;
    switch (BitsSet) {
    default: return;
    case 1: RC = &AMDGPU::VGPR_32RegClass; break;
    case 2: RC = &AMDGPU::VReg_64RegClass; break;
    case 3: RC = &AMDGPU::VReg_96RegClass; break;
    }

    unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
    MI->setDesc(TII->get(NewOpcode));
    MRI.setRegClass(VReg, RC);
    return;
  }

  // Replace unused atomics with the no return version.
  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode());
  if (NoRetAtomicOp != -1) {
    if (!Node->hasAnyUseOfValue(0)) {
      MI->setDesc(TII->get(NoRetAtomicOp));
      MI->RemoveOperand(0);
    }

    return;
  }
}

/// \brief Materialize a 32-bit immediate into an SGPR with S_MOV_B32.
static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) {
  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}

/// \brief Wrap a 64-bit pointer into a full 128-bit buffer resource
/// descriptor with the default data format in the upper half.
MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                SDLoc DL,
                                                SDValue Ptr) const {
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  const SDValue Ops0[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
    buildSMovImm32(DAG, DL, 0),
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
  };

  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::v2i32, Ops0), 0);

  // Combine the constants and the pointer.
  const SDValue Ops1[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    Ptr,
    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
    SubRegHi,
    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}

/// \brief Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
                                           SDLoc DL,
                                           SDValue Ptr,
                                           uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
  // Merge the extra dword1 bits (e.g. stride/Add-TID) into the pointer high
  // half if requested.
  if (RsrcDword1) {
    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
                                       DAG.getConstant(RsrcDword1, DL, MVT::i32)),
                    0);
  }

  SDValue DataLo = buildSMovImm32(DAG, DL,
                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);

  const SDValue Ops[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
    PtrLo,
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    PtrHi,
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
    DataLo,
    DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
    DataHi,
    DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}

/// \brief Create the live-in register via the base class, then return a fresh
/// CopyFromReg anchored at the entry node rather than the cached one.
SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                               const TargetRegisterClass *RC,
                                               unsigned Reg, EVT VT) const {
  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);

  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
                            cast<RegisterSDNode>(VReg)->getReg(), VT);
}

//===----------------------------------------------------------------------===//
// SI Inline Assembly Support
//===----------------------------------------------------------------------===//

std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                               StringRef Constraint,
                                               MVT VT) const {

  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    // 's'/'r': scalar (SGPR) register classes, sized by the value type.
    case 's':
    case 'r':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
        return std::make_pair(0U, &AMDGPU::SGPR_32RegClass);
      case 64:
        return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
      case 128:
        return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
      case 256:
        return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
      }

    // 'v': vector (VGPR) register classes.
    case 'v':
      switch (VT.getSizeInBits()) {
      default:
        return std::make_pair(0U, nullptr);
      case 32:
        return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
      case 64:
        return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
      case 96:
        return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
      case 128:
        return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
      case 256:
        return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
      case 512:
        return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
      }
    }
  }

  // Multi-character constraints: the second character selects the register
  // file ('v' or 's') and the trailing digits select a specific register
  // index within that 32-bit class.
  if (Constraint.size() > 1) {
    const TargetRegisterClass *RC = nullptr;
    if (Constraint[1] == 'v') {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (Constraint[1] == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    }

    if (RC) {
      uint32_t Idx;
      bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
      if (!Failed && Idx < RC->getNumRegs())
        return std::make_pair(RC->getRegister(Idx), RC);
    }
  }
  // Fall back to the target-independent handling.
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}

SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
  // 's' (SGPR) and 'v' (VGPR) are register-class constraints; everything else
  // is handled generically.
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 's':
    case 'v':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}