1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Custom DAG lowering for SI 12 // 13 //===----------------------------------------------------------------------===// 14 15 #ifdef _MSC_VER 16 // Provide M_PI. 17 #define _USE_MATH_DEFINES 18 #include <cmath> 19 #endif 20 21 #include "AMDGPU.h" 22 #include "AMDGPUIntrinsicInfo.h" 23 #include "AMDGPUSubtarget.h" 24 #include "SIDefines.h" 25 #include "SIISelLowering.h" 26 #include "SIInstrInfo.h" 27 #include "SIMachineFunctionInfo.h" 28 #include "SIRegisterInfo.h" 29 #include "llvm/ADT/BitVector.h" 30 #include "llvm/ADT/StringSwitch.h" 31 #include "llvm/CodeGen/CallingConvLower.h" 32 #include "llvm/CodeGen/MachineInstrBuilder.h" 33 #include "llvm/CodeGen/MachineRegisterInfo.h" 34 #include "llvm/CodeGen/SelectionDAG.h" 35 #include "llvm/CodeGen/Analysis.h" 36 #include "llvm/IR/DiagnosticInfo.h" 37 #include "llvm/IR/Function.h" 38 39 using namespace llvm; 40 41 static cl::opt<bool> EnableVGPRIndexMode( 42 "amdgpu-vgpr-index-mode", 43 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), 44 cl::init(false)); 45 46 47 static unsigned findFirstFreeSGPR(CCState &CCInfo) { 48 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); 49 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { 50 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) { 51 return AMDGPU::SGPR0 + Reg; 52 } 53 } 54 llvm_unreachable("Cannot allocate sgpr"); 55 } 56 57 SITargetLowering::SITargetLowering(const TargetMachine &TM, 58 const SISubtarget &STI) 59 : AMDGPUTargetLowering(TM, STI) { 60 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); 61 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); 62 63 addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass); 64 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); 65 66 addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); 67 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); 68 addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); 69 70 addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass); 71 addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass); 72 73 addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); 74 addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); 75 76 addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); 77 addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); 78 79 addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); 80 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); 81 82 if (Subtarget->has16BitInsts()) { 83 addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass); 84 addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass); 85 } 86 87 computeRegisterProperties(STI.getRegisterInfo()); 88 89 // We need to custom lower vector stores from local memory 90 setOperationAction(ISD::LOAD, MVT::v2i32, Custom); 91 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 92 setOperationAction(ISD::LOAD, MVT::v8i32, Custom); 93 setOperationAction(ISD::LOAD, MVT::v16i32, Custom); 94 setOperationAction(ISD::LOAD, MVT::i1, Custom); 95 96 setOperationAction(ISD::STORE, MVT::v2i32, Custom); 97 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 98 setOperationAction(ISD::STORE, MVT::v8i32, Custom); 99 setOperationAction(ISD::STORE, MVT::v16i32, Custom); 100 
setOperationAction(ISD::STORE, MVT::i1, Custom); 101 102 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); 103 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); 104 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); 105 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); 106 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand); 107 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand); 108 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand); 109 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand); 110 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); 111 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand); 112 113 114 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 115 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 116 setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); 117 118 setOperationAction(ISD::SELECT, MVT::i1, Promote); 119 setOperationAction(ISD::SELECT, MVT::i64, Custom); 120 setOperationAction(ISD::SELECT, MVT::f64, Promote); 121 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); 122 123 setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); 124 setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); 125 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); 126 setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); 127 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); 128 129 setOperationAction(ISD::SETCC, MVT::i1, Promote); 130 setOperationAction(ISD::SETCC, MVT::v2i1, Expand); 131 setOperationAction(ISD::SETCC, MVT::v4i1, Expand); 132 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32); 133 134 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); 135 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); 136 137 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); 138 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); 139 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); 140 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); 141 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); 142 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); 143 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); 144 145 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); 146 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); 147 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); 148 149 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 150 setOperationAction(ISD::BR_CC, MVT::i1, Expand); 151 setOperationAction(ISD::BR_CC, MVT::i32, Expand); 152 setOperationAction(ISD::BR_CC, MVT::i64, Expand); 153 setOperationAction(ISD::BR_CC, MVT::f32, Expand); 154 setOperationAction(ISD::BR_CC, MVT::f64, Expand); 155 156 // We only support LOAD/STORE and vector manipulation ops for vectors 157 // with > 4 elements. 
158 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { 159 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { 160 switch (Op) { 161 case ISD::LOAD: 162 case ISD::STORE: 163 case ISD::BUILD_VECTOR: 164 case ISD::BITCAST: 165 case ISD::EXTRACT_VECTOR_ELT: 166 case ISD::INSERT_VECTOR_ELT: 167 case ISD::INSERT_SUBVECTOR: 168 case ISD::EXTRACT_SUBVECTOR: 169 case ISD::SCALAR_TO_VECTOR: 170 break; 171 case ISD::CONCAT_VECTORS: 172 setOperationAction(Op, VT, Custom); 173 break; 174 default: 175 setOperationAction(Op, VT, Expand); 176 break; 177 } 178 } 179 } 180 181 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that 182 // is expanded to avoid having two separate loops in case the index is a VGPR. 183 184 // Most operations are naturally 32-bit vector operations. We only support 185 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. 186 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { 187 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); 188 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); 189 190 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); 191 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32); 192 193 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); 194 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32); 195 196 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); 197 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); 198 } 199 200 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); 201 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); 202 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); 203 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); 204 205 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, 206 // and output demarshalling 207 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); 208 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 209 210 // We can't return success/failure, only the old value, 211 // let LLVM add the comparison 212 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand); 213 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand); 214 215 if (getSubtarget()->hasFlatAddressSpace()) { 216 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); 217 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); 218 } 219 220 setOperationAction(ISD::BSWAP, MVT::i32, Legal); 221 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 222 223 // On SI this is s_memtime and s_memrealtime on VI. 
224 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); 225 setOperationAction(ISD::TRAP, MVT::Other, Custom); 226 227 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 228 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 229 230 if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) { 231 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 232 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 233 setOperationAction(ISD::FRINT, MVT::f64, Legal); 234 } 235 236 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 237 238 setOperationAction(ISD::FSIN, MVT::f32, Custom); 239 setOperationAction(ISD::FCOS, MVT::f32, Custom); 240 setOperationAction(ISD::FDIV, MVT::f32, Custom); 241 setOperationAction(ISD::FDIV, MVT::f64, Custom); 242 243 if (Subtarget->has16BitInsts()) { 244 setOperationAction(ISD::Constant, MVT::i16, Legal); 245 246 setOperationAction(ISD::SMIN, MVT::i16, Legal); 247 setOperationAction(ISD::SMAX, MVT::i16, Legal); 248 249 setOperationAction(ISD::UMIN, MVT::i16, Legal); 250 setOperationAction(ISD::UMAX, MVT::i16, Legal); 251 252 setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote); 253 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32); 254 255 setOperationAction(ISD::ROTR, MVT::i16, Promote); 256 setOperationAction(ISD::ROTL, MVT::i16, Promote); 257 258 setOperationAction(ISD::SDIV, MVT::i16, Promote); 259 setOperationAction(ISD::UDIV, MVT::i16, Promote); 260 setOperationAction(ISD::SREM, MVT::i16, Promote); 261 setOperationAction(ISD::UREM, MVT::i16, Promote); 262 263 setOperationAction(ISD::BSWAP, MVT::i16, Promote); 264 setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); 265 266 setOperationAction(ISD::CTTZ, MVT::i16, Promote); 267 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote); 268 setOperationAction(ISD::CTLZ, MVT::i16, Promote); 269 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote); 270 271 setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); 272 273 setOperationAction(ISD::BR_CC, MVT::i16, Expand); 274 275 setOperationAction(ISD::LOAD, MVT::i16, Custom); 276 277 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 278 279 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote); 280 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32); 281 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); 282 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); 283 284 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); 285 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); 286 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); 287 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); 288 289 // F16 - Constant Actions. 290 setOperationAction(ISD::ConstantFP, MVT::f16, Legal); 291 292 // F16 - Load/Store Actions. 293 setOperationAction(ISD::LOAD, MVT::f16, Promote); 294 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16); 295 setOperationAction(ISD::STORE, MVT::f16, Promote); 296 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16); 297 298 // F16 - VOP1 Actions. 299 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); 300 setOperationAction(ISD::FCOS, MVT::f16, Promote); 301 setOperationAction(ISD::FSIN, MVT::f16, Promote); 302 setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote); 303 setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote); 304 setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote); 305 setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote); 306 307 // F16 - VOP2 Actions. 
308 setOperationAction(ISD::BR_CC, MVT::f16, Expand); 309 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); 310 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); 311 setOperationAction(ISD::FMINNUM, MVT::f16, Legal); 312 setOperationAction(ISD::FDIV, MVT::f16, Custom); 313 314 // F16 - VOP3 Actions. 315 setOperationAction(ISD::FMA, MVT::f16, Legal); 316 if (!Subtarget->hasFP16Denormals()) 317 setOperationAction(ISD::FMAD, MVT::f16, Legal); 318 } 319 320 setTargetDAGCombine(ISD::FADD); 321 setTargetDAGCombine(ISD::FSUB); 322 setTargetDAGCombine(ISD::FMINNUM); 323 setTargetDAGCombine(ISD::FMAXNUM); 324 setTargetDAGCombine(ISD::SMIN); 325 setTargetDAGCombine(ISD::SMAX); 326 setTargetDAGCombine(ISD::UMIN); 327 setTargetDAGCombine(ISD::UMAX); 328 setTargetDAGCombine(ISD::SETCC); 329 setTargetDAGCombine(ISD::AND); 330 setTargetDAGCombine(ISD::OR); 331 setTargetDAGCombine(ISD::XOR); 332 setTargetDAGCombine(ISD::SINT_TO_FP); 333 setTargetDAGCombine(ISD::UINT_TO_FP); 334 setTargetDAGCombine(ISD::FCANONICALIZE); 335 336 // All memory operations. Some folding on the pointer operand is done to help 337 // matching the constant offsets in the addressing modes. 338 setTargetDAGCombine(ISD::LOAD); 339 setTargetDAGCombine(ISD::STORE); 340 setTargetDAGCombine(ISD::ATOMIC_LOAD); 341 setTargetDAGCombine(ISD::ATOMIC_STORE); 342 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); 343 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); 344 setTargetDAGCombine(ISD::ATOMIC_SWAP); 345 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); 346 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); 347 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); 348 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); 349 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); 350 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); 351 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); 352 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); 353 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); 354 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); 355 356 setSchedulingPreference(Sched::RegPressure); 357 } 358 359 const SISubtarget *SITargetLowering::getSubtarget() const { 360 return static_cast<const SISubtarget *>(Subtarget); 361 } 362 363 //===----------------------------------------------------------------------===// 364 // TargetLowering queries 365 //===----------------------------------------------------------------------===// 366 367 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 368 const CallInst &CI, 369 unsigned IntrID) const { 370 switch (IntrID) { 371 case Intrinsic::amdgcn_atomic_inc: 372 case Intrinsic::amdgcn_atomic_dec: 373 Info.opc = ISD::INTRINSIC_W_CHAIN; 374 Info.memVT = MVT::getVT(CI.getType()); 375 Info.ptrVal = CI.getOperand(0); 376 Info.align = 0; 377 Info.vol = false; 378 Info.readMem = true; 379 Info.writeMem = true; 380 return true; 381 default: 382 return false; 383 } 384 } 385 386 bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, 387 EVT) const { 388 // SI has some legal vector types, but no legal vector operations. Say no 389 // shuffles are legal in order to prefer scalarizing some vector operations. 390 return false; 391 } 392 393 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { 394 // Flat instructions do not have offsets, and only have the register 395 // address. 
396 return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); 397 } 398 399 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { 400 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and 401 // additionally can do r + r + i with addr64. 32-bit has more addressing 402 // mode options. Depending on the resource constant, it can also do 403 // (i64 r0) + (i32 r1) * (i14 i). 404 // 405 // Private arrays end up using a scratch buffer most of the time, so also 406 // assume those use MUBUF instructions. Scratch loads / stores are currently 407 // implemented as mubuf instructions with offen bit set, so slightly 408 // different than the normal addr64. 409 if (!isUInt<12>(AM.BaseOffs)) 410 return false; 411 412 // FIXME: Since we can split immediate into soffset and immediate offset, 413 // would it make sense to allow any immediate? 414 415 switch (AM.Scale) { 416 case 0: // r + i or just i, depending on HasBaseReg. 417 return true; 418 case 1: 419 return true; // We have r + r or r + i. 420 case 2: 421 if (AM.HasBaseReg) { 422 // Reject 2 * r + r. 423 return false; 424 } 425 426 // Allow 2 * r as r + r 427 // Or 2 * r + i is allowed as r + r + i. 428 return true; 429 default: // Don't allow n * r 430 return false; 431 } 432 } 433 434 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, 435 const AddrMode &AM, Type *Ty, 436 unsigned AS) const { 437 // No global is ever allowed as a base. 438 if (AM.BaseGV) 439 return false; 440 441 switch (AS) { 442 case AMDGPUAS::GLOBAL_ADDRESS: { 443 if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 444 // Assume the we will use FLAT for all global memory accesses 445 // on VI. 446 // FIXME: This assumption is currently wrong. On VI we still use 447 // MUBUF instructions for the r + i addressing mode. As currently 448 // implemented, the MUBUF instructions only work on buffer < 4GB. 449 // It may be possible to support > 4GB buffers with MUBUF instructions, 450 // by setting the stride value in the resource descriptor which would 451 // increase the size limit to (stride * 4GB). However, this is risky, 452 // because it has never been validated. 453 return isLegalFlatAddressingMode(AM); 454 } 455 456 return isLegalMUBUFAddressingMode(AM); 457 } 458 case AMDGPUAS::CONSTANT_ADDRESS: { 459 // If the offset isn't a multiple of 4, it probably isn't going to be 460 // correctly aligned. 461 // FIXME: Can we get the real alignment here? 462 if (AM.BaseOffs % 4 != 0) 463 return isLegalMUBUFAddressingMode(AM); 464 465 // There are no SMRD extloads, so if we have to do a small type access we 466 // will use a MUBUF load. 467 // FIXME?: We also need to do this if unaligned, but we don't know the 468 // alignment here. 469 if (DL.getTypeStoreSize(Ty) < 4) 470 return isLegalMUBUFAddressingMode(AM); 471 472 if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { 473 // SMRD instructions have an 8-bit, dword offset on SI. 474 if (!isUInt<8>(AM.BaseOffs / 4)) 475 return false; 476 } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) { 477 // On CI+, this can also be a 32-bit literal constant offset. If it fits 478 // in 8-bits, it can use a smaller encoding. 479 if (!isUInt<32>(AM.BaseOffs / 4)) 480 return false; 481 } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) { 482 // On VI, these use the SMEM format and the offset is 20-bit in bytes. 
483 if (!isUInt<20>(AM.BaseOffs)) 484 return false; 485 } else 486 llvm_unreachable("unhandled generation"); 487 488 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 489 return true; 490 491 if (AM.Scale == 1 && AM.HasBaseReg) 492 return true; 493 494 return false; 495 } 496 497 case AMDGPUAS::PRIVATE_ADDRESS: 498 return isLegalMUBUFAddressingMode(AM); 499 500 case AMDGPUAS::LOCAL_ADDRESS: 501 case AMDGPUAS::REGION_ADDRESS: { 502 // Basic, single offset DS instructions allow a 16-bit unsigned immediate 503 // field. 504 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have 505 // an 8-bit dword offset but we don't know the alignment here. 506 if (!isUInt<16>(AM.BaseOffs)) 507 return false; 508 509 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 510 return true; 511 512 if (AM.Scale == 1 && AM.HasBaseReg) 513 return true; 514 515 return false; 516 } 517 case AMDGPUAS::FLAT_ADDRESS: 518 case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: 519 // For an unknown address space, this usually means that this is for some 520 // reason being used for pure arithmetic, and not based on some addressing 521 // computation. We don't have instructions that compute pointers with any 522 // addressing modes, so treat them as having no offset like flat 523 // instructions. 524 return isLegalFlatAddressingMode(AM); 525 526 default: 527 llvm_unreachable("unhandled address space"); 528 } 529 } 530 531 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 532 unsigned AddrSpace, 533 unsigned Align, 534 bool *IsFast) const { 535 if (IsFast) 536 *IsFast = false; 537 538 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, 539 // which isn't a simple VT. 540 // Until MVT is extended to handle this, simply check for the size and 541 // rely on the condition below: allow accesses if the size is a multiple of 4. 542 if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 && 543 VT.getStoreSize() > 16)) { 544 return false; 545 } 546 547 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || 548 AddrSpace == AMDGPUAS::REGION_ADDRESS) { 549 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte 550 // aligned, 8 byte access in a single operation using ds_read2/write2_b32 551 // with adjacent offsets. 552 bool AlignedBy4 = (Align % 4 == 0); 553 if (IsFast) 554 *IsFast = AlignedBy4; 555 556 return AlignedBy4; 557 } 558 559 // FIXME: We have to be conservative here and assume that flat operations 560 // will access scratch. If we had access to the IR function, then we 561 // could determine if any private memory was used in the function. 562 if (!Subtarget->hasUnalignedScratchAccess() && 563 (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || 564 AddrSpace == AMDGPUAS::FLAT_ADDRESS)) { 565 return false; 566 } 567 568 if (Subtarget->hasUnalignedBufferAccess()) { 569 // If we have an uniform constant load, it still requires using a slow 570 // buffer instruction if unaligned. 571 if (IsFast) { 572 *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ? 573 (Align % 4 == 0) : true; 574 } 575 576 return true; 577 } 578 579 // Smaller than dword value must be aligned. 580 if (VT.bitsLT(MVT::i32)) 581 return false; 582 583 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the 584 // byte-address are ignored, thus forcing Dword alignment. 585 // This applies to private, global, and constant memory. 
586 if (IsFast) 587 *IsFast = true; 588 589 return VT.bitsGT(MVT::i32) && Align % 4 == 0; 590 } 591 592 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, 593 unsigned SrcAlign, bool IsMemset, 594 bool ZeroMemset, 595 bool MemcpyStrSrc, 596 MachineFunction &MF) const { 597 // FIXME: Should account for address space here. 598 599 // The default fallback uses the private pointer size as a guess for a type to 600 // use. Make sure we switch these to 64-bit accesses. 601 602 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global 603 return MVT::v4i32; 604 605 if (Size >= 8 && DstAlign >= 4) 606 return MVT::v2i32; 607 608 // Use the default. 609 return MVT::Other; 610 } 611 612 static bool isFlatGlobalAddrSpace(unsigned AS) { 613 return AS == AMDGPUAS::GLOBAL_ADDRESS || 614 AS == AMDGPUAS::FLAT_ADDRESS || 615 AS == AMDGPUAS::CONSTANT_ADDRESS; 616 } 617 618 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, 619 unsigned DestAS) const { 620 return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); 621 } 622 623 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { 624 const MemSDNode *MemNode = cast<MemSDNode>(N); 625 const Value *Ptr = MemNode->getMemOperand()->getValue(); 626 const Instruction *I = dyn_cast<Instruction>(Ptr); 627 return I && I->getMetadata("amdgpu.noclobber"); 628 } 629 630 bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS, 631 unsigned DestAS) const { 632 // Flat -> private/local is a simple truncate. 633 // Flat -> global is no-op 634 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) 635 return true; 636 637 return isNoopAddrSpaceCast(SrcAS, DestAS); 638 } 639 640 bool SITargetLowering::isMemOpUniform(const SDNode *N) const { 641 const MemSDNode *MemNode = cast<MemSDNode>(N); 642 const Value *Ptr = MemNode->getMemOperand()->getValue(); 643 644 // UndefValue means this is a load of a kernel input. These are uniform. 645 // Sometimes LDS instructions have constant pointers. 646 // If Ptr is null, then that means this mem operand contains a 647 // PseudoSourceValue like GOT. 648 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || 649 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) 650 return true; 651 652 const Instruction *I = dyn_cast<Instruction>(Ptr); 653 return I && I->getMetadata("amdgpu.uniform"); 654 } 655 656 TargetLoweringBase::LegalizeTypeAction 657 SITargetLowering::getPreferredVectorAction(EVT VT) const { 658 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) 659 return TypeSplitVector; 660 661 return TargetLoweringBase::getPreferredVectorAction(VT); 662 } 663 664 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 665 Type *Ty) const { 666 // FIXME: Could be smarter if called for vector constants. 667 return true; 668 } 669 670 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { 671 if (Subtarget->has16BitInsts() && VT == MVT::i16) { 672 switch (Op) { 673 case ISD::LOAD: 674 case ISD::STORE: 675 676 // These operations are done with 32-bit instructions anyway. 677 case ISD::AND: 678 case ISD::OR: 679 case ISD::XOR: 680 case ISD::SELECT: 681 // TODO: Extensions? 682 return true; 683 default: 684 return false; 685 } 686 } 687 688 // SimplifySetCC uses this function to determine whether or not it should 689 // create setcc with i1 operands. We don't have instructions for i1 setcc. 
690 if (VT == MVT::i1 && Op == ISD::SETCC) 691 return false; 692 693 return TargetLowering::isTypeDesirableForOp(Op, VT); 694 } 695 696 SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG, 697 const SDLoc &SL, SDValue Chain, 698 unsigned Offset) const { 699 const DataLayout &DL = DAG.getDataLayout(); 700 MachineFunction &MF = DAG.getMachineFunction(); 701 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 702 unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); 703 704 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 705 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); 706 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, 707 MRI.getLiveInVirtReg(InputPtrReg), PtrVT); 708 return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, 709 DAG.getConstant(Offset, SL, PtrVT)); 710 } 711 712 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, 713 const SDLoc &SL, SDValue Chain, 714 unsigned Offset, bool Signed, 715 const ISD::InputArg *Arg) const { 716 const DataLayout &DL = DAG.getDataLayout(); 717 Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); 718 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); 719 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); 720 721 unsigned Align = DL.getABITypeAlignment(Ty); 722 723 SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset); 724 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align, 725 MachineMemOperand::MONonTemporal | 726 MachineMemOperand::MODereferenceable | 727 MachineMemOperand::MOInvariant); 728 729 SDValue Val = Load; 730 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && 731 VT.bitsLT(MemVT)) { 732 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext; 733 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT)); 734 } 735 736 if (MemVT.isFloatingPoint()) 737 Val = getFPExtOrFPTrunc(DAG, Val, SL, VT); 738 else if (Signed) 739 Val = DAG.getSExtOrTrunc(Val, SL, VT); 740 else 741 Val = DAG.getZExtOrTrunc(Val, SL, VT); 742 743 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); 744 } 745 746 SDValue SITargetLowering::LowerFormalArguments( 747 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 748 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 749 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 750 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 751 752 MachineFunction &MF = DAG.getMachineFunction(); 753 FunctionType *FType = MF.getFunction()->getFunctionType(); 754 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 755 const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); 756 757 if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { 758 const Function *Fn = MF.getFunction(); 759 DiagnosticInfoUnsupported NoGraphicsHSA( 760 *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); 761 DAG.getContext()->diagnose(NoGraphicsHSA); 762 return DAG.getEntryNode(); 763 } 764 765 // Create stack objects that are used for emitting debugger prologue if 766 // "amdgpu-debugger-emit-prologue" attribute was specified. 
767 if (ST.debuggerEmitPrologue()) 768 createDebuggerPrologueStackObjects(MF); 769 770 SmallVector<ISD::InputArg, 16> Splits; 771 BitVector Skipped(Ins.size()); 772 773 for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { 774 const ISD::InputArg &Arg = Ins[i]; 775 776 // First check if it's a PS input addr 777 if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() && 778 !Arg.Flags.isByVal() && PSInputNum <= 15) { 779 780 if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { 781 // We can safely skip PS inputs 782 Skipped.set(i); 783 ++PSInputNum; 784 continue; 785 } 786 787 Info->markPSInputAllocated(PSInputNum); 788 if (Arg.Used) 789 Info->PSInputEna |= 1 << PSInputNum; 790 791 ++PSInputNum; 792 } 793 794 if (AMDGPU::isShader(CallConv)) { 795 // Second split vertices into their elements 796 if (Arg.VT.isVector()) { 797 ISD::InputArg NewArg = Arg; 798 NewArg.Flags.setSplit(); 799 NewArg.VT = Arg.VT.getVectorElementType(); 800 801 // We REALLY want the ORIGINAL number of vertex elements here, e.g. a 802 // three or five element vertex only needs three or five registers, 803 // NOT four or eight. 804 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); 805 unsigned NumElements = ParamType->getVectorNumElements(); 806 807 for (unsigned j = 0; j != NumElements; ++j) { 808 Splits.push_back(NewArg); 809 NewArg.PartOffset += NewArg.VT.getStoreSize(); 810 } 811 } else { 812 Splits.push_back(Arg); 813 } 814 } 815 } 816 817 SmallVector<CCValAssign, 16> ArgLocs; 818 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 819 *DAG.getContext()); 820 821 // At least one interpolation mode must be enabled or else the GPU will hang. 822 // 823 // Check PSInputAddr instead of PSInputEna. The idea is that if the user set 824 // PSInputAddr, the user wants to enable some bits after the compilation 825 // based on run-time states. Since we can't know what the final PSInputEna 826 // will look like, so we shouldn't do anything here and the user should take 827 // responsibility for the correct programming. 828 // 829 // Otherwise, the following restrictions apply: 830 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. 831 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be 832 // enabled too. 833 if (CallConv == CallingConv::AMDGPU_PS && 834 ((Info->getPSInputAddr() & 0x7F) == 0 || 835 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) { 836 CCInfo.AllocateReg(AMDGPU::VGPR0); 837 CCInfo.AllocateReg(AMDGPU::VGPR1); 838 Info->markPSInputAllocated(0); 839 Info->PSInputEna |= 1; 840 } 841 842 if (!AMDGPU::isShader(CallConv)) { 843 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); 844 } else { 845 assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() && 846 !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && 847 !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && 848 !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && 849 !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && 850 !Info->hasWorkItemIDZ()); 851 } 852 853 // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
854 if (Info->hasPrivateSegmentBuffer()) { 855 unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); 856 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); 857 CCInfo.AllocateReg(PrivateSegmentBufferReg); 858 } 859 860 if (Info->hasDispatchPtr()) { 861 unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); 862 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); 863 CCInfo.AllocateReg(DispatchPtrReg); 864 } 865 866 if (Info->hasQueuePtr()) { 867 unsigned QueuePtrReg = Info->addQueuePtr(*TRI); 868 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); 869 CCInfo.AllocateReg(QueuePtrReg); 870 } 871 872 if (Info->hasKernargSegmentPtr()) { 873 unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); 874 MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); 875 CCInfo.AllocateReg(InputPtrReg); 876 } 877 878 if (Info->hasDispatchID()) { 879 unsigned DispatchIDReg = Info->addDispatchID(*TRI); 880 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); 881 CCInfo.AllocateReg(DispatchIDReg); 882 } 883 884 if (Info->hasFlatScratchInit()) { 885 unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI); 886 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); 887 CCInfo.AllocateReg(FlatScratchInitReg); 888 } 889 890 if (!AMDGPU::isShader(CallConv)) 891 analyzeFormalArgumentsCompute(CCInfo, Ins); 892 else 893 AnalyzeFormalArguments(CCInfo, Splits); 894 895 SmallVector<SDValue, 16> Chains; 896 897 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { 898 899 const ISD::InputArg &Arg = Ins[i]; 900 if (Skipped[i]) { 901 InVals.push_back(DAG.getUNDEF(Arg.VT)); 902 continue; 903 } 904 905 CCValAssign &VA = ArgLocs[ArgIdx++]; 906 MVT VT = VA.getLocVT(); 907 908 if (VA.isMemLoc()) { 909 VT = Ins[i].VT; 910 EVT MemVT = VA.getLocVT(); 911 const unsigned Offset = Subtarget->getExplicitKernelArgOffset() + 912 VA.getLocMemOffset(); 913 // The first 36 bytes of the input buffer contains information about 914 // thread group and global sizes. 915 SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain, 916 Offset, Ins[i].Flags.isSExt(), 917 &Ins[i]); 918 Chains.push_back(Arg.getValue(1)); 919 920 auto *ParamTy = 921 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); 922 if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && 923 ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { 924 // On SI local pointers are just offsets into LDS, so they are always 925 // less than 16-bits. On CI and newer they could potentially be 926 // real pointers, so we can't guarantee their size. 
927 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, 928 DAG.getValueType(MVT::i16)); 929 } 930 931 InVals.push_back(Arg); 932 Info->setABIArgOffset(Offset + MemVT.getStoreSize()); 933 continue; 934 } 935 assert(VA.isRegLoc() && "Parameter must be in a register!"); 936 937 unsigned Reg = VA.getLocReg(); 938 939 if (VT == MVT::i64) { 940 // For now assume it is a pointer 941 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, 942 &AMDGPU::SGPR_64RegClass); 943 Reg = MF.addLiveIn(Reg, &AMDGPU::SGPR_64RegClass); 944 SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); 945 InVals.push_back(Copy); 946 continue; 947 } 948 949 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); 950 951 Reg = MF.addLiveIn(Reg, RC); 952 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); 953 954 if (Arg.VT.isVector()) { 955 956 // Build a vector from the registers 957 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); 958 unsigned NumElements = ParamType->getVectorNumElements(); 959 960 SmallVector<SDValue, 4> Regs; 961 Regs.push_back(Val); 962 for (unsigned j = 1; j != NumElements; ++j) { 963 Reg = ArgLocs[ArgIdx++].getLocReg(); 964 Reg = MF.addLiveIn(Reg, RC); 965 966 SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); 967 Regs.push_back(Copy); 968 } 969 970 // Fill up the missing vector elements 971 NumElements = Arg.VT.getVectorNumElements() - NumElements; 972 Regs.append(NumElements, DAG.getUNDEF(VT)); 973 974 InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs)); 975 continue; 976 } 977 978 InVals.push_back(Val); 979 } 980 981 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read 982 // these from the dispatch pointer. 983 984 // Start adding system SGPRs. 985 if (Info->hasWorkGroupIDX()) { 986 unsigned Reg = Info->addWorkGroupIDX(); 987 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); 988 CCInfo.AllocateReg(Reg); 989 } 990 991 if (Info->hasWorkGroupIDY()) { 992 unsigned Reg = Info->addWorkGroupIDY(); 993 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); 994 CCInfo.AllocateReg(Reg); 995 } 996 997 if (Info->hasWorkGroupIDZ()) { 998 unsigned Reg = Info->addWorkGroupIDZ(); 999 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); 1000 CCInfo.AllocateReg(Reg); 1001 } 1002 1003 if (Info->hasWorkGroupInfo()) { 1004 unsigned Reg = Info->addWorkGroupInfo(); 1005 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); 1006 CCInfo.AllocateReg(Reg); 1007 } 1008 1009 if (Info->hasPrivateSegmentWaveByteOffset()) { 1010 // Scratch wave offset passed in system SGPR. 1011 unsigned PrivateSegmentWaveByteOffsetReg; 1012 1013 if (AMDGPU::isShader(CallConv)) { 1014 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); 1015 Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); 1016 } else 1017 PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset(); 1018 1019 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); 1020 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); 1021 } 1022 1023 // Now that we've figured out where the scratch register inputs are, see if 1024 // should reserve the arguments and use them directly. 1025 bool HasStackObjects = MF.getFrameInfo().hasStackObjects(); 1026 // Record that we know we have non-spill stack objects so we don't need to 1027 // check all stack objects later. 1028 if (HasStackObjects) 1029 Info->setHasNonSpillStackObjects(true); 1030 1031 // Everything live out of a block is spilled with fast regalloc, so it's 1032 // almost certain that spilling will be required. 
1033 if (getTargetMachine().getOptLevel() == CodeGenOpt::None) 1034 HasStackObjects = true; 1035 1036 if (ST.isAmdCodeObjectV2()) { 1037 if (HasStackObjects) { 1038 // If we have stack objects, we unquestionably need the private buffer 1039 // resource. For the Code Object V2 ABI, this will be the first 4 user 1040 // SGPR inputs. We can reserve those and use them directly. 1041 1042 unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue( 1043 MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); 1044 Info->setScratchRSrcReg(PrivateSegmentBufferReg); 1045 1046 unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue( 1047 MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); 1048 Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); 1049 } else { 1050 unsigned ReservedBufferReg 1051 = TRI->reservedPrivateSegmentBufferReg(MF); 1052 unsigned ReservedOffsetReg 1053 = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); 1054 1055 // We tentatively reserve the last registers (skipping the last two 1056 // which may contain VCC). After register allocation, we'll replace 1057 // these with the ones immediately after those which were really 1058 // allocated. In the prologue copies will be inserted from the argument 1059 // to these reserved registers. 1060 Info->setScratchRSrcReg(ReservedBufferReg); 1061 Info->setScratchWaveOffsetReg(ReservedOffsetReg); 1062 } 1063 } else { 1064 unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF); 1065 1066 // Without HSA, relocations are used for the scratch pointer and the 1067 // buffer resource setup is always inserted in the prologue. Scratch wave 1068 // offset is still in an input SGPR. 1069 Info->setScratchRSrcReg(ReservedBufferReg); 1070 1071 if (HasStackObjects) { 1072 unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue( 1073 MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); 1074 Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg); 1075 } else { 1076 unsigned ReservedOffsetReg 1077 = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); 1078 Info->setScratchWaveOffsetReg(ReservedOffsetReg); 1079 } 1080 } 1081 1082 if (Info->hasWorkItemIDX()) { 1083 unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); 1084 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); 1085 CCInfo.AllocateReg(Reg); 1086 } 1087 1088 if (Info->hasWorkItemIDY()) { 1089 unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); 1090 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); 1091 CCInfo.AllocateReg(Reg); 1092 } 1093 1094 if (Info->hasWorkItemIDZ()) { 1095 unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); 1096 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); 1097 CCInfo.AllocateReg(Reg); 1098 } 1099 1100 if (Chains.empty()) 1101 return Chain; 1102 1103 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 1104 } 1105 1106 SDValue 1107 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 1108 bool isVarArg, 1109 const SmallVectorImpl<ISD::OutputArg> &Outs, 1110 const SmallVectorImpl<SDValue> &OutVals, 1111 const SDLoc &DL, SelectionDAG &DAG) const { 1112 MachineFunction &MF = DAG.getMachineFunction(); 1113 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1114 1115 if (!AMDGPU::isShader(CallConv)) 1116 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, 1117 OutVals, DL, DAG); 1118 1119 Info->setIfReturnsVoid(Outs.size() == 0); 1120 1121 SmallVector<ISD::OutputArg, 48> Splits; 1122 SmallVector<SDValue, 48> SplitVals; 1123 1124 // Split 
vectors into their elements. 1125 for (unsigned i = 0, e = Outs.size(); i != e; ++i) { 1126 const ISD::OutputArg &Out = Outs[i]; 1127 1128 if (Out.VT.isVector()) { 1129 MVT VT = Out.VT.getVectorElementType(); 1130 ISD::OutputArg NewOut = Out; 1131 NewOut.Flags.setSplit(); 1132 NewOut.VT = VT; 1133 1134 // We want the original number of vector elements here, e.g. 1135 // three or five, not four or eight. 1136 unsigned NumElements = Out.ArgVT.getVectorNumElements(); 1137 1138 for (unsigned j = 0; j != NumElements; ++j) { 1139 SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i], 1140 DAG.getConstant(j, DL, MVT::i32)); 1141 SplitVals.push_back(Elem); 1142 Splits.push_back(NewOut); 1143 NewOut.PartOffset += NewOut.VT.getStoreSize(); 1144 } 1145 } else { 1146 SplitVals.push_back(OutVals[i]); 1147 Splits.push_back(Out); 1148 } 1149 } 1150 1151 // CCValAssign - represent the assignment of the return value to a location. 1152 SmallVector<CCValAssign, 48> RVLocs; 1153 1154 // CCState - Info about the registers and stack slots. 1155 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 1156 *DAG.getContext()); 1157 1158 // Analyze outgoing return values. 1159 AnalyzeReturn(CCInfo, Splits); 1160 1161 SDValue Flag; 1162 SmallVector<SDValue, 48> RetOps; 1163 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1164 1165 // Copy the result values into the output registers. 1166 for (unsigned i = 0, realRVLocIdx = 0; 1167 i != RVLocs.size(); 1168 ++i, ++realRVLocIdx) { 1169 CCValAssign &VA = RVLocs[i]; 1170 assert(VA.isRegLoc() && "Can only return in registers!"); 1171 1172 SDValue Arg = SplitVals[realRVLocIdx]; 1173 1174 // Copied from other backends. 1175 switch (VA.getLocInfo()) { 1176 default: llvm_unreachable("Unknown loc info!"); 1177 case CCValAssign::Full: 1178 break; 1179 case CCValAssign::BCvt: 1180 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 1181 break; 1182 } 1183 1184 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); 1185 Flag = Chain.getValue(1); 1186 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 1187 } 1188 1189 // Update chain and glue. 1190 RetOps[0] = Chain; 1191 if (Flag.getNode()) 1192 RetOps.push_back(Flag); 1193 1194 unsigned Opc = Info->returnsVoid() ? 
AMDGPUISD::ENDPGM : AMDGPUISD::RETURN; 1195 return DAG.getNode(Opc, DL, MVT::Other, RetOps); 1196 } 1197 1198 unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, 1199 SelectionDAG &DAG) const { 1200 unsigned Reg = StringSwitch<unsigned>(RegName) 1201 .Case("m0", AMDGPU::M0) 1202 .Case("exec", AMDGPU::EXEC) 1203 .Case("exec_lo", AMDGPU::EXEC_LO) 1204 .Case("exec_hi", AMDGPU::EXEC_HI) 1205 .Case("flat_scratch", AMDGPU::FLAT_SCR) 1206 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) 1207 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) 1208 .Default(AMDGPU::NoRegister); 1209 1210 if (Reg == AMDGPU::NoRegister) { 1211 report_fatal_error(Twine("invalid register name \"" 1212 + StringRef(RegName) + "\".")); 1213 1214 } 1215 1216 if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && 1217 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) { 1218 report_fatal_error(Twine("invalid register \"" 1219 + StringRef(RegName) + "\" for subtarget.")); 1220 } 1221 1222 switch (Reg) { 1223 case AMDGPU::M0: 1224 case AMDGPU::EXEC_LO: 1225 case AMDGPU::EXEC_HI: 1226 case AMDGPU::FLAT_SCR_LO: 1227 case AMDGPU::FLAT_SCR_HI: 1228 if (VT.getSizeInBits() == 32) 1229 return Reg; 1230 break; 1231 case AMDGPU::EXEC: 1232 case AMDGPU::FLAT_SCR: 1233 if (VT.getSizeInBits() == 64) 1234 return Reg; 1235 break; 1236 default: 1237 llvm_unreachable("missing register type checking"); 1238 } 1239 1240 report_fatal_error(Twine("invalid type for register \"" 1241 + StringRef(RegName) + "\".")); 1242 } 1243 1244 // If kill is not the last instruction, split the block so kill is always a 1245 // proper terminator. 1246 MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI, 1247 MachineBasicBlock *BB) const { 1248 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 1249 1250 MachineBasicBlock::iterator SplitPoint(&MI); 1251 ++SplitPoint; 1252 1253 if (SplitPoint == BB->end()) { 1254 // Don't bother with a new block. 1255 MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR)); 1256 return BB; 1257 } 1258 1259 MachineFunction *MF = BB->getParent(); 1260 MachineBasicBlock *SplitBB 1261 = MF->CreateMachineBasicBlock(BB->getBasicBlock()); 1262 1263 MF->insert(++MachineFunction::iterator(BB), SplitBB); 1264 SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end()); 1265 1266 SplitBB->transferSuccessorsAndUpdatePHIs(BB); 1267 BB->addSuccessor(SplitBB); 1268 1269 MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR)); 1270 return SplitBB; 1271 } 1272 1273 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the 1274 // wavefront. If the value is uniform and just happens to be in a VGPR, this 1275 // will only do one iteration. In the worst case, this will loop 64 times. 1276 // 1277 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value. 
1278 static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( 1279 const SIInstrInfo *TII, 1280 MachineRegisterInfo &MRI, 1281 MachineBasicBlock &OrigBB, 1282 MachineBasicBlock &LoopBB, 1283 const DebugLoc &DL, 1284 const MachineOperand &IdxReg, 1285 unsigned InitReg, 1286 unsigned ResultReg, 1287 unsigned PhiReg, 1288 unsigned InitSaveExecReg, 1289 int Offset, 1290 bool UseGPRIdxMode) { 1291 MachineBasicBlock::iterator I = LoopBB.begin(); 1292 1293 unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1294 unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1295 unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 1296 unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1297 1298 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) 1299 .addReg(InitReg) 1300 .addMBB(&OrigBB) 1301 .addReg(ResultReg) 1302 .addMBB(&LoopBB); 1303 1304 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec) 1305 .addReg(InitSaveExecReg) 1306 .addMBB(&OrigBB) 1307 .addReg(NewExec) 1308 .addMBB(&LoopBB); 1309 1310 // Read the next variant <- also loop target. 1311 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg) 1312 .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); 1313 1314 // Compare the just read M0 value to all possible Idx values. 1315 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg) 1316 .addReg(CurrentIdxReg) 1317 .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg()); 1318 1319 if (UseGPRIdxMode) { 1320 unsigned IdxReg; 1321 if (Offset == 0) { 1322 IdxReg = CurrentIdxReg; 1323 } else { 1324 IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 1325 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg) 1326 .addReg(CurrentIdxReg, RegState::Kill) 1327 .addImm(Offset); 1328 } 1329 1330 MachineInstr *SetIdx = 1331 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX)) 1332 .addReg(IdxReg, RegState::Kill); 1333 SetIdx->getOperand(2).setIsUndef(); 1334 } else { 1335 // Move index from VCC into M0 1336 if (Offset == 0) { 1337 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1338 .addReg(CurrentIdxReg, RegState::Kill); 1339 } else { 1340 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) 1341 .addReg(CurrentIdxReg, RegState::Kill) 1342 .addImm(Offset); 1343 } 1344 } 1345 1346 // Update EXEC, save the original EXEC value to VCC. 1347 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec) 1348 .addReg(CondReg, RegState::Kill); 1349 1350 MRI.setSimpleHint(NewExec, CondReg); 1351 1352 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 1353 MachineInstr *InsertPt = 1354 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) 1355 .addReg(AMDGPU::EXEC) 1356 .addReg(NewExec); 1357 1358 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use 1359 // s_cbranch_scc0? 1360 1361 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. 1362 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) 1363 .addMBB(&LoopBB); 1364 1365 return InsertPt->getIterator(); 1366 } 1367 1368 // This has slightly sub-optimal regalloc when the source vector is killed by 1369 // the read. The register allocator does not understand that the kill is 1370 // per-workitem, so is kept alive for the whole loop so we end up not re-using a 1371 // subregister from it, using 1 more VGPR than necessary. This was saved when 1372 // this was expanded after register allocation. 
1373 static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, 1374 MachineBasicBlock &MBB, 1375 MachineInstr &MI, 1376 unsigned InitResultReg, 1377 unsigned PhiReg, 1378 int Offset, 1379 bool UseGPRIdxMode) { 1380 MachineFunction *MF = MBB.getParent(); 1381 MachineRegisterInfo &MRI = MF->getRegInfo(); 1382 const DebugLoc &DL = MI.getDebugLoc(); 1383 MachineBasicBlock::iterator I(&MI); 1384 1385 unsigned DstReg = MI.getOperand(0).getReg(); 1386 unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1387 unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1388 1389 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec); 1390 1391 // Save the EXEC mask 1392 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec) 1393 .addReg(AMDGPU::EXEC); 1394 1395 // To insert the loop we need to split the block. Move everything after this 1396 // point to a new block, and insert a new empty block between the two. 1397 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); 1398 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); 1399 MachineFunction::iterator MBBI(MBB); 1400 ++MBBI; 1401 1402 MF->insert(MBBI, LoopBB); 1403 MF->insert(MBBI, RemainderBB); 1404 1405 LoopBB->addSuccessor(LoopBB); 1406 LoopBB->addSuccessor(RemainderBB); 1407 1408 // Move the rest of the block into a new block. 1409 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 1410 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); 1411 1412 MBB.addSuccessor(LoopBB); 1413 1414 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 1415 1416 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx, 1417 InitResultReg, DstReg, PhiReg, TmpExec, 1418 Offset, UseGPRIdxMode); 1419 1420 MachineBasicBlock::iterator First = RemainderBB->begin(); 1421 BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) 1422 .addReg(SaveExec); 1423 1424 return InsPt; 1425 } 1426 1427 // Returns subreg index, offset 1428 static std::pair<unsigned, int> 1429 computeIndirectRegAndOffset(const SIRegisterInfo &TRI, 1430 const TargetRegisterClass *SuperRC, 1431 unsigned VecReg, 1432 int Offset) { 1433 int NumElts = SuperRC->getSize() / 4; 1434 1435 // Skip out of bounds offsets, or else we would end up using an undefined 1436 // register. 1437 if (Offset >= NumElts || Offset < 0) 1438 return std::make_pair(AMDGPU::sub0, Offset); 1439 1440 return std::make_pair(AMDGPU::sub0 + Offset, 0); 1441 } 1442 1443 // Return true if the index is an SGPR and was set. 1444 static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, 1445 MachineRegisterInfo &MRI, 1446 MachineInstr &MI, 1447 int Offset, 1448 bool UseGPRIdxMode, 1449 bool IsIndirectSrc) { 1450 MachineBasicBlock *MBB = MI.getParent(); 1451 const DebugLoc &DL = MI.getDebugLoc(); 1452 MachineBasicBlock::iterator I(&MI); 1453 1454 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 1455 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); 1456 1457 assert(Idx->getReg() != AMDGPU::NoRegister); 1458 1459 if (!TII->getRegisterInfo().isSGPRClass(IdxRC)) 1460 return false; 1461 1462 if (UseGPRIdxMode) { 1463 unsigned IdxMode = IsIndirectSrc ? 
1464 VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; 1465 if (Offset == 0) { 1466 MachineInstr *SetOn = 1467 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) 1468 .addOperand(*Idx) 1469 .addImm(IdxMode); 1470 1471 SetOn->getOperand(3).setIsUndef(); 1472 } else { 1473 unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 1474 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) 1475 .addOperand(*Idx) 1476 .addImm(Offset); 1477 MachineInstr *SetOn = 1478 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) 1479 .addReg(Tmp, RegState::Kill) 1480 .addImm(IdxMode); 1481 1482 SetOn->getOperand(3).setIsUndef(); 1483 } 1484 1485 return true; 1486 } 1487 1488 if (Offset == 0) { 1489 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1490 .addOperand(*Idx); 1491 } else { 1492 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) 1493 .addOperand(*Idx) 1494 .addImm(Offset); 1495 } 1496 1497 return true; 1498 } 1499 1500 // Control flow needs to be inserted if indexing with a VGPR. 1501 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, 1502 MachineBasicBlock &MBB, 1503 const SISubtarget &ST) { 1504 const SIInstrInfo *TII = ST.getInstrInfo(); 1505 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 1506 MachineFunction *MF = MBB.getParent(); 1507 MachineRegisterInfo &MRI = MF->getRegInfo(); 1508 1509 unsigned Dst = MI.getOperand(0).getReg(); 1510 unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); 1511 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); 1512 1513 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg); 1514 1515 unsigned SubReg; 1516 std::tie(SubReg, Offset) 1517 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset); 1518 1519 bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode; 1520 1521 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) { 1522 MachineBasicBlock::iterator I(&MI); 1523 const DebugLoc &DL = MI.getDebugLoc(); 1524 1525 if (UseGPRIdxMode) { 1526 // TODO: Look at the uses to avoid the copy. This may require rescheduling 1527 // to avoid interfering with other uses, so probably requires a new 1528 // optimization pass. 1529 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst) 1530 .addReg(SrcReg, RegState::Undef, SubReg) 1531 .addReg(SrcReg, RegState::Implicit) 1532 .addReg(AMDGPU::M0, RegState::Implicit); 1533 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); 1534 } else { 1535 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) 1536 .addReg(SrcReg, RegState::Undef, SubReg) 1537 .addReg(SrcReg, RegState::Implicit); 1538 } 1539 1540 MI.eraseFromParent(); 1541 1542 return &MBB; 1543 } 1544 1545 1546 const DebugLoc &DL = MI.getDebugLoc(); 1547 MachineBasicBlock::iterator I(&MI); 1548 1549 unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1550 unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1551 1552 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); 1553 1554 if (UseGPRIdxMode) { 1555 MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) 1556 .addImm(0) // Reset inside loop. 1557 .addImm(VGPRIndexMode::SRC0_ENABLE); 1558 SetOn->getOperand(3).setIsUndef(); 1559 1560 // Disable again after the loop. 
1561 BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); 1562 } 1563 1564 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode); 1565 MachineBasicBlock *LoopBB = InsPt->getParent(); 1566 1567 if (UseGPRIdxMode) { 1568 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst) 1569 .addReg(SrcReg, RegState::Undef, SubReg) 1570 .addReg(SrcReg, RegState::Implicit) 1571 .addReg(AMDGPU::M0, RegState::Implicit); 1572 } else { 1573 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) 1574 .addReg(SrcReg, RegState::Undef, SubReg) 1575 .addReg(SrcReg, RegState::Implicit); 1576 } 1577 1578 MI.eraseFromParent(); 1579 1580 return LoopBB; 1581 } 1582 1583 static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC) { 1584 switch (VecRC->getSize()) { 1585 case 4: 1586 return AMDGPU::V_MOVRELD_B32_V1; 1587 case 8: 1588 return AMDGPU::V_MOVRELD_B32_V2; 1589 case 16: 1590 return AMDGPU::V_MOVRELD_B32_V4; 1591 case 32: 1592 return AMDGPU::V_MOVRELD_B32_V8; 1593 case 64: 1594 return AMDGPU::V_MOVRELD_B32_V16; 1595 default: 1596 llvm_unreachable("unsupported size for MOVRELD pseudos"); 1597 } 1598 } 1599 1600 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, 1601 MachineBasicBlock &MBB, 1602 const SISubtarget &ST) { 1603 const SIInstrInfo *TII = ST.getInstrInfo(); 1604 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 1605 MachineFunction *MF = MBB.getParent(); 1606 MachineRegisterInfo &MRI = MF->getRegInfo(); 1607 1608 unsigned Dst = MI.getOperand(0).getReg(); 1609 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); 1610 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 1611 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); 1612 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); 1613 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg()); 1614 1615 // This can be an immediate, but will be folded later. 
1616 assert(Val->getReg()); 1617 1618 unsigned SubReg; 1619 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC, 1620 SrcVec->getReg(), 1621 Offset); 1622 bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode; 1623 1624 if (Idx->getReg() == AMDGPU::NoRegister) { 1625 MachineBasicBlock::iterator I(&MI); 1626 const DebugLoc &DL = MI.getDebugLoc(); 1627 1628 assert(Offset == 0); 1629 1630 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst) 1631 .addOperand(*SrcVec) 1632 .addOperand(*Val) 1633 .addImm(SubReg); 1634 1635 MI.eraseFromParent(); 1636 return &MBB; 1637 } 1638 1639 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) { 1640 MachineBasicBlock::iterator I(&MI); 1641 const DebugLoc &DL = MI.getDebugLoc(); 1642 1643 if (UseGPRIdxMode) { 1644 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect)) 1645 .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst 1646 .addOperand(*Val) 1647 .addReg(Dst, RegState::ImplicitDefine) 1648 .addReg(SrcVec->getReg(), RegState::Implicit) 1649 .addReg(AMDGPU::M0, RegState::Implicit); 1650 1651 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); 1652 } else { 1653 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC)); 1654 1655 BuildMI(MBB, I, DL, MovRelDesc) 1656 .addReg(Dst, RegState::Define) 1657 .addReg(SrcVec->getReg()) 1658 .addOperand(*Val) 1659 .addImm(SubReg - AMDGPU::sub0); 1660 } 1661 1662 MI.eraseFromParent(); 1663 return &MBB; 1664 } 1665 1666 if (Val->isReg()) 1667 MRI.clearKillFlags(Val->getReg()); 1668 1669 const DebugLoc &DL = MI.getDebugLoc(); 1670 1671 if (UseGPRIdxMode) { 1672 MachineBasicBlock::iterator I(&MI); 1673 1674 MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) 1675 .addImm(0) // Reset inside loop. 1676 .addImm(VGPRIndexMode::DST_ENABLE); 1677 SetOn->getOperand(3).setIsUndef(); 1678 1679 // Disable again after the loop. 1680 BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); 1681 } 1682 1683 unsigned PhiReg = MRI.createVirtualRegister(VecRC); 1684 1685 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, 1686 Offset, UseGPRIdxMode); 1687 MachineBasicBlock *LoopBB = InsPt->getParent(); 1688 1689 if (UseGPRIdxMode) { 1690 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect)) 1691 .addReg(PhiReg, RegState::Undef, SubReg) // vdst 1692 .addOperand(*Val) // src0 1693 .addReg(Dst, RegState::ImplicitDefine) 1694 .addReg(PhiReg, RegState::Implicit) 1695 .addReg(AMDGPU::M0, RegState::Implicit); 1696 } else { 1697 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC)); 1698 1699 BuildMI(*LoopBB, InsPt, DL, MovRelDesc) 1700 .addReg(Dst, RegState::Define) 1701 .addReg(PhiReg) 1702 .addOperand(*Val) 1703 .addImm(SubReg - AMDGPU::sub0); 1704 } 1705 1706 MI.eraseFromParent(); 1707 1708 return LoopBB; 1709 } 1710 1711 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( 1712 MachineInstr &MI, MachineBasicBlock *BB) const { 1713 1714 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 1715 MachineFunction *MF = BB->getParent(); 1716 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1717 1718 if (TII->isMIMG(MI)) { 1719 if (!MI.memoperands_empty()) 1720 return BB; 1721 // Add a memoperand for mimg instructions so that they aren't assumed to 1722 // be ordered memory instuctions. 
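// A MachineInstr that may load or store but carries no memoperands is treated
// as an unknown, ordered memory reference, so attaching even the conservative
// MMO built below (image pseudo source value, MODereferenceable plus
// MOLoad/MOStore as appropriate) is what allows later passes to reason about
// and reorder these MIMG accesses.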
1723 1724 MachinePointerInfo PtrInfo(MFI->getImagePSV()); 1725 MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable; 1726 if (MI.mayStore()) 1727 Flags |= MachineMemOperand::MOStore; 1728 1729 if (MI.mayLoad()) 1730 Flags |= MachineMemOperand::MOLoad; 1731 1732 auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0); 1733 MI.addMemOperand(*MF, MMO); 1734 return BB; 1735 } 1736 1737 switch (MI.getOpcode()) { 1738 case AMDGPU::SI_INIT_M0: { 1739 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), 1740 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1741 .addOperand(MI.getOperand(0)); 1742 MI.eraseFromParent(); 1743 return BB; 1744 } 1745 case AMDGPU::GET_GROUPSTATICSIZE: { 1746 DebugLoc DL = MI.getDebugLoc(); 1747 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) 1748 .addOperand(MI.getOperand(0)) 1749 .addImm(MFI->getLDSSize()); 1750 MI.eraseFromParent(); 1751 return BB; 1752 } 1753 case AMDGPU::SI_INDIRECT_SRC_V1: 1754 case AMDGPU::SI_INDIRECT_SRC_V2: 1755 case AMDGPU::SI_INDIRECT_SRC_V4: 1756 case AMDGPU::SI_INDIRECT_SRC_V8: 1757 case AMDGPU::SI_INDIRECT_SRC_V16: 1758 return emitIndirectSrc(MI, *BB, *getSubtarget()); 1759 case AMDGPU::SI_INDIRECT_DST_V1: 1760 case AMDGPU::SI_INDIRECT_DST_V2: 1761 case AMDGPU::SI_INDIRECT_DST_V4: 1762 case AMDGPU::SI_INDIRECT_DST_V8: 1763 case AMDGPU::SI_INDIRECT_DST_V16: 1764 return emitIndirectDst(MI, *BB, *getSubtarget()); 1765 case AMDGPU::SI_KILL: 1766 return splitKillBlock(MI, BB); 1767 case AMDGPU::V_CNDMASK_B64_PSEUDO: { 1768 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 1769 1770 unsigned Dst = MI.getOperand(0).getReg(); 1771 unsigned Src0 = MI.getOperand(1).getReg(); 1772 unsigned Src1 = MI.getOperand(2).getReg(); 1773 const DebugLoc &DL = MI.getDebugLoc(); 1774 unsigned SrcCond = MI.getOperand(3).getReg(); 1775 1776 unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1777 unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1778 1779 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo) 1780 .addReg(Src0, 0, AMDGPU::sub0) 1781 .addReg(Src1, 0, AMDGPU::sub0) 1782 .addReg(SrcCond); 1783 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi) 1784 .addReg(Src0, 0, AMDGPU::sub1) 1785 .addReg(Src1, 0, AMDGPU::sub1) 1786 .addReg(SrcCond); 1787 1788 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst) 1789 .addReg(DstLo) 1790 .addImm(AMDGPU::sub0) 1791 .addReg(DstHi) 1792 .addImm(AMDGPU::sub1); 1793 MI.eraseFromParent(); 1794 return BB; 1795 } 1796 case AMDGPU::SI_BR_UNDEF: { 1797 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 1798 const DebugLoc &DL = MI.getDebugLoc(); 1799 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) 1800 .addOperand(MI.getOperand(0)); 1801 Br->getOperand(1).setIsUndef(true); // read undef SCC 1802 MI.eraseFromParent(); 1803 return BB; 1804 } 1805 default: 1806 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 1807 } 1808 } 1809 1810 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { 1811 // This currently forces unfolding various combinations of fsub into fma with 1812 // free fneg'd operands. As long as we have fast FMA (controlled by 1813 // isFMAFasterThanFMulAndFAdd), we should perform these. 1814 1815 // When fma is quarter rate, for f64 where add / sub are at best half rate, 1816 // most of these combines appear to be cycle neutral but save on instruction 1817 // count / code size. 
1818   return true;
1819 }
1820 
1821 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
1822                                          EVT VT) const {
1823   if (!VT.isVector()) {
1824     return MVT::i1;
1825   }
1826   return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
1827 }
1828 
1829 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
1830   // TODO: Should i16 be used always if legal? For now it would force VALU
1831   // shifts.
1832   return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
1833 }
1834 
1835 // Answering this is somewhat tricky and depends on the specific device, since
1836 // different devices have different rates for fma and for f64 operations.
1837 //
1838 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
1839 // regardless of which device (although the number of cycles differs between
1840 // devices), so it is always profitable for f64.
1841 //
1842 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
1843 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
1844 // which we can always do even without fused FP ops since it returns the same
1845 // result as the separate operations and since it is always full
1846 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
1847 // however does not support denormals, so we do report fma as faster if we have
1848 // a fast fma device and require denormals.
1849 //
1850 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
1851   VT = VT.getScalarType();
1852 
1853   if (!VT.isSimple())
1854     return false;
1855 
1856   switch (VT.getSimpleVT().SimpleTy) {
1857   case MVT::f32:
1858     // fma is as fast on some subtargets. However, we always have full rate f32
1859     // mad available which returns the same result as the separate operations
1860     // which we should prefer over fma. We can't use this if we want to support
1861     // denormals, so only report this in these cases.
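    // That is: claim fma is faster for f32 only when denormal support rules
    // out v_mad_f32 and the subtarget's v_fma_f32 is actually full rate.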
1862 return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); 1863 case MVT::f64: 1864 return true; 1865 case MVT::f16: 1866 return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals(); 1867 default: 1868 break; 1869 } 1870 1871 return false; 1872 } 1873 1874 //===----------------------------------------------------------------------===// 1875 // Custom DAG Lowering Operations 1876 //===----------------------------------------------------------------------===// 1877 1878 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 1879 switch (Op.getOpcode()) { 1880 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 1881 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 1882 case ISD::LOAD: { 1883 SDValue Result = LowerLOAD(Op, DAG); 1884 assert((!Result.getNode() || 1885 Result.getNode()->getNumValues() == 2) && 1886 "Load should return a value and a chain"); 1887 return Result; 1888 } 1889 1890 case ISD::FSIN: 1891 case ISD::FCOS: 1892 return LowerTrig(Op, DAG); 1893 case ISD::SELECT: return LowerSELECT(Op, DAG); 1894 case ISD::FDIV: return LowerFDIV(Op, DAG); 1895 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG); 1896 case ISD::STORE: return LowerSTORE(Op, DAG); 1897 case ISD::GlobalAddress: { 1898 MachineFunction &MF = DAG.getMachineFunction(); 1899 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1900 return LowerGlobalAddress(MFI, Op, DAG); 1901 } 1902 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 1903 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); 1904 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); 1905 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); 1906 case ISD::TRAP: return lowerTRAP(Op, DAG); 1907 case ISD::FP_ROUND: 1908 return lowerFP_ROUND(Op, DAG); 1909 } 1910 return SDValue(); 1911 } 1912 1913 /// \brief Helper function for LowerBRCOND 1914 static SDNode *findUser(SDValue Value, unsigned Opcode) { 1915 1916 SDNode *Parent = Value.getNode(); 1917 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); 1918 I != E; ++I) { 1919 1920 if (I.getUse().get() != Value) 1921 continue; 1922 1923 if (I->getOpcode() == Opcode) 1924 return *I; 1925 } 1926 return nullptr; 1927 } 1928 1929 bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { 1930 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) { 1931 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) { 1932 case AMDGPUIntrinsic::amdgcn_if: 1933 case AMDGPUIntrinsic::amdgcn_else: 1934 case AMDGPUIntrinsic::amdgcn_end_cf: 1935 case AMDGPUIntrinsic::amdgcn_loop: 1936 return true; 1937 default: 1938 return false; 1939 } 1940 } 1941 1942 if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) { 1943 switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) { 1944 case AMDGPUIntrinsic::amdgcn_break: 1945 case AMDGPUIntrinsic::amdgcn_if_break: 1946 case AMDGPUIntrinsic::amdgcn_else_break: 1947 return true; 1948 default: 1949 return false; 1950 } 1951 } 1952 1953 return false; 1954 } 1955 1956 void SITargetLowering::createDebuggerPrologueStackObjects( 1957 MachineFunction &MF) const { 1958 // Create stack objects that are used for emitting debugger prologue. 
1959 // 1960 // Debugger prologue writes work group IDs and work item IDs to scratch memory 1961 // at fixed location in the following format: 1962 // offset 0: work group ID x 1963 // offset 4: work group ID y 1964 // offset 8: work group ID z 1965 // offset 16: work item ID x 1966 // offset 20: work item ID y 1967 // offset 24: work item ID z 1968 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1969 int ObjectIdx = 0; 1970 1971 // For each dimension: 1972 for (unsigned i = 0; i < 3; ++i) { 1973 // Create fixed stack object for work group ID. 1974 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true); 1975 Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx); 1976 // Create fixed stack object for work item ID. 1977 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true); 1978 Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx); 1979 } 1980 } 1981 1982 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { 1983 const Triple &TT = getTargetMachine().getTargetTriple(); 1984 return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && 1985 AMDGPU::shouldEmitConstantsToTextSection(TT); 1986 } 1987 1988 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { 1989 return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || 1990 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) && 1991 !shouldEmitFixup(GV) && 1992 !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); 1993 } 1994 1995 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const { 1996 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV); 1997 } 1998 1999 /// This transforms the control flow intrinsics to get the branch destination as 2000 /// last parameter, also switches branch target with BR if the need arise 2001 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, 2002 SelectionDAG &DAG) const { 2003 2004 SDLoc DL(BRCOND); 2005 2006 SDNode *Intr = BRCOND.getOperand(1).getNode(); 2007 SDValue Target = BRCOND.getOperand(2); 2008 SDNode *BR = nullptr; 2009 SDNode *SetCC = nullptr; 2010 2011 if (Intr->getOpcode() == ISD::SETCC) { 2012 // As long as we negate the condition everything is fine 2013 SetCC = Intr; 2014 Intr = SetCC->getOperand(0).getNode(); 2015 2016 } else { 2017 // Get the target from BR if we don't negate the condition 2018 BR = findUser(BRCOND, ISD::BR); 2019 Target = BR->getOperand(1); 2020 } 2021 2022 // FIXME: This changes the types of the intrinsics instead of introducing new 2023 // nodes with the correct types. 2024 // e.g. llvm.amdgcn.loop 2025 2026 // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3 2027 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088> 2028 2029 if (!isCFIntrinsic(Intr)) { 2030 // This is a uniform branch so we don't need to legalize. 2031 return BRCOND; 2032 } 2033 2034 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID || 2035 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN; 2036 2037 assert(!SetCC || 2038 (SetCC->getConstantOperandVal(1) == 1 && 2039 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == 2040 ISD::SETNE)); 2041 2042 // operands of the new intrinsic call 2043 SmallVector<SDValue, 4> Ops; 2044 if (HaveChain) 2045 Ops.push_back(BRCOND.getOperand(0)); 2046 2047 Ops.append(Intr->op_begin() + (HaveChain ? 
1 : 0), Intr->op_end()); 2048 Ops.push_back(Target); 2049 2050 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); 2051 2052 // build the new intrinsic call 2053 SDNode *Result = DAG.getNode( 2054 Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, 2055 DAG.getVTList(Res), Ops).getNode(); 2056 2057 if (!HaveChain) { 2058 SDValue Ops[] = { 2059 SDValue(Result, 0), 2060 BRCOND.getOperand(0) 2061 }; 2062 2063 Result = DAG.getMergeValues(Ops, DL).getNode(); 2064 } 2065 2066 if (BR) { 2067 // Give the branch instruction our target 2068 SDValue Ops[] = { 2069 BR->getOperand(0), 2070 BRCOND.getOperand(2) 2071 }; 2072 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); 2073 DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); 2074 BR = NewBR.getNode(); 2075 } 2076 2077 SDValue Chain = SDValue(Result, Result->getNumValues() - 1); 2078 2079 // Copy the intrinsic results to registers 2080 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { 2081 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); 2082 if (!CopyToReg) 2083 continue; 2084 2085 Chain = DAG.getCopyToReg( 2086 Chain, DL, 2087 CopyToReg->getOperand(1), 2088 SDValue(Result, i - 1), 2089 SDValue()); 2090 2091 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); 2092 } 2093 2094 // Remove the old intrinsic from the chain 2095 DAG.ReplaceAllUsesOfValueWith( 2096 SDValue(Intr, Intr->getNumValues() - 1), 2097 Intr->getOperand(0)); 2098 2099 return Chain; 2100 } 2101 2102 SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG, 2103 SDValue Op, 2104 const SDLoc &DL, 2105 EVT VT) const { 2106 return Op.getValueType().bitsLE(VT) ? 2107 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) : 2108 DAG.getNode(ISD::FTRUNC, DL, VT, Op); 2109 } 2110 2111 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 2112 assert(Op.getValueType() == MVT::f16 && 2113 "Do not know how to custom lower FP_ROUND for non-f16 type"); 2114 2115 SDValue Src = Op.getOperand(0); 2116 EVT SrcVT = Src.getValueType(); 2117 if (SrcVT != MVT::f64) 2118 return Op; 2119 2120 SDLoc DL(Op); 2121 2122 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src); 2123 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); 2124 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);; 2125 } 2126 2127 SDValue SITargetLowering::getSegmentAperture(unsigned AS, 2128 SelectionDAG &DAG) const { 2129 SDLoc SL; 2130 MachineFunction &MF = DAG.getMachineFunction(); 2131 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2132 unsigned UserSGPR = Info->getQueuePtrUserSGPR(); 2133 assert(UserSGPR != AMDGPU::NoRegister); 2134 2135 SDValue QueuePtr = CreateLiveInRegister( 2136 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); 2137 2138 // Offset into amd_queue_t for group_segment_aperture_base_hi / 2139 // private_segment_aperture_base_hi. 2140 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 2141 2142 SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr, 2143 DAG.getConstant(StructOffset, SL, MVT::i64)); 2144 2145 // TODO: Use custom target PseudoSourceValue. 2146 // TODO: We should use the value from the IR intrinsic call, but it might not 2147 // be available and how do we get it? 
2148 Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()), 2149 AMDGPUAS::CONSTANT_ADDRESS)); 2150 2151 MachinePointerInfo PtrInfo(V, StructOffset); 2152 return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo, 2153 MinAlign(64, StructOffset), 2154 MachineMemOperand::MODereferenceable | 2155 MachineMemOperand::MOInvariant); 2156 } 2157 2158 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, 2159 SelectionDAG &DAG) const { 2160 SDLoc SL(Op); 2161 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op); 2162 2163 SDValue Src = ASC->getOperand(0); 2164 2165 // FIXME: Really support non-0 null pointers. 2166 SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32); 2167 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); 2168 2169 // flat -> local/private 2170 if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { 2171 if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 2172 ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { 2173 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); 2174 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); 2175 2176 return DAG.getNode(ISD::SELECT, SL, MVT::i32, 2177 NonNull, Ptr, SegmentNullPtr); 2178 } 2179 } 2180 2181 // local/private -> flat 2182 if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { 2183 if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 2184 ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { 2185 SDValue NonNull 2186 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); 2187 2188 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG); 2189 SDValue CvtPtr 2190 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); 2191 2192 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, 2193 DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr), 2194 FlatNullPtr); 2195 } 2196 } 2197 2198 // global <-> flat are no-ops and never emitted. 2199 2200 const MachineFunction &MF = DAG.getMachineFunction(); 2201 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 2202 *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); 2203 DAG.getContext()->diagnose(InvalidAddrSpaceCast); 2204 2205 return DAG.getUNDEF(ASC->getValueType(0)); 2206 } 2207 2208 bool 2209 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 2210 // We can fold offsets for anything that doesn't require a GOT relocation. 2211 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || 2212 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) && 2213 !shouldEmitGOTReloc(GA->getGlobal()); 2214 } 2215 2216 static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, 2217 SDLoc DL, unsigned Offset, EVT PtrVT, 2218 unsigned GAFlags = SIInstrInfo::MO_NONE) { 2219 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is 2220 // lowered to the following code sequence: 2221 // 2222 // For constant address space: 2223 // s_getpc_b64 s[0:1] 2224 // s_add_u32 s0, s0, $symbol 2225 // s_addc_u32 s1, s1, 0 2226 // 2227 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2228 // a fixup or relocation is emitted to replace $symbol with a literal 2229 // constant, which is a pc-relative offset from the encoding of the $symbol 2230 // operand to the global variable. 
2231 // 2232 // For global address space: 2233 // s_getpc_b64 s[0:1] 2234 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2235 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2236 // 2237 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2238 // fixups or relocations are emitted to replace $symbol@*@lo and 2239 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2240 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2241 // operand to the global variable. 2242 // 2243 // What we want here is an offset from the value returned by s_getpc 2244 // (which is the address of the s_add_u32 instruction) to the global 2245 // variable, but since the encoding of $symbol starts 4 bytes after the start 2246 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2247 // small. This requires us to add 4 to the global variable offset in order to 2248 // compute the correct address. 2249 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, 2250 GAFlags); 2251 SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, 2252 GAFlags == SIInstrInfo::MO_NONE ? 2253 GAFlags : GAFlags + 1); 2254 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi); 2255 } 2256 2257 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, 2258 SDValue Op, 2259 SelectionDAG &DAG) const { 2260 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); 2261 2262 if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && 2263 GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) 2264 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); 2265 2266 SDLoc DL(GSD); 2267 const GlobalValue *GV = GSD->getGlobal(); 2268 EVT PtrVT = Op.getValueType(); 2269 2270 if (shouldEmitFixup(GV)) 2271 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); 2272 else if (shouldEmitPCReloc(GV)) 2273 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT, 2274 SIInstrInfo::MO_REL32); 2275 2276 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT, 2277 SIInstrInfo::MO_GOTPCREL32); 2278 2279 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext()); 2280 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); 2281 const DataLayout &DataLayout = DAG.getDataLayout(); 2282 unsigned Align = DataLayout.getABITypeAlignment(PtrTy); 2283 // FIXME: Use a PseudoSourceValue once those can be assigned an address space. 2284 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); 2285 2286 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align, 2287 MachineMemOperand::MODereferenceable | 2288 MachineMemOperand::MOInvariant); 2289 } 2290 2291 SDValue SITargetLowering::lowerTRAP(SDValue Op, 2292 SelectionDAG &DAG) const { 2293 const MachineFunction &MF = DAG.getMachineFunction(); 2294 DiagnosticInfoUnsupported NoTrap(*MF.getFunction(), 2295 "trap handler not supported", 2296 Op.getDebugLoc(), 2297 DS_Warning); 2298 DAG.getContext()->diagnose(NoTrap); 2299 2300 // Emit s_endpgm. 2301 2302 // FIXME: This should really be selected to s_trap, but that requires 2303 // setting up the trap handler for it o do anything. 2304 return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other, 2305 Op.getOperand(0)); 2306 } 2307 2308 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, 2309 const SDLoc &DL, SDValue V) const { 2310 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as 2311 // the destination register. 
2312 // 2313 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, 2314 // so we will end up with redundant moves to m0. 2315 // 2316 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result. 2317 2318 // A Null SDValue creates a glue result. 2319 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue, 2320 V, Chain); 2321 return SDValue(M0, 0); 2322 } 2323 2324 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, 2325 SDValue Op, 2326 MVT VT, 2327 unsigned Offset) const { 2328 SDLoc SL(Op); 2329 SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, 2330 DAG.getEntryNode(), Offset, false); 2331 // The local size values will have the hi 16-bits as zero. 2332 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, 2333 DAG.getValueType(VT)); 2334 } 2335 2336 static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { 2337 DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), 2338 "non-hsa intrinsic with hsa target", 2339 DL.getDebugLoc()); 2340 DAG.getContext()->diagnose(BadIntrin); 2341 return DAG.getUNDEF(VT); 2342 } 2343 2344 static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { 2345 DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), 2346 "intrinsic not supported on subtarget", 2347 DL.getDebugLoc()); 2348 DAG.getContext()->diagnose(BadIntrin); 2349 return DAG.getUNDEF(VT); 2350 } 2351 2352 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 2353 SelectionDAG &DAG) const { 2354 MachineFunction &MF = DAG.getMachineFunction(); 2355 auto MFI = MF.getInfo<SIMachineFunctionInfo>(); 2356 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 2357 2358 EVT VT = Op.getValueType(); 2359 SDLoc DL(Op); 2360 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 2361 2362 // TODO: Should this propagate fast-math-flags? 2363 2364 switch (IntrinsicID) { 2365 case Intrinsic::amdgcn_dispatch_ptr: 2366 case Intrinsic::amdgcn_queue_ptr: { 2367 if (!Subtarget->isAmdCodeObjectV2()) { 2368 DiagnosticInfoUnsupported BadIntrin( 2369 *MF.getFunction(), "unsupported hsa intrinsic without hsa target", 2370 DL.getDebugLoc()); 2371 DAG.getContext()->diagnose(BadIntrin); 2372 return DAG.getUNDEF(VT); 2373 } 2374 2375 auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? 
2376 SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR; 2377 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, 2378 TRI->getPreloadedValue(MF, Reg), VT); 2379 } 2380 case Intrinsic::amdgcn_implicitarg_ptr: { 2381 unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); 2382 return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset); 2383 } 2384 case Intrinsic::amdgcn_kernarg_segment_ptr: { 2385 unsigned Reg 2386 = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); 2387 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); 2388 } 2389 case Intrinsic::amdgcn_dispatch_id: { 2390 unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID); 2391 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); 2392 } 2393 case Intrinsic::amdgcn_rcp: 2394 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); 2395 case Intrinsic::amdgcn_rsq: 2396 case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name 2397 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 2398 case Intrinsic::amdgcn_rsq_legacy: { 2399 if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 2400 return emitRemovedIntrinsicError(DAG, DL, VT); 2401 2402 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); 2403 } 2404 case Intrinsic::amdgcn_rcp_legacy: { 2405 if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 2406 return emitRemovedIntrinsicError(DAG, DL, VT); 2407 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1)); 2408 } 2409 case Intrinsic::amdgcn_rsq_clamp: { 2410 if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) 2411 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); 2412 2413 Type *Type = VT.getTypeForEVT(*DAG.getContext()); 2414 APFloat Max = APFloat::getLargest(Type->getFltSemantics()); 2415 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); 2416 2417 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 2418 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, 2419 DAG.getConstantFP(Max, DL, VT)); 2420 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, 2421 DAG.getConstantFP(Min, DL, VT)); 2422 } 2423 case Intrinsic::r600_read_ngroups_x: 2424 if (Subtarget->isAmdHsaOS()) 2425 return emitNonHSAIntrinsicError(DAG, DL, VT); 2426 2427 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2428 SI::KernelInputOffsets::NGROUPS_X, false); 2429 case Intrinsic::r600_read_ngroups_y: 2430 if (Subtarget->isAmdHsaOS()) 2431 return emitNonHSAIntrinsicError(DAG, DL, VT); 2432 2433 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2434 SI::KernelInputOffsets::NGROUPS_Y, false); 2435 case Intrinsic::r600_read_ngroups_z: 2436 if (Subtarget->isAmdHsaOS()) 2437 return emitNonHSAIntrinsicError(DAG, DL, VT); 2438 2439 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2440 SI::KernelInputOffsets::NGROUPS_Z, false); 2441 case Intrinsic::r600_read_global_size_x: 2442 if (Subtarget->isAmdHsaOS()) 2443 return emitNonHSAIntrinsicError(DAG, DL, VT); 2444 2445 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2446 SI::KernelInputOffsets::GLOBAL_SIZE_X, false); 2447 case Intrinsic::r600_read_global_size_y: 2448 if (Subtarget->isAmdHsaOS()) 2449 return emitNonHSAIntrinsicError(DAG, DL, VT); 2450 2451 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2452 SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); 2453 case Intrinsic::r600_read_global_size_z: 2454 if (Subtarget->isAmdHsaOS()) 2455 return emitNonHSAIntrinsicError(DAG, DL, 
VT); 2456 2457 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2458 SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); 2459 case Intrinsic::r600_read_local_size_x: 2460 if (Subtarget->isAmdHsaOS()) 2461 return emitNonHSAIntrinsicError(DAG, DL, VT); 2462 2463 return lowerImplicitZextParam(DAG, Op, MVT::i16, 2464 SI::KernelInputOffsets::LOCAL_SIZE_X); 2465 case Intrinsic::r600_read_local_size_y: 2466 if (Subtarget->isAmdHsaOS()) 2467 return emitNonHSAIntrinsicError(DAG, DL, VT); 2468 2469 return lowerImplicitZextParam(DAG, Op, MVT::i16, 2470 SI::KernelInputOffsets::LOCAL_SIZE_Y); 2471 case Intrinsic::r600_read_local_size_z: 2472 if (Subtarget->isAmdHsaOS()) 2473 return emitNonHSAIntrinsicError(DAG, DL, VT); 2474 2475 return lowerImplicitZextParam(DAG, Op, MVT::i16, 2476 SI::KernelInputOffsets::LOCAL_SIZE_Z); 2477 case Intrinsic::amdgcn_workgroup_id_x: 2478 case Intrinsic::r600_read_tgid_x: 2479 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, 2480 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); 2481 case Intrinsic::amdgcn_workgroup_id_y: 2482 case Intrinsic::r600_read_tgid_y: 2483 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, 2484 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); 2485 case Intrinsic::amdgcn_workgroup_id_z: 2486 case Intrinsic::r600_read_tgid_z: 2487 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, 2488 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); 2489 case Intrinsic::amdgcn_workitem_id_x: 2490 case Intrinsic::r600_read_tidig_x: 2491 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 2492 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); 2493 case Intrinsic::amdgcn_workitem_id_y: 2494 case Intrinsic::r600_read_tidig_y: 2495 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 2496 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); 2497 case Intrinsic::amdgcn_workitem_id_z: 2498 case Intrinsic::r600_read_tidig_z: 2499 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 2500 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); 2501 case AMDGPUIntrinsic::SI_load_const: { 2502 SDValue Ops[] = { 2503 Op.getOperand(1), 2504 Op.getOperand(2) 2505 }; 2506 2507 MachineMemOperand *MMO = MF.getMachineMemOperand( 2508 MachinePointerInfo(), 2509 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2510 MachineMemOperand::MOInvariant, 2511 VT.getStoreSize(), 4); 2512 return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, 2513 Op->getVTList(), Ops, VT, MMO); 2514 } 2515 case AMDGPUIntrinsic::amdgcn_fdiv_fast: { 2516 return lowerFDIV_FAST(Op, DAG); 2517 } 2518 case AMDGPUIntrinsic::SI_vs_load_input: 2519 return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, 2520 Op.getOperand(1), 2521 Op.getOperand(2), 2522 Op.getOperand(3)); 2523 2524 case AMDGPUIntrinsic::SI_fs_constant: { 2525 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); 2526 SDValue Glue = M0.getValue(1); 2527 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, 2528 DAG.getConstant(2, DL, MVT::i32), // P0 2529 Op.getOperand(1), Op.getOperand(2), Glue); 2530 } 2531 case AMDGPUIntrinsic::SI_packf16: 2532 if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) 2533 return DAG.getUNDEF(MVT::i32); 2534 return Op; 2535 case AMDGPUIntrinsic::SI_fs_interp: { 2536 SDValue IJ = Op.getOperand(4); 2537 SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, 2538 DAG.getConstant(0, DL, MVT::i32)); 2539 SDValue J = 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, 2540 DAG.getConstant(1, DL, MVT::i32)); 2541 I = DAG.getNode(ISD::BITCAST, DL, MVT::f32, I); 2542 J = DAG.getNode(ISD::BITCAST, DL, MVT::f32, J); 2543 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); 2544 SDValue Glue = M0.getValue(1); 2545 SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, 2546 DAG.getVTList(MVT::f32, MVT::Glue), 2547 I, Op.getOperand(1), Op.getOperand(2), Glue); 2548 Glue = SDValue(P1.getNode(), 1); 2549 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, 2550 Op.getOperand(1), Op.getOperand(2), Glue); 2551 } 2552 case Intrinsic::amdgcn_interp_mov: { 2553 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); 2554 SDValue Glue = M0.getValue(1); 2555 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1), 2556 Op.getOperand(2), Op.getOperand(3), Glue); 2557 } 2558 case Intrinsic::amdgcn_interp_p1: { 2559 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); 2560 SDValue Glue = M0.getValue(1); 2561 return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), 2562 Op.getOperand(2), Op.getOperand(3), Glue); 2563 } 2564 case Intrinsic::amdgcn_interp_p2: { 2565 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); 2566 SDValue Glue = SDValue(M0.getNode(), 1); 2567 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), 2568 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), 2569 Glue); 2570 } 2571 case Intrinsic::amdgcn_sin: 2572 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); 2573 2574 case Intrinsic::amdgcn_cos: 2575 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); 2576 2577 case Intrinsic::amdgcn_log_clamp: { 2578 if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) 2579 return SDValue(); 2580 2581 DiagnosticInfoUnsupported BadIntrin( 2582 *MF.getFunction(), "intrinsic not supported on subtarget", 2583 DL.getDebugLoc()); 2584 DAG.getContext()->diagnose(BadIntrin); 2585 return DAG.getUNDEF(VT); 2586 } 2587 case Intrinsic::amdgcn_ldexp: 2588 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, 2589 Op.getOperand(1), Op.getOperand(2)); 2590 2591 case Intrinsic::amdgcn_fract: 2592 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); 2593 2594 case Intrinsic::amdgcn_class: 2595 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, 2596 Op.getOperand(1), Op.getOperand(2)); 2597 case Intrinsic::amdgcn_div_fmas: 2598 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, 2599 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), 2600 Op.getOperand(4)); 2601 2602 case Intrinsic::amdgcn_div_fixup: 2603 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, 2604 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 2605 2606 case Intrinsic::amdgcn_trig_preop: 2607 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, 2608 Op.getOperand(1), Op.getOperand(2)); 2609 case Intrinsic::amdgcn_div_scale: { 2610 // 3rd parameter required to be a constant. 2611 const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 2612 if (!Param) 2613 return DAG.getUNDEF(VT); 2614 2615 // Translate to the operands expected by the machine instruction. The 2616 // first parameter must be the same as the first instruction. 2617 SDValue Numerator = Op.getOperand(1); 2618 SDValue Denominator = Op.getOperand(2); 2619 2620 // Note this order is opposite of the machine instruction's operations, 2621 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. 
The 2622 // intrinsic has the numerator as the first operand to match a normal 2623 // division operation. 2624 2625 SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator; 2626 2627 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, 2628 Denominator, Numerator); 2629 } 2630 case Intrinsic::amdgcn_icmp: { 2631 const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 2632 int CondCode = CD->getSExtValue(); 2633 2634 if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE || 2635 CondCode >= ICmpInst::Predicate::BAD_ICMP_PREDICATE) 2636 return DAG.getUNDEF(VT); 2637 2638 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); 2639 ISD::CondCode CCOpcode = getICmpCondCode(IcInput); 2640 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1), 2641 Op.getOperand(2), DAG.getCondCode(CCOpcode)); 2642 } 2643 case Intrinsic::amdgcn_fcmp: { 2644 const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 2645 int CondCode = CD->getSExtValue(); 2646 2647 if (CondCode <= FCmpInst::Predicate::FCMP_FALSE || 2648 CondCode >= FCmpInst::Predicate::FCMP_TRUE) 2649 return DAG.getUNDEF(VT); 2650 2651 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode); 2652 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput); 2653 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1), 2654 Op.getOperand(2), DAG.getCondCode(CCOpcode)); 2655 } 2656 case Intrinsic::amdgcn_fmul_legacy: 2657 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, 2658 Op.getOperand(1), Op.getOperand(2)); 2659 case Intrinsic::amdgcn_sffbh: 2660 case AMDGPUIntrinsic::AMDGPU_flbit_i32: // Legacy name. 2661 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1)); 2662 default: 2663 return AMDGPUTargetLowering::LowerOperation(Op, DAG); 2664 } 2665 } 2666 2667 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, 2668 SelectionDAG &DAG) const { 2669 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 2670 SDLoc DL(Op); 2671 switch (IntrID) { 2672 case Intrinsic::amdgcn_atomic_inc: 2673 case Intrinsic::amdgcn_atomic_dec: { 2674 MemSDNode *M = cast<MemSDNode>(Op); 2675 unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ? 2676 AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC; 2677 SDValue Ops[] = { 2678 M->getOperand(0), // Chain 2679 M->getOperand(2), // Ptr 2680 M->getOperand(3) // Value 2681 }; 2682 2683 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops, 2684 M->getMemoryVT(), M->getMemOperand()); 2685 } 2686 case Intrinsic::amdgcn_buffer_load: 2687 case Intrinsic::amdgcn_buffer_load_format: { 2688 SDValue Ops[] = { 2689 Op.getOperand(0), // Chain 2690 Op.getOperand(2), // rsrc 2691 Op.getOperand(3), // vindex 2692 Op.getOperand(4), // offset 2693 Op.getOperand(5), // glc 2694 Op.getOperand(6) // slc 2695 }; 2696 MachineFunction &MF = DAG.getMachineFunction(); 2697 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2698 2699 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? 
2700 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; 2701 EVT VT = Op.getValueType(); 2702 EVT IntVT = VT.changeTypeToInteger(); 2703 2704 MachineMemOperand *MMO = MF.getMachineMemOperand( 2705 MachinePointerInfo(MFI->getBufferPSV()), 2706 MachineMemOperand::MOLoad, 2707 VT.getStoreSize(), VT.getStoreSize()); 2708 2709 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO); 2710 } 2711 default: 2712 return SDValue(); 2713 } 2714 } 2715 2716 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, 2717 SelectionDAG &DAG) const { 2718 MachineFunction &MF = DAG.getMachineFunction(); 2719 SDLoc DL(Op); 2720 SDValue Chain = Op.getOperand(0); 2721 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 2722 2723 switch (IntrinsicID) { 2724 case AMDGPUIntrinsic::SI_sendmsg: 2725 case Intrinsic::amdgcn_s_sendmsg: { 2726 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); 2727 SDValue Glue = Chain.getValue(1); 2728 return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, 2729 Op.getOperand(2), Glue); 2730 } 2731 case Intrinsic::amdgcn_s_sendmsghalt: { 2732 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); 2733 SDValue Glue = Chain.getValue(1); 2734 return DAG.getNode(AMDGPUISD::SENDMSGHALT, DL, MVT::Other, Chain, 2735 Op.getOperand(2), Glue); 2736 } 2737 case AMDGPUIntrinsic::SI_tbuffer_store: { 2738 SDValue Ops[] = { 2739 Chain, 2740 Op.getOperand(2), 2741 Op.getOperand(3), 2742 Op.getOperand(4), 2743 Op.getOperand(5), 2744 Op.getOperand(6), 2745 Op.getOperand(7), 2746 Op.getOperand(8), 2747 Op.getOperand(9), 2748 Op.getOperand(10), 2749 Op.getOperand(11), 2750 Op.getOperand(12), 2751 Op.getOperand(13), 2752 Op.getOperand(14) 2753 }; 2754 2755 EVT VT = Op.getOperand(3).getValueType(); 2756 2757 MachineMemOperand *MMO = MF.getMachineMemOperand( 2758 MachinePointerInfo(), 2759 MachineMemOperand::MOStore, 2760 VT.getStoreSize(), 4); 2761 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, 2762 Op->getVTList(), Ops, VT, MMO); 2763 } 2764 case AMDGPUIntrinsic::AMDGPU_kill: { 2765 SDValue Src = Op.getOperand(2); 2766 if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) { 2767 if (!K->isNegative()) 2768 return Chain; 2769 2770 SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32); 2771 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne); 2772 } 2773 2774 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src); 2775 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast); 2776 } 2777 case AMDGPUIntrinsic::SI_export: { 2778 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(2)); 2779 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(3)); 2780 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(4)); 2781 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(5)); 2782 const ConstantSDNode *Compr = cast<ConstantSDNode>(Op.getOperand(6)); 2783 2784 const SDValue Ops[] = { 2785 Chain, 2786 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), 2787 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1), 2788 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), 2789 DAG.getTargetConstant(Compr->getZExtValue(), DL, MVT::i1), 2790 Op.getOperand(7), // src0 2791 Op.getOperand(8), // src1 2792 Op.getOperand(9), // src2 2793 Op.getOperand(10) // src3 2794 }; 2795 2796 unsigned Opc = Done->isNullValue() ? 
2797     AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
2798     return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
2799   }
2800   default:
2801     return SDValue();
2802   }
2803 }
2804 
2805 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2806   SDLoc DL(Op);
2807   LoadSDNode *Load = cast<LoadSDNode>(Op);
2808   ISD::LoadExtType ExtType = Load->getExtensionType();
2809   EVT MemVT = Load->getMemoryVT();
2810 
2811   if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
2812     // FIXME: Copied from PPC
2813     // First, load into 32 bits, then truncate to the memory type.
2814 
2815     SDValue Chain = Load->getChain();
2816     SDValue BasePtr = Load->getBasePtr();
2817     MachineMemOperand *MMO = Load->getMemOperand();
2818 
2819     EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
2820 
2821     SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
2822                                    BasePtr, RealMemVT, MMO);
2823 
2824     SDValue Ops[] = {
2825       DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
2826       NewLD.getValue(1)
2827     };
2828 
2829     return DAG.getMergeValues(Ops, DL);
2830   }
2831 
2832   if (!MemVT.isVector())
2833     return SDValue();
2834 
2835   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
2836          "Custom lowering for non-i32 vectors hasn't been implemented.");
2837 
2838   unsigned AS = Load->getAddressSpace();
2839   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
2840                           AS, Load->getAlignment())) {
2841     SDValue Ops[2];
2842     std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2843     return DAG.getMergeValues(Ops, DL);
2844   }
2845 
2846   MachineFunction &MF = DAG.getMachineFunction();
2847   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2848   // If there is a possibility that flat instructions access scratch memory
2849   // then we need to use the same legalization rules we use for private.
2850   if (AS == AMDGPUAS::FLAT_ADDRESS)
2851     AS = MFI->hasFlatScratchInit() ?
2852          AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
2853 
2854   unsigned NumElements = MemVT.getVectorNumElements();
2855   switch (AS) {
2856   case AMDGPUAS::CONSTANT_ADDRESS:
2857     if (isMemOpUniform(Load))
2858       return SDValue();
2859     // Non-uniform loads will be selected to MUBUF instructions, so they
2860     // have the same legalization requirements as global and private
2861     // loads.
2862     //
2863     LLVM_FALLTHROUGH;
2864   case AMDGPUAS::GLOBAL_ADDRESS: {
2865     if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
2866         isMemOpHasNoClobberedMemOperand(Load))
2867       return SDValue();
2868     // Non-uniform loads will be selected to MUBUF instructions, so they
2869     // have the same legalization requirements as global and private
2870     // loads.
2871     //
2872   }
2873     LLVM_FALLTHROUGH;
2874   case AMDGPUAS::FLAT_ADDRESS:
2875     if (NumElements > 4)
2876       return SplitVectorLoad(Op, DAG);
2877     // v4 loads are supported for private and global memory.
2878     return SDValue();
2879   case AMDGPUAS::PRIVATE_ADDRESS: {
2880     // Depending on the setting of the private_element_size field in the
2881     // resource descriptor, we can only make private accesses up to a certain
2882     // size.
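    // The switch below implements that limit: a private_element_size of 4
    // scalarizes the load completely, 8 splits anything wider than 2 elements,
    // and 16 behaves like global/flat, splitting anything wider than 4
    // elements.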
2883 switch (Subtarget->getMaxPrivateElementSize()) { 2884 case 4: 2885 return scalarizeVectorLoad(Load, DAG); 2886 case 8: 2887 if (NumElements > 2) 2888 return SplitVectorLoad(Op, DAG); 2889 return SDValue(); 2890 case 16: 2891 // Same as global/flat 2892 if (NumElements > 4) 2893 return SplitVectorLoad(Op, DAG); 2894 return SDValue(); 2895 default: 2896 llvm_unreachable("unsupported private_element_size"); 2897 } 2898 } 2899 case AMDGPUAS::LOCAL_ADDRESS: { 2900 if (NumElements > 2) 2901 return SplitVectorLoad(Op, DAG); 2902 2903 if (NumElements == 2) 2904 return SDValue(); 2905 2906 // If properly aligned, if we split we might be able to use ds_read_b64. 2907 return SplitVectorLoad(Op, DAG); 2908 } 2909 default: 2910 return SDValue(); 2911 } 2912 } 2913 2914 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 2915 if (Op.getValueType() != MVT::i64) 2916 return SDValue(); 2917 2918 SDLoc DL(Op); 2919 SDValue Cond = Op.getOperand(0); 2920 2921 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 2922 SDValue One = DAG.getConstant(1, DL, MVT::i32); 2923 2924 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); 2925 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); 2926 2927 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); 2928 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); 2929 2930 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); 2931 2932 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); 2933 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); 2934 2935 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); 2936 2937 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); 2938 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); 2939 } 2940 2941 // Catch division cases where we can use shortcuts with rcp and rsq 2942 // instructions. 2943 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, 2944 SelectionDAG &DAG) const { 2945 SDLoc SL(Op); 2946 SDValue LHS = Op.getOperand(0); 2947 SDValue RHS = Op.getOperand(1); 2948 EVT VT = Op.getValueType(); 2949 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; 2950 2951 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { 2952 if (Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || 2953 VT == MVT::f16) { 2954 if (CLHS->isExactlyValue(1.0)) { 2955 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 2956 // the CI documentation has a worst case error of 1 ulp. 2957 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 2958 // use it as long as we aren't trying to use denormals. 2959 // 2960 // v_rcp_f16 and v_rsq_f16 DO support denormals. 2961 2962 // 1.0 / sqrt(x) -> rsq(x) 2963 2964 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP 2965 // error seems really high at 2^29 ULP. 2966 if (RHS.getOpcode() == ISD::FSQRT) 2967 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); 2968 2969 // 1.0 / x -> rcp(x) 2970 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 2971 } 2972 2973 // Same as for 1.0, but expand the sign out of the constant. 
2974 if (CLHS->isExactlyValue(-1.0)) { 2975 // -1.0 / x -> rcp (fneg x) 2976 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 2977 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS); 2978 } 2979 } 2980 } 2981 2982 const SDNodeFlags *Flags = Op->getFlags(); 2983 2984 if (Unsafe || Flags->hasAllowReciprocal()) { 2985 // Turn into multiply by the reciprocal. 2986 // x / y -> x * (1.0 / y) 2987 SDNodeFlags Flags; 2988 Flags.setUnsafeAlgebra(true); 2989 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 2990 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags); 2991 } 2992 2993 return SDValue(); 2994 } 2995 2996 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, 2997 EVT VT, SDValue A, SDValue B, SDValue GlueChain) { 2998 if (GlueChain->getNumValues() <= 1) { 2999 return DAG.getNode(Opcode, SL, VT, A, B); 3000 } 3001 3002 assert(GlueChain->getNumValues() == 3); 3003 3004 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); 3005 switch (Opcode) { 3006 default: llvm_unreachable("no chain equivalent for opcode"); 3007 case ISD::FMUL: 3008 Opcode = AMDGPUISD::FMUL_W_CHAIN; 3009 break; 3010 } 3011 3012 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, 3013 GlueChain.getValue(2)); 3014 } 3015 3016 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, 3017 EVT VT, SDValue A, SDValue B, SDValue C, 3018 SDValue GlueChain) { 3019 if (GlueChain->getNumValues() <= 1) { 3020 return DAG.getNode(Opcode, SL, VT, A, B, C); 3021 } 3022 3023 assert(GlueChain->getNumValues() == 3); 3024 3025 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); 3026 switch (Opcode) { 3027 default: llvm_unreachable("no chain equivalent for opcode"); 3028 case ISD::FMA: 3029 Opcode = AMDGPUISD::FMA_W_CHAIN; 3030 break; 3031 } 3032 3033 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C, 3034 GlueChain.getValue(2)); 3035 } 3036 3037 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { 3038 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) 3039 return FastLowered; 3040 3041 SDLoc SL(Op); 3042 SDValue Src0 = Op.getOperand(0); 3043 SDValue Src1 = Op.getOperand(1); 3044 3045 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0); 3046 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1); 3047 3048 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1); 3049 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1); 3050 3051 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32); 3052 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag); 3053 3054 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0); 3055 } 3056 3057 // Faster 2.5 ULP division that does not support denormals. 
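// A sketch of the scaling trick used below, implied by the constants in the
// code: 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as f32 bit patterns. If
// |rhs| exceeds 2^96, the denominator is pre-scaled by 2^-32 so that the rcp
// input stays in range, and the final product is multiplied by the same
// factor to compensate:
//
//   scale  = |rhs| > 2^96 ? 2^-32 : 1.0
//   result = scale * (lhs * rcp(rhs * scale))   // == lhs / rhs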
3058 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { 3059 SDLoc SL(Op); 3060 SDValue LHS = Op.getOperand(1); 3061 SDValue RHS = Op.getOperand(2); 3062 3063 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); 3064 3065 const APFloat K0Val(BitsToFloat(0x6f800000)); 3066 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); 3067 3068 const APFloat K1Val(BitsToFloat(0x2f800000)); 3069 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); 3070 3071 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 3072 3073 EVT SetCCVT = 3074 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); 3075 3076 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); 3077 3078 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); 3079 3080 // TODO: Should this propagate fast-math-flags? 3081 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); 3082 3083 // rcp does not support denormals. 3084 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); 3085 3086 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); 3087 3088 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); 3089 } 3090 3091 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { 3092 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) 3093 return FastLowered; 3094 3095 SDLoc SL(Op); 3096 SDValue LHS = Op.getOperand(0); 3097 SDValue RHS = Op.getOperand(1); 3098 3099 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 3100 3101 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); 3102 3103 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, 3104 RHS, RHS, LHS); 3105 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, 3106 LHS, RHS, LHS); 3107 3108 // Denominator is scaled to not be denormal, so using rcp is ok. 
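  // Rough shape of the refinement below (a reading of the code, not a hardware
  // requirement): Fma0/Fma1 perform one Newton-Raphson step on ApproxRcp, Mul
  // forms the initial quotient NumeratorScaled * Fma1, Fma2..Fma4 compute
  // residuals and a corrected quotient, and DIV_FMAS/DIV_FIXUP apply the final
  // correction and special-case handling.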
3109 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, 3110 DenominatorScaled); 3111 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, 3112 DenominatorScaled); 3113 3114 const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE | 3115 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3116 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3117 3118 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16); 3119 3120 if (!Subtarget->hasFP32Denormals()) { 3121 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 3122 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, 3123 SL, MVT::i32); 3124 SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, 3125 DAG.getEntryNode(), 3126 EnableDenormValue, BitField); 3127 SDValue Ops[3] = { 3128 NegDivScale0, 3129 EnableDenorm.getValue(0), 3130 EnableDenorm.getValue(1) 3131 }; 3132 3133 NegDivScale0 = DAG.getMergeValues(Ops, SL); 3134 } 3135 3136 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, 3137 ApproxRcp, One, NegDivScale0); 3138 3139 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, 3140 ApproxRcp, Fma0); 3141 3142 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, 3143 Fma1, Fma1); 3144 3145 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, 3146 NumeratorScaled, Mul); 3147 3148 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2); 3149 3150 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, 3151 NumeratorScaled, Fma3); 3152 3153 if (!Subtarget->hasFP32Denormals()) { 3154 const SDValue DisableDenormValue = 3155 DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); 3156 SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, 3157 Fma4.getValue(1), 3158 DisableDenormValue, 3159 BitField, 3160 Fma4.getValue(2)); 3161 3162 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, 3163 DisableDenorm, DAG.getRoot()); 3164 DAG.setRoot(OutputChain); 3165 } 3166 3167 SDValue Scale = NumeratorScaled.getValue(1); 3168 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, 3169 Fma4, Fma1, Fma3, Scale); 3170 3171 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); 3172 } 3173 3174 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { 3175 if (DAG.getTarget().Options.UnsafeFPMath) 3176 return lowerFastUnsafeFDIV(Op, DAG); 3177 3178 SDLoc SL(Op); 3179 SDValue X = Op.getOperand(0); 3180 SDValue Y = Op.getOperand(1); 3181 3182 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); 3183 3184 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); 3185 3186 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); 3187 3188 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); 3189 3190 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); 3191 3192 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); 3193 3194 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); 3195 3196 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); 3197 3198 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); 3199 3200 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); 3201 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); 3202 3203 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, 3204 NegDivScale0, Mul, DivScale1); 3205 3206 SDValue Scale; 
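  // Editor's note (clarification, not part of the original source): the
  // branch below only chooses where the DIV_FMAS scale flag comes from. On SI
  // it cannot be read from div_scale directly, so it is reconstructed by
  // comparing the high dwords of the original and scaled numerator and
  // denominator to see which operand was actually rescaled, then XORing the
  // two comparisons to form the flag.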

  if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.

    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);

    // Figure out which scale to use for div_fmas.
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

    SDValue Scale0Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    SDValue Scale1Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getValue(1);
  }

  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
                             Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}

SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);

  if (VT == MVT::f16)
    return LowerFDIV16(Op, DAG);

  llvm_unreachable("Unexpected type for fdiv");
}

SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    return DAG.getTruncStore(Store->getChain(), DL,
       DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
       Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                          AS, Store->getAlignment())) {
    return expandUnalignedStore(Store, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instructions access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    AS = MFI->hasFlatScratchInit() ?
3282 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; 3283 3284 unsigned NumElements = VT.getVectorNumElements(); 3285 switch (AS) { 3286 case AMDGPUAS::GLOBAL_ADDRESS: 3287 case AMDGPUAS::FLAT_ADDRESS: 3288 if (NumElements > 4) 3289 return SplitVectorStore(Op, DAG); 3290 return SDValue(); 3291 case AMDGPUAS::PRIVATE_ADDRESS: { 3292 switch (Subtarget->getMaxPrivateElementSize()) { 3293 case 4: 3294 return scalarizeVectorStore(Store, DAG); 3295 case 8: 3296 if (NumElements > 2) 3297 return SplitVectorStore(Op, DAG); 3298 return SDValue(); 3299 case 16: 3300 if (NumElements > 4) 3301 return SplitVectorStore(Op, DAG); 3302 return SDValue(); 3303 default: 3304 llvm_unreachable("unsupported private_element_size"); 3305 } 3306 } 3307 case AMDGPUAS::LOCAL_ADDRESS: { 3308 if (NumElements > 2) 3309 return SplitVectorStore(Op, DAG); 3310 3311 if (NumElements == 2) 3312 return Op; 3313 3314 // If properly aligned, if we split we might be able to use ds_write_b64. 3315 return SplitVectorStore(Op, DAG); 3316 } 3317 default: 3318 llvm_unreachable("unhandled address space"); 3319 } 3320 } 3321 3322 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 3323 SDLoc DL(Op); 3324 EVT VT = Op.getValueType(); 3325 SDValue Arg = Op.getOperand(0); 3326 // TODO: Should this propagate fast-math-flags? 3327 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, 3328 DAG.getNode(ISD::FMUL, DL, VT, Arg, 3329 DAG.getConstantFP(0.5/M_PI, DL, 3330 VT))); 3331 3332 switch (Op.getOpcode()) { 3333 case ISD::FCOS: 3334 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); 3335 case ISD::FSIN: 3336 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); 3337 default: 3338 llvm_unreachable("Wrong trig opcode"); 3339 } 3340 } 3341 3342 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 3343 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op); 3344 assert(AtomicNode->isCompareAndSwap()); 3345 unsigned AS = AtomicNode->getAddressSpace(); 3346 3347 // No custom lowering required for local address space 3348 if (!isFlatGlobalAddrSpace(AS)) 3349 return Op; 3350 3351 // Non-local address space requires custom lowering for atomic compare 3352 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2 3353 SDLoc DL(Op); 3354 SDValue ChainIn = Op.getOperand(0); 3355 SDValue Addr = Op.getOperand(1); 3356 SDValue Old = Op.getOperand(2); 3357 SDValue New = Op.getOperand(3); 3358 EVT VT = Op.getValueType(); 3359 MVT SimpleVT = VT.getSimpleVT(); 3360 MVT VecType = MVT::getVectorVT(SimpleVT, 2); 3361 3362 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old}); 3363 SDValue Ops[] = { ChainIn, Addr, NewOld }; 3364 3365 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(), 3366 Ops, VT, AtomicNode->getMemOperand()); 3367 } 3368 3369 //===----------------------------------------------------------------------===// 3370 // Custom DAG optimizations 3371 //===----------------------------------------------------------------------===// 3372 3373 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, 3374 DAGCombinerInfo &DCI) const { 3375 EVT VT = N->getValueType(0); 3376 EVT ScalarVT = VT.getScalarType(); 3377 if (ScalarVT != MVT::f32) 3378 return SDValue(); 3379 3380 SelectionDAG &DAG = DCI.DAG; 3381 SDLoc DL(N); 3382 3383 SDValue Src = N->getOperand(0); 3384 EVT SrcVT = Src.getValueType(); 3385 3386 // TODO: We could try to match extracting the higher bytes, which would be 3387 // easier if i8 vectors weren't 
  // promoted to i32 vectors, particularly after types are legalized. v4i8 ->
  // v4f32 is probably the only case to worry about in practice.
  if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
      DCI.AddToWorklist(Cvt.getNode());
      return Cvt;
    }
  }

  return SDValue();
}

/// \brief Return true if the given offset size in bytes can be folded into
/// the immediate offsets of a memory instruction for the given address space.
static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
                          const SISubtarget &STI) {
  switch (AS) {
  case AMDGPUAS::GLOBAL_ADDRESS: {
    // MUBUF instructions have a 12-bit offset in bytes.
    return isUInt<12>(OffsetSize);
  }
  case AMDGPUAS::CONSTANT_ADDRESS: {
    // SMRD instructions have an 8-bit offset in dwords on SI and
    // a 20-bit offset in bytes on VI.
    if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
      return isUInt<20>(OffsetSize);
    else
      return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
  }
  case AMDGPUAS::LOCAL_ADDRESS:
  case AMDGPUAS::REGION_ADDRESS: {
    // The single offset versions have a 16-bit offset in bytes.
    return isUInt<16>(OffsetSize);
  }
  case AMDGPUAS::PRIVATE_ADDRESS:
    // Indirect register addressing does not use any offsets.
  default:
    return false;
  }
}

// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)

// This is a variant of
// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
//
// The normal DAG combiner will do this, but only if the add has one use since
// that would increase the number of instructions.
//
// This prevents us from seeing a constant offset that can be folded into a
// memory instruction's addressing mode. If we know the resulting add offset of
// a pointer can be folded into an addressing offset, we can replace the pointer
// operand with the add of the new constant offset. This eliminates one of the
// uses, and may allow the remaining use to also be simplified.
//
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
                                               unsigned AddrSpace,
                                               DAGCombinerInfo &DCI) const {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  if (N0.getOpcode() != ISD::ADD)
    return SDValue();

  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
  if (!CN1)
    return SDValue();

  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!CAdd)
    return SDValue();

  // If the resulting offset is too large, we can't fold it into the addressing
  // mode offset.
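  // Editor's note (worked example, not part of the original source): for a DS
  // pointer computed as (shl (add %x, 16), 2) we have c1 = 16 and c2 = 2, so
  // the combined byte offset below is 16 << 2 = 64. That fits the 16-bit DS
  // immediate offset, so the expression is rewritten to (add (shl %x, 2), 64)
  // and the constant can later be folded into the instruction's offset field.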
3463 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); 3464 if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget())) 3465 return SDValue(); 3466 3467 SelectionDAG &DAG = DCI.DAG; 3468 SDLoc SL(N); 3469 EVT VT = N->getValueType(0); 3470 3471 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); 3472 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); 3473 3474 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); 3475 } 3476 3477 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N, 3478 DAGCombinerInfo &DCI) const { 3479 SDValue Ptr = N->getBasePtr(); 3480 SelectionDAG &DAG = DCI.DAG; 3481 SDLoc SL(N); 3482 3483 // TODO: We could also do this for multiplies. 3484 unsigned AS = N->getAddressSpace(); 3485 if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { 3486 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); 3487 if (NewPtr) { 3488 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end()); 3489 3490 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr; 3491 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); 3492 } 3493 } 3494 3495 return SDValue(); 3496 } 3497 3498 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) { 3499 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) || 3500 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) || 3501 (Opc == ISD::XOR && Val == 0); 3502 } 3503 3504 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This 3505 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit 3506 // integer combine opportunities since most 64-bit operations are decomposed 3507 // this way. TODO: We won't want this for SALU especially if it is an inline 3508 // immediate. 3509 SDValue SITargetLowering::splitBinaryBitConstantOp( 3510 DAGCombinerInfo &DCI, 3511 const SDLoc &SL, 3512 unsigned Opc, SDValue LHS, 3513 const ConstantSDNode *CRHS) const { 3514 uint64_t Val = CRHS->getZExtValue(); 3515 uint32_t ValLo = Lo_32(Val); 3516 uint32_t ValHi = Hi_32(Val); 3517 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 3518 3519 if ((bitOpWithConstantIsReducible(Opc, ValLo) || 3520 bitOpWithConstantIsReducible(Opc, ValHi)) || 3521 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) { 3522 // If we need to materialize a 64-bit immediate, it will be split up later 3523 // anyway. Avoid creating the harder to understand 64-bit immediate 3524 // materialization. 
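    // Editor's note (worked example, not part of the original source):
    // (and i64:x, 0x00000000ffffffff) splits into (and lo32(x), 0xffffffff)
    // and (and hi32(x), 0); both halves fold away, leaving a build_pair of
    // lo32(x) and zero instead of a materialized 64-bit mask.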
3525 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi); 3526 } 3527 3528 return SDValue(); 3529 } 3530 3531 SDValue SITargetLowering::performAndCombine(SDNode *N, 3532 DAGCombinerInfo &DCI) const { 3533 if (DCI.isBeforeLegalize()) 3534 return SDValue(); 3535 3536 SelectionDAG &DAG = DCI.DAG; 3537 EVT VT = N->getValueType(0); 3538 SDValue LHS = N->getOperand(0); 3539 SDValue RHS = N->getOperand(1); 3540 3541 3542 if (VT == MVT::i64) { 3543 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); 3544 if (CRHS) { 3545 if (SDValue Split 3546 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) 3547 return Split; 3548 } 3549 } 3550 3551 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> 3552 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) 3553 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) { 3554 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); 3555 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); 3556 3557 SDValue X = LHS.getOperand(0); 3558 SDValue Y = RHS.getOperand(0); 3559 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) 3560 return SDValue(); 3561 3562 if (LCC == ISD::SETO) { 3563 if (X != LHS.getOperand(1)) 3564 return SDValue(); 3565 3566 if (RCC == ISD::SETUNE) { 3567 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); 3568 if (!C1 || !C1->isInfinity() || C1->isNegative()) 3569 return SDValue(); 3570 3571 const uint32_t Mask = SIInstrFlags::N_NORMAL | 3572 SIInstrFlags::N_SUBNORMAL | 3573 SIInstrFlags::N_ZERO | 3574 SIInstrFlags::P_ZERO | 3575 SIInstrFlags::P_SUBNORMAL | 3576 SIInstrFlags::P_NORMAL; 3577 3578 static_assert(((~(SIInstrFlags::S_NAN | 3579 SIInstrFlags::Q_NAN | 3580 SIInstrFlags::N_INFINITY | 3581 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, 3582 "mask not equal"); 3583 3584 SDLoc DL(N); 3585 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 3586 X, DAG.getConstant(Mask, DL, MVT::i32)); 3587 } 3588 } 3589 } 3590 3591 return SDValue(); 3592 } 3593 3594 SDValue SITargetLowering::performOrCombine(SDNode *N, 3595 DAGCombinerInfo &DCI) const { 3596 SelectionDAG &DAG = DCI.DAG; 3597 SDValue LHS = N->getOperand(0); 3598 SDValue RHS = N->getOperand(1); 3599 3600 EVT VT = N->getValueType(0); 3601 if (VT == MVT::i1) { 3602 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) 3603 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && 3604 RHS.getOpcode() == AMDGPUISD::FP_CLASS) { 3605 SDValue Src = LHS.getOperand(0); 3606 if (Src != RHS.getOperand(0)) 3607 return SDValue(); 3608 3609 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); 3610 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); 3611 if (!CLHS || !CRHS) 3612 return SDValue(); 3613 3614 // Only 10 bits are used. 3615 static const uint32_t MaxMask = 0x3ff; 3616 3617 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; 3618 SDLoc DL(N); 3619 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 3620 Src, DAG.getConstant(NewMask, DL, MVT::i32)); 3621 } 3622 3623 return SDValue(); 3624 } 3625 3626 if (VT != MVT::i64) 3627 return SDValue(); 3628 3629 // TODO: This could be a generic combine with a predicate for extracting the 3630 // high half of an integer being free. 
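  // Editor's note (clarification, not part of the original source): on this
  // target a 64-bit value already lives in a pair of 32-bit registers, so
  // splitting out hi_32/lo_32 below is just a subregister access and costs no
  // extra instructions, which is what makes the rewrite profitable.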
3631 3632 // (or i64:x, (zero_extend i32:y)) -> 3633 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x))) 3634 if (LHS.getOpcode() == ISD::ZERO_EXTEND && 3635 RHS.getOpcode() != ISD::ZERO_EXTEND) 3636 std::swap(LHS, RHS); 3637 3638 if (RHS.getOpcode() == ISD::ZERO_EXTEND) { 3639 SDValue ExtSrc = RHS.getOperand(0); 3640 EVT SrcVT = ExtSrc.getValueType(); 3641 if (SrcVT == MVT::i32) { 3642 SDLoc SL(N); 3643 SDValue LowLHS, HiBits; 3644 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG); 3645 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc); 3646 3647 DCI.AddToWorklist(LowOr.getNode()); 3648 DCI.AddToWorklist(HiBits.getNode()); 3649 3650 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 3651 LowOr, HiBits); 3652 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 3653 } 3654 } 3655 3656 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 3657 if (CRHS) { 3658 if (SDValue Split 3659 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS)) 3660 return Split; 3661 } 3662 3663 return SDValue(); 3664 } 3665 3666 SDValue SITargetLowering::performXorCombine(SDNode *N, 3667 DAGCombinerInfo &DCI) const { 3668 EVT VT = N->getValueType(0); 3669 if (VT != MVT::i64) 3670 return SDValue(); 3671 3672 SDValue LHS = N->getOperand(0); 3673 SDValue RHS = N->getOperand(1); 3674 3675 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); 3676 if (CRHS) { 3677 if (SDValue Split 3678 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS)) 3679 return Split; 3680 } 3681 3682 return SDValue(); 3683 } 3684 3685 SDValue SITargetLowering::performClassCombine(SDNode *N, 3686 DAGCombinerInfo &DCI) const { 3687 SelectionDAG &DAG = DCI.DAG; 3688 SDValue Mask = N->getOperand(1); 3689 3690 // fp_class x, 0 -> false 3691 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { 3692 if (CMask->isNullValue()) 3693 return DAG.getConstant(0, SDLoc(N), MVT::i1); 3694 } 3695 3696 if (N->getOperand(0).isUndef()) 3697 return DAG.getUNDEF(MVT::i1); 3698 3699 return SDValue(); 3700 } 3701 3702 // Constant fold canonicalize. 3703 SDValue SITargetLowering::performFCanonicalizeCombine( 3704 SDNode *N, 3705 DAGCombinerInfo &DCI) const { 3706 ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); 3707 if (!CFP) 3708 return SDValue(); 3709 3710 SelectionDAG &DAG = DCI.DAG; 3711 const APFloat &C = CFP->getValueAPF(); 3712 3713 // Flush denormals to 0 if not enabled. 3714 if (C.isDenormal()) { 3715 EVT VT = N->getValueType(0); 3716 if (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) 3717 return DAG.getConstantFP(0.0, SDLoc(N), VT); 3718 3719 if (VT == MVT::f64 && !Subtarget->hasFP64Denormals()) 3720 return DAG.getConstantFP(0.0, SDLoc(N), VT); 3721 3722 if (VT == MVT::f16 && !Subtarget->hasFP16Denormals()) 3723 return DAG.getConstantFP(0.0, SDLoc(N), VT); 3724 } 3725 3726 if (C.isNaN()) { 3727 EVT VT = N->getValueType(0); 3728 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics()); 3729 if (C.isSignaling()) { 3730 // Quiet a signaling NaN. 3731 return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); 3732 } 3733 3734 // Make sure it is the canonical NaN bitpattern. 3735 // 3736 // TODO: Can we use -1 as the canonical NaN value since it's an inline 3737 // immediate? 
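    // Editor's note (clarification, not part of the original source): the
    // canonical quiet NaN produced by APFloat::getQNaN has a zero payload:
    // 0x7E00 for f16, 0x7FC00000 for f32, 0x7FF8000000000000 for f64. Any
    // other NaN encoding is rewritten to that pattern below.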
3738 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt()) 3739 return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); 3740 } 3741 3742 return SDValue(CFP, 0); 3743 } 3744 3745 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { 3746 switch (Opc) { 3747 case ISD::FMAXNUM: 3748 return AMDGPUISD::FMAX3; 3749 case ISD::SMAX: 3750 return AMDGPUISD::SMAX3; 3751 case ISD::UMAX: 3752 return AMDGPUISD::UMAX3; 3753 case ISD::FMINNUM: 3754 return AMDGPUISD::FMIN3; 3755 case ISD::SMIN: 3756 return AMDGPUISD::SMIN3; 3757 case ISD::UMIN: 3758 return AMDGPUISD::UMIN3; 3759 default: 3760 llvm_unreachable("Not a min/max opcode"); 3761 } 3762 } 3763 3764 static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, 3765 SDValue Op0, SDValue Op1, bool Signed) { 3766 ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1); 3767 if (!K1) 3768 return SDValue(); 3769 3770 ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1)); 3771 if (!K0) 3772 return SDValue(); 3773 3774 if (Signed) { 3775 if (K0->getAPIntValue().sge(K1->getAPIntValue())) 3776 return SDValue(); 3777 } else { 3778 if (K0->getAPIntValue().uge(K1->getAPIntValue())) 3779 return SDValue(); 3780 } 3781 3782 EVT VT = K0->getValueType(0); 3783 3784 MVT NVT = MVT::i32; 3785 unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3786 3787 SDValue Tmp1, Tmp2, Tmp3; 3788 Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); 3789 Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); 3790 Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); 3791 3792 if (VT == MVT::i16) { 3793 Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT, 3794 Tmp1, Tmp2, Tmp3); 3795 3796 return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1); 3797 } else 3798 return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, 3799 Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); 3800 } 3801 3802 static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { 3803 if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) 3804 return true; 3805 3806 return DAG.isKnownNeverNaN(Op); 3807 } 3808 3809 static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, 3810 SDValue Op0, SDValue Op1) { 3811 ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1); 3812 if (!K1) 3813 return SDValue(); 3814 3815 ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1)); 3816 if (!K0) 3817 return SDValue(); 3818 3819 // Ordered >= (although NaN inputs should have folded away by now). 3820 APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF()); 3821 if (Cmp == APFloat::cmpGreaterThan) 3822 return SDValue(); 3823 3824 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a 3825 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then 3826 // give the other result, which is different from med3 with a NaN input. 3827 SDValue Var = Op0.getOperand(0); 3828 if (!isKnownNeverSNan(DAG, Var)) 3829 return SDValue(); 3830 3831 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), 3832 Var, SDValue(K0, 0), SDValue(K1, 0)); 3833 } 3834 3835 SDValue SITargetLowering::performMinMaxCombine(SDNode *N, 3836 DAGCombinerInfo &DCI) const { 3837 SelectionDAG &DAG = DCI.DAG; 3838 3839 unsigned Opc = N->getOpcode(); 3840 SDValue Op0 = N->getOperand(0); 3841 SDValue Op1 = N->getOperand(1); 3842 3843 // Only do this if the inner op has one use since this will just increases 3844 // register pressure for no benefit. 
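  //
  // Editor's note (clarification, not part of the original source): besides
  // the min3/max3 folds below, the med3 cases implement clamps, e.g.
  //   min(max(x, -4), 17)            --> smed3(x, -4, 17)
  //   fminnum(fmaxnum(x, 0.0), 1.0)  --> fmed3(x, 0.0, 1.0)   (if x is known
  //                                      not to be a signaling NaN)
  // provided the low bound does not exceed the high bound, which
  // performIntMed3ImmCombine / performFPMed3ImmCombine verify.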
3845 3846 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) { 3847 // max(max(a, b), c) -> max3(a, b, c) 3848 // min(min(a, b), c) -> min3(a, b, c) 3849 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { 3850 SDLoc DL(N); 3851 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 3852 DL, 3853 N->getValueType(0), 3854 Op0.getOperand(0), 3855 Op0.getOperand(1), 3856 Op1); 3857 } 3858 3859 // Try commuted. 3860 // max(a, max(b, c)) -> max3(a, b, c) 3861 // min(a, min(b, c)) -> min3(a, b, c) 3862 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { 3863 SDLoc DL(N); 3864 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 3865 DL, 3866 N->getValueType(0), 3867 Op0, 3868 Op1.getOperand(0), 3869 Op1.getOperand(1)); 3870 } 3871 } 3872 3873 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) 3874 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { 3875 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true)) 3876 return Med3; 3877 } 3878 3879 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { 3880 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false)) 3881 return Med3; 3882 } 3883 3884 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) 3885 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || 3886 (Opc == AMDGPUISD::FMIN_LEGACY && 3887 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && 3888 N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) { 3889 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) 3890 return Res; 3891 } 3892 3893 return SDValue(); 3894 } 3895 3896 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, 3897 const SDNode *N0, 3898 const SDNode *N1) const { 3899 EVT VT = N0->getValueType(0); 3900 3901 // Only do this if we are not trying to support denormals. v_mad_f32 does not 3902 // support denormals ever. 3903 if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || 3904 (VT == MVT::f16 && !Subtarget->hasFP16Denormals())) 3905 return ISD::FMAD; 3906 3907 const TargetOptions &Options = DAG.getTarget().Options; 3908 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || 3909 Options.UnsafeFPMath || 3910 (cast<BinaryWithFlagsSDNode>(N0)->Flags.hasUnsafeAlgebra() && 3911 cast<BinaryWithFlagsSDNode>(N1)->Flags.hasUnsafeAlgebra())) && 3912 isFMAFasterThanFMulAndFAdd(VT)) { 3913 return ISD::FMA; 3914 } 3915 3916 return 0; 3917 } 3918 3919 SDValue SITargetLowering::performFAddCombine(SDNode *N, 3920 DAGCombinerInfo &DCI) const { 3921 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 3922 return SDValue(); 3923 3924 SelectionDAG &DAG = DCI.DAG; 3925 EVT VT = N->getValueType(0); 3926 assert(!VT.isVector()); 3927 3928 SDLoc SL(N); 3929 SDValue LHS = N->getOperand(0); 3930 SDValue RHS = N->getOperand(1); 3931 3932 // These should really be instruction patterns, but writing patterns with 3933 // source modiifiers is a pain. 
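  //
  // Editor's note (clarification, not part of the original source): the two
  // patterns below rely on (fadd a, a) being exactly 2 * a, so the sum can be
  // refolded into mad/fma(a, 2.0, b); 2.0 is an inline constant on this
  // target, so the multiplier needs no extra register or literal.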
3934 3935 // fadd (fadd (a, a), b) -> mad 2.0, a, b 3936 if (LHS.getOpcode() == ISD::FADD) { 3937 SDValue A = LHS.getOperand(0); 3938 if (A == LHS.getOperand(1)) { 3939 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode()); 3940 if (FusedOp != 0) { 3941 const SDValue Two = DAG.getConstantFP(2.0, SL, VT); 3942 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS); 3943 } 3944 } 3945 } 3946 3947 // fadd (b, fadd (a, a)) -> mad 2.0, a, b 3948 if (RHS.getOpcode() == ISD::FADD) { 3949 SDValue A = RHS.getOperand(0); 3950 if (A == RHS.getOperand(1)) { 3951 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode()); 3952 if (FusedOp != 0) { 3953 const SDValue Two = DAG.getConstantFP(2.0, SL, VT); 3954 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS); 3955 } 3956 } 3957 } 3958 3959 return SDValue(); 3960 } 3961 3962 SDValue SITargetLowering::performFSubCombine(SDNode *N, 3963 DAGCombinerInfo &DCI) const { 3964 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 3965 return SDValue(); 3966 3967 SelectionDAG &DAG = DCI.DAG; 3968 SDLoc SL(N); 3969 EVT VT = N->getValueType(0); 3970 assert(!VT.isVector()); 3971 3972 // Try to get the fneg to fold into the source modifier. This undoes generic 3973 // DAG combines and folds them into the mad. 3974 // 3975 // Only do this if we are not trying to support denormals. v_mad_f32 does 3976 // not support denormals ever. 3977 SDValue LHS = N->getOperand(0); 3978 SDValue RHS = N->getOperand(1); 3979 if (LHS.getOpcode() == ISD::FADD) { 3980 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) 3981 SDValue A = LHS.getOperand(0); 3982 if (A == LHS.getOperand(1)) { 3983 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode()); 3984 if (FusedOp != 0){ 3985 const SDValue Two = DAG.getConstantFP(2.0, SL, VT); 3986 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 3987 3988 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS); 3989 } 3990 } 3991 } 3992 3993 if (RHS.getOpcode() == ISD::FADD) { 3994 // (fsub c, (fadd a, a)) -> mad -2.0, a, c 3995 3996 SDValue A = RHS.getOperand(0); 3997 if (A == RHS.getOperand(1)) { 3998 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode()); 3999 if (FusedOp != 0){ 4000 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT); 4001 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS); 4002 } 4003 } 4004 } 4005 4006 return SDValue(); 4007 } 4008 4009 SDValue SITargetLowering::performSetCCCombine(SDNode *N, 4010 DAGCombinerInfo &DCI) const { 4011 SelectionDAG &DAG = DCI.DAG; 4012 SDLoc SL(N); 4013 4014 SDValue LHS = N->getOperand(0); 4015 SDValue RHS = N->getOperand(1); 4016 EVT VT = LHS.getValueType(); 4017 4018 if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() && 4019 VT != MVT::f16)) 4020 return SDValue(); 4021 4022 // Match isinf pattern 4023 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) 4024 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 4025 if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { 4026 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 4027 if (!CRHS) 4028 return SDValue(); 4029 4030 const APFloat &APF = CRHS->getValueAPF(); 4031 if (APF.isInfinity() && !APF.isNegative()) { 4032 unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; 4033 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), 4034 DAG.getConstant(Mask, SL, MVT::i32)); 4035 } 4036 } 4037 4038 return SDValue(); 4039 } 4040 4041 SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, 4042 DAGCombinerInfo &DCI) const { 4043 SelectionDAG &DAG = DCI.DAG; 
4044 SDLoc SL(N); 4045 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; 4046 4047 SDValue Src = N->getOperand(0); 4048 SDValue Srl = N->getOperand(0); 4049 if (Srl.getOpcode() == ISD::ZERO_EXTEND) 4050 Srl = Srl.getOperand(0); 4051 4052 // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero. 4053 if (Srl.getOpcode() == ISD::SRL) { 4054 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x 4055 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x 4056 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x 4057 4058 if (const ConstantSDNode *C = 4059 dyn_cast<ConstantSDNode>(Srl.getOperand(1))) { 4060 Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)), 4061 EVT(MVT::i32)); 4062 4063 unsigned SrcOffset = C->getZExtValue() + 8 * Offset; 4064 if (SrcOffset < 32 && SrcOffset % 8 == 0) { 4065 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL, 4066 MVT::f32, Srl); 4067 } 4068 } 4069 } 4070 4071 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); 4072 4073 APInt KnownZero, KnownOne; 4074 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 4075 !DCI.isBeforeLegalizeOps()); 4076 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 4077 if (TLO.ShrinkDemandedConstant(Src, Demanded) || 4078 TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { 4079 DCI.CommitTargetLoweringOpt(TLO); 4080 } 4081 4082 return SDValue(); 4083 } 4084 4085 SDValue SITargetLowering::PerformDAGCombine(SDNode *N, 4086 DAGCombinerInfo &DCI) const { 4087 switch (N->getOpcode()) { 4088 default: 4089 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 4090 case ISD::FADD: 4091 return performFAddCombine(N, DCI); 4092 case ISD::FSUB: 4093 return performFSubCombine(N, DCI); 4094 case ISD::SETCC: 4095 return performSetCCCombine(N, DCI); 4096 case ISD::FMAXNUM: 4097 case ISD::FMINNUM: 4098 case ISD::SMAX: 4099 case ISD::SMIN: 4100 case ISD::UMAX: 4101 case ISD::UMIN: 4102 case AMDGPUISD::FMIN_LEGACY: 4103 case AMDGPUISD::FMAX_LEGACY: { 4104 if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && 4105 N->getValueType(0) != MVT::f64 && 4106 getTargetMachine().getOptLevel() > CodeGenOpt::None) 4107 return performMinMaxCombine(N, DCI); 4108 break; 4109 } 4110 case ISD::LOAD: 4111 case ISD::STORE: 4112 case ISD::ATOMIC_LOAD: 4113 case ISD::ATOMIC_STORE: 4114 case ISD::ATOMIC_CMP_SWAP: 4115 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: 4116 case ISD::ATOMIC_SWAP: 4117 case ISD::ATOMIC_LOAD_ADD: 4118 case ISD::ATOMIC_LOAD_SUB: 4119 case ISD::ATOMIC_LOAD_AND: 4120 case ISD::ATOMIC_LOAD_OR: 4121 case ISD::ATOMIC_LOAD_XOR: 4122 case ISD::ATOMIC_LOAD_NAND: 4123 case ISD::ATOMIC_LOAD_MIN: 4124 case ISD::ATOMIC_LOAD_MAX: 4125 case ISD::ATOMIC_LOAD_UMIN: 4126 case ISD::ATOMIC_LOAD_UMAX: 4127 case AMDGPUISD::ATOMIC_INC: 4128 case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics. 
4129 if (DCI.isBeforeLegalize()) 4130 break; 4131 return performMemSDNodeCombine(cast<MemSDNode>(N), DCI); 4132 } 4133 case ISD::AND: 4134 return performAndCombine(N, DCI); 4135 case ISD::OR: 4136 return performOrCombine(N, DCI); 4137 case ISD::XOR: 4138 return performXorCombine(N, DCI); 4139 case AMDGPUISD::FP_CLASS: 4140 return performClassCombine(N, DCI); 4141 case ISD::FCANONICALIZE: 4142 return performFCanonicalizeCombine(N, DCI); 4143 case AMDGPUISD::FRACT: 4144 case AMDGPUISD::RCP: 4145 case AMDGPUISD::RSQ: 4146 case AMDGPUISD::RCP_LEGACY: 4147 case AMDGPUISD::RSQ_LEGACY: 4148 case AMDGPUISD::RSQ_CLAMP: 4149 case AMDGPUISD::LDEXP: { 4150 SDValue Src = N->getOperand(0); 4151 if (Src.isUndef()) 4152 return Src; 4153 break; 4154 } 4155 case ISD::SINT_TO_FP: 4156 case ISD::UINT_TO_FP: 4157 return performUCharToFloatCombine(N, DCI); 4158 case AMDGPUISD::CVT_F32_UBYTE0: 4159 case AMDGPUISD::CVT_F32_UBYTE1: 4160 case AMDGPUISD::CVT_F32_UBYTE2: 4161 case AMDGPUISD::CVT_F32_UBYTE3: 4162 return performCvtF32UByteNCombine(N, DCI); 4163 } 4164 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 4165 } 4166 4167 /// \brief Helper function for adjustWritemask 4168 static unsigned SubIdx2Lane(unsigned Idx) { 4169 switch (Idx) { 4170 default: return 0; 4171 case AMDGPU::sub0: return 0; 4172 case AMDGPU::sub1: return 1; 4173 case AMDGPU::sub2: return 2; 4174 case AMDGPU::sub3: return 3; 4175 } 4176 } 4177 4178 /// \brief Adjust the writemask of MIMG instructions 4179 void SITargetLowering::adjustWritemask(MachineSDNode *&Node, 4180 SelectionDAG &DAG) const { 4181 SDNode *Users[4] = { }; 4182 unsigned Lane = 0; 4183 unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3; 4184 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); 4185 unsigned NewDmask = 0; 4186 4187 // Try to figure out the used register components 4188 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); 4189 I != E; ++I) { 4190 4191 // Abort if we can't understand the usage 4192 if (!I->isMachineOpcode() || 4193 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) 4194 return; 4195 4196 // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. 4197 // Note that subregs are packed, i.e. Lane==0 is the first bit set 4198 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit 4199 // set, etc. 4200 Lane = SubIdx2Lane(I->getConstantOperandVal(1)); 4201 4202 // Set which texture component corresponds to the lane. 4203 unsigned Comp; 4204 for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { 4205 assert(Dmask); 4206 Comp = countTrailingZeros(Dmask); 4207 Dmask &= ~(1 << Comp); 4208 } 4209 4210 // Abort if we have more than one user per component 4211 if (Users[Lane]) 4212 return; 4213 4214 Users[Lane] = *I; 4215 NewDmask |= 1 << Comp; 4216 } 4217 4218 // Abort if there's no change 4219 if (NewDmask == OldDmask) 4220 return; 4221 4222 // Adjust the writemask in the node 4223 std::vector<SDValue> Ops; 4224 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx); 4225 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); 4226 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end()); 4227 Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); 4228 4229 // If we only got one lane, replace it with a copy 4230 // (if NewDmask has only one bit set...) 
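  // Editor's note (clarification, not part of the original source): the test
  // below is the usual power-of-two check: clearing the lowest set bit with
  // NewDmask & (NewDmask - 1) yields zero exactly when a single dmask bit,
  // i.e. a single enabled lane, remains.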
4231 if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { 4232 SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), 4233 MVT::i32); 4234 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 4235 SDLoc(), Users[Lane]->getValueType(0), 4236 SDValue(Node, 0), RC); 4237 DAG.ReplaceAllUsesWith(Users[Lane], Copy); 4238 return; 4239 } 4240 4241 // Update the users of the node with the new indices 4242 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { 4243 4244 SDNode *User = Users[i]; 4245 if (!User) 4246 continue; 4247 4248 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); 4249 DAG.UpdateNodeOperands(User, User->getOperand(0), Op); 4250 4251 switch (Idx) { 4252 default: break; 4253 case AMDGPU::sub0: Idx = AMDGPU::sub1; break; 4254 case AMDGPU::sub1: Idx = AMDGPU::sub2; break; 4255 case AMDGPU::sub2: Idx = AMDGPU::sub3; break; 4256 } 4257 } 4258 } 4259 4260 static bool isFrameIndexOp(SDValue Op) { 4261 if (Op.getOpcode() == ISD::AssertZext) 4262 Op = Op.getOperand(0); 4263 4264 return isa<FrameIndexSDNode>(Op); 4265 } 4266 4267 /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) 4268 /// with frame index operands. 4269 /// LLVM assumes that inputs are to these instructions are registers. 4270 void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, 4271 SelectionDAG &DAG) const { 4272 4273 SmallVector<SDValue, 8> Ops; 4274 for (unsigned i = 0; i < Node->getNumOperands(); ++i) { 4275 if (!isFrameIndexOp(Node->getOperand(i))) { 4276 Ops.push_back(Node->getOperand(i)); 4277 continue; 4278 } 4279 4280 SDLoc DL(Node); 4281 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, 4282 Node->getOperand(i).getValueType(), 4283 Node->getOperand(i)), 0)); 4284 } 4285 4286 DAG.UpdateNodeOperands(Node, Ops); 4287 } 4288 4289 /// \brief Fold the instructions after selecting them. 4290 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, 4291 SelectionDAG &DAG) const { 4292 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4293 unsigned Opcode = Node->getMachineOpcode(); 4294 4295 if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && 4296 !TII->isGather4(Opcode)) 4297 adjustWritemask(Node, DAG); 4298 4299 if (Opcode == AMDGPU::INSERT_SUBREG || 4300 Opcode == AMDGPU::REG_SEQUENCE) { 4301 legalizeTargetIndependentNode(Node, DAG); 4302 return Node; 4303 } 4304 return Node; 4305 } 4306 4307 /// \brief Assign the register class depending on the number of 4308 /// bits set in the writemask 4309 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 4310 SDNode *Node) const { 4311 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4312 4313 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 4314 4315 if (TII->isVOP3(MI.getOpcode())) { 4316 // Make sure constant bus requirements are respected. 4317 TII->legalizeOperandsVOP3(MRI, MI); 4318 return; 4319 } 4320 4321 if (TII->isMIMG(MI)) { 4322 unsigned VReg = MI.getOperand(0).getReg(); 4323 const TargetRegisterClass *RC = MRI.getRegClass(VReg); 4324 // TODO: Need mapping tables to handle other cases (register classes). 4325 if (RC != &AMDGPU::VReg_128RegClass) 4326 return; 4327 4328 unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4; 4329 unsigned Writemask = MI.getOperand(DmaskIdx).getImm(); 4330 unsigned BitsSet = 0; 4331 for (unsigned i = 0; i < 4; ++i) 4332 BitsSet += Writemask & (1 << i) ? 
1 : 0; 4333 switch (BitsSet) { 4334 default: return; 4335 case 1: RC = &AMDGPU::VGPR_32RegClass; break; 4336 case 2: RC = &AMDGPU::VReg_64RegClass; break; 4337 case 3: RC = &AMDGPU::VReg_96RegClass; break; 4338 } 4339 4340 unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet); 4341 MI.setDesc(TII->get(NewOpcode)); 4342 MRI.setRegClass(VReg, RC); 4343 return; 4344 } 4345 4346 // Replace unused atomics with the no return version. 4347 int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode()); 4348 if (NoRetAtomicOp != -1) { 4349 if (!Node->hasAnyUseOfValue(0)) { 4350 MI.setDesc(TII->get(NoRetAtomicOp)); 4351 MI.RemoveOperand(0); 4352 return; 4353 } 4354 4355 // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg 4356 // instruction, because the return type of these instructions is a vec2 of 4357 // the memory type, so it can be tied to the input operand. 4358 // This means these instructions always have a use, so we need to add a 4359 // special case to check if the atomic has only one extract_subreg use, 4360 // which itself has no uses. 4361 if ((Node->hasNUsesOfValue(1, 0) && 4362 Node->use_begin()->isMachineOpcode() && 4363 Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG && 4364 !Node->use_begin()->hasAnyUseOfValue(0))) { 4365 unsigned Def = MI.getOperand(0).getReg(); 4366 4367 // Change this into a noret atomic. 4368 MI.setDesc(TII->get(NoRetAtomicOp)); 4369 MI.RemoveOperand(0); 4370 4371 // If we only remove the def operand from the atomic instruction, the 4372 // extract_subreg will be left with a use of a vreg without a def. 4373 // So we need to insert an implicit_def to avoid machine verifier 4374 // errors. 4375 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), 4376 TII->get(AMDGPU::IMPLICIT_DEF), Def); 4377 } 4378 return; 4379 } 4380 } 4381 4382 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, 4383 uint64_t Val) { 4384 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); 4385 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); 4386 } 4387 4388 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, 4389 const SDLoc &DL, 4390 SDValue Ptr) const { 4391 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4392 4393 // Build the half of the subregister with the constants before building the 4394 // full 128-bit register. If we are building multiple resource descriptors, 4395 // this will allow CSEing of the 2-component register. 4396 const SDValue Ops0[] = { 4397 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), 4398 buildSMovImm32(DAG, DL, 0), 4399 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 4400 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), 4401 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) 4402 }; 4403 4404 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, 4405 MVT::v2i32, Ops0), 0); 4406 4407 // Combine the constants and the pointer. 
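  // Editor's note (clarification, not part of the original source): the
  // REG_SEQUENCE below assembles the 128-bit buffer resource descriptor:
  // sub0_sub1 carries the 64-bit base pointer and sub2_sub3 carries the
  // constant words built above from getDefaultRsrcDataFormat().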
4408 const SDValue Ops1[] = { 4409 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), 4410 Ptr, 4411 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), 4412 SubRegHi, 4413 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) 4414 }; 4415 4416 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); 4417 } 4418 4419 /// \brief Return a resource descriptor with the 'Add TID' bit enabled 4420 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] 4421 /// of the resource descriptor) to create an offset, which is added to 4422 /// the resource pointer. 4423 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, 4424 SDValue Ptr, uint32_t RsrcDword1, 4425 uint64_t RsrcDword2And3) const { 4426 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); 4427 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); 4428 if (RsrcDword1) { 4429 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, 4430 DAG.getConstant(RsrcDword1, DL, MVT::i32)), 4431 0); 4432 } 4433 4434 SDValue DataLo = buildSMovImm32(DAG, DL, 4435 RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); 4436 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); 4437 4438 const SDValue Ops[] = { 4439 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), 4440 PtrLo, 4441 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 4442 PtrHi, 4443 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), 4444 DataLo, 4445 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), 4446 DataHi, 4447 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) 4448 }; 4449 4450 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); 4451 } 4452 4453 SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 4454 const TargetRegisterClass *RC, 4455 unsigned Reg, EVT VT) const { 4456 SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); 4457 4458 return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), 4459 cast<RegisterSDNode>(VReg)->getReg(), VT); 4460 } 4461 4462 //===----------------------------------------------------------------------===// 4463 // SI Inline Assembly Support 4464 //===----------------------------------------------------------------------===// 4465 4466 std::pair<unsigned, const TargetRegisterClass *> 4467 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 4468 StringRef Constraint, 4469 MVT VT) const { 4470 if (!isTypeLegal(VT)) 4471 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 4472 4473 if (Constraint.size() == 1) { 4474 switch (Constraint[0]) { 4475 case 's': 4476 case 'r': 4477 switch (VT.getSizeInBits()) { 4478 default: 4479 return std::make_pair(0U, nullptr); 4480 case 32: 4481 case 16: 4482 return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass); 4483 case 64: 4484 return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); 4485 case 128: 4486 return std::make_pair(0U, &AMDGPU::SReg_128RegClass); 4487 case 256: 4488 return std::make_pair(0U, &AMDGPU::SReg_256RegClass); 4489 } 4490 4491 case 'v': 4492 switch (VT.getSizeInBits()) { 4493 default: 4494 return std::make_pair(0U, nullptr); 4495 case 32: 4496 case 16: 4497 return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); 4498 case 64: 4499 return std::make_pair(0U, &AMDGPU::VReg_64RegClass); 4500 case 96: 4501 return std::make_pair(0U, &AMDGPU::VReg_96RegClass); 4502 case 128: 4503 return std::make_pair(0U, &AMDGPU::VReg_128RegClass); 4504 case 256: 4505 
      return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
    case 512:
      return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
    }
    }
  }

  if (Constraint.size() > 1) {
    const TargetRegisterClass *RC = nullptr;
    if (Constraint[1] == 'v') {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (Constraint[1] == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    }

    if (RC) {
      uint32_t Idx;
      bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
      if (!Failed && Idx < RC->getNumRegs())
        return std::make_pair(RC->getRegister(Idx), RC);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}

SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 's':
    case 'v':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
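
// Editor's note (illustrative usage, not part of the original source): the
// 'v' and 's' constraints handled above map inline asm operands to VGPR and
// SGPR register classes of a width matching the operand type, e.g.
//
//   int r;
//   __asm volatile("v_mov_b32 %0, %1" : "=v"(r) : "s"(x));
//
// asks for a 32-bit VGPR result and a 32-bit SGPR input.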