1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Custom DAG lowering for SI 12 // 13 //===----------------------------------------------------------------------===// 14 15 #ifdef _MSC_VER 16 // Provide M_PI. 17 #define _USE_MATH_DEFINES 18 #include <cmath> 19 #endif 20 21 #include "AMDGPU.h" 22 #include "AMDGPUIntrinsicInfo.h" 23 #include "AMDGPUSubtarget.h" 24 #include "SIDefines.h" 25 #include "SIISelLowering.h" 26 #include "SIInstrInfo.h" 27 #include "SIMachineFunctionInfo.h" 28 #include "SIRegisterInfo.h" 29 #include "llvm/ADT/BitVector.h" 30 #include "llvm/ADT/StringSwitch.h" 31 #include "llvm/CodeGen/CallingConvLower.h" 32 #include "llvm/CodeGen/MachineInstrBuilder.h" 33 #include "llvm/CodeGen/MachineRegisterInfo.h" 34 #include "llvm/CodeGen/SelectionDAG.h" 35 #include "llvm/CodeGen/Analysis.h" 36 #include "llvm/IR/DiagnosticInfo.h" 37 #include "llvm/IR/Function.h" 38 39 using namespace llvm; 40 41 static cl::opt<bool> EnableVGPRIndexMode( 42 "amdgpu-vgpr-index-mode", 43 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), 44 cl::init(false)); 45 46 47 static unsigned findFirstFreeSGPR(CCState &CCInfo) { 48 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); 49 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { 50 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) { 51 return AMDGPU::SGPR0 + Reg; 52 } 53 } 54 llvm_unreachable("Cannot allocate sgpr"); 55 } 56 57 SITargetLowering::SITargetLowering(const TargetMachine &TM, 58 const SISubtarget &STI) 59 : AMDGPUTargetLowering(TM, STI) { 60 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); 61 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); 62 63 addRegisterClass(MVT::i32, 
&AMDGPU::SReg_32_XM0RegClass); 64 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); 65 66 addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); 67 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); 68 addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); 69 70 addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass); 71 addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass); 72 73 addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); 74 addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); 75 76 addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); 77 addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); 78 79 addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); 80 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); 81 82 if (Subtarget->has16BitInsts()) { 83 addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass); 84 addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass); 85 } 86 87 computeRegisterProperties(STI.getRegisterInfo()); 88 89 // We need to custom lower vector stores from local memory 90 setOperationAction(ISD::LOAD, MVT::v2i32, Custom); 91 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 92 setOperationAction(ISD::LOAD, MVT::v8i32, Custom); 93 setOperationAction(ISD::LOAD, MVT::v16i32, Custom); 94 setOperationAction(ISD::LOAD, MVT::i1, Custom); 95 96 setOperationAction(ISD::STORE, MVT::v2i32, Custom); 97 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 98 setOperationAction(ISD::STORE, MVT::v8i32, Custom); 99 setOperationAction(ISD::STORE, MVT::v16i32, Custom); 100 setOperationAction(ISD::STORE, MVT::i1, Custom); 101 102 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 103 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 104 setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); 105 106 setOperationAction(ISD::SELECT, MVT::i1, Promote); 107 setOperationAction(ISD::SELECT, MVT::i64, Custom); 108 setOperationAction(ISD::SELECT, MVT::f64, Promote); 109 AddPromotedToType(ISD::SELECT, MVT::f64, 
MVT::i64); 110 111 setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); 112 setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); 113 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); 114 setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); 115 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); 116 117 setOperationAction(ISD::SETCC, MVT::i1, Promote); 118 setOperationAction(ISD::SETCC, MVT::v2i1, Expand); 119 setOperationAction(ISD::SETCC, MVT::v4i1, Expand); 120 121 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); 122 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); 123 124 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); 125 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); 126 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); 127 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); 128 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); 129 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); 130 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); 131 132 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); 133 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); 134 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); 135 136 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 137 setOperationAction(ISD::BR_CC, MVT::i1, Expand); 138 setOperationAction(ISD::BR_CC, MVT::i32, Expand); 139 setOperationAction(ISD::BR_CC, MVT::i64, Expand); 140 setOperationAction(ISD::BR_CC, MVT::f32, Expand); 141 setOperationAction(ISD::BR_CC, MVT::f64, Expand); 142 143 // We only support LOAD/STORE and vector manipulation ops for vectors 144 // with > 4 elements. 
145 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { 146 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { 147 switch (Op) { 148 case ISD::LOAD: 149 case ISD::STORE: 150 case ISD::BUILD_VECTOR: 151 case ISD::BITCAST: 152 case ISD::EXTRACT_VECTOR_ELT: 153 case ISD::INSERT_VECTOR_ELT: 154 case ISD::INSERT_SUBVECTOR: 155 case ISD::EXTRACT_SUBVECTOR: 156 case ISD::SCALAR_TO_VECTOR: 157 break; 158 case ISD::CONCAT_VECTORS: 159 setOperationAction(Op, VT, Custom); 160 break; 161 default: 162 setOperationAction(Op, VT, Expand); 163 break; 164 } 165 } 166 } 167 168 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that 169 // is expanded to avoid having two separate loops in case the index is a VGPR. 170 171 // Most operations are naturally 32-bit vector operations. We only support 172 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. 173 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { 174 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); 175 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); 176 177 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); 178 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32); 179 180 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); 181 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32); 182 183 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); 184 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); 185 } 186 187 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); 188 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); 189 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); 190 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); 191 192 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, 193 // and output demarshalling 194 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); 195 
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 196 197 // We can't return success/failure, only the old value, 198 // let LLVM add the comparison 199 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand); 200 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand); 201 202 if (getSubtarget()->hasFlatAddressSpace()) { 203 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); 204 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); 205 } 206 207 setOperationAction(ISD::BSWAP, MVT::i32, Legal); 208 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 209 210 // On SI this is s_memtime and s_memrealtime on VI. 211 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); 212 setOperationAction(ISD::TRAP, MVT::Other, Custom); 213 214 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 215 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 216 217 if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) { 218 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 219 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 220 setOperationAction(ISD::FRINT, MVT::f64, Legal); 221 } 222 223 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 224 225 setOperationAction(ISD::FSIN, MVT::f32, Custom); 226 setOperationAction(ISD::FCOS, MVT::f32, Custom); 227 setOperationAction(ISD::FDIV, MVT::f32, Custom); 228 setOperationAction(ISD::FDIV, MVT::f64, Custom); 229 230 if (Subtarget->has16BitInsts()) { 231 setOperationAction(ISD::Constant, MVT::i16, Legal); 232 233 setOperationAction(ISD::SMIN, MVT::i16, Legal); 234 setOperationAction(ISD::SMAX, MVT::i16, Legal); 235 236 setOperationAction(ISD::UMIN, MVT::i16, Legal); 237 setOperationAction(ISD::UMAX, MVT::i16, Legal); 238 239 setOperationAction(ISD::SETCC, MVT::i16, Promote); 240 AddPromotedToType(ISD::SETCC, MVT::i16, MVT::i32); 241 242 setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote); 243 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32); 244 245 
setOperationAction(ISD::ROTR, MVT::i16, Promote); 246 setOperationAction(ISD::ROTL, MVT::i16, Promote); 247 248 setOperationAction(ISD::SDIV, MVT::i16, Promote); 249 setOperationAction(ISD::UDIV, MVT::i16, Promote); 250 setOperationAction(ISD::SREM, MVT::i16, Promote); 251 setOperationAction(ISD::UREM, MVT::i16, Promote); 252 253 setOperationAction(ISD::BSWAP, MVT::i16, Promote); 254 setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); 255 256 setOperationAction(ISD::CTTZ, MVT::i16, Promote); 257 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote); 258 setOperationAction(ISD::CTLZ, MVT::i16, Promote); 259 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote); 260 261 setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); 262 263 setOperationAction(ISD::BR_CC, MVT::i16, Expand); 264 265 setOperationAction(ISD::LOAD, MVT::i16, Custom); 266 267 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 268 269 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote); 270 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32); 271 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); 272 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); 273 274 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); 275 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); 276 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); 277 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); 278 279 // F16 - Constant Actions. 280 setOperationAction(ISD::ConstantFP, MVT::f16, Legal); 281 282 // F16 - Load/Store Actions. 283 setOperationAction(ISD::LOAD, MVT::f16, Promote); 284 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16); 285 setOperationAction(ISD::STORE, MVT::f16, Promote); 286 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16); 287 288 // F16 - VOP1 Actions. 
289 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); 290 setOperationAction(ISD::FCOS, MVT::f16, Promote); 291 setOperationAction(ISD::FSIN, MVT::f16, Promote); 292 setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote); 293 setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote); 294 setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote); 295 setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote); 296 297 // F16 - VOP2 Actions. 298 setOperationAction(ISD::BR_CC, MVT::f16, Expand); 299 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); 300 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); 301 setOperationAction(ISD::FMINNUM, MVT::f16, Legal); 302 setOperationAction(ISD::FDIV, MVT::f16, Promote); 303 304 // F16 - VOP3 Actions. 305 setOperationAction(ISD::FMA, MVT::f16, Legal); 306 if (!Subtarget->hasFP16Denormals()) 307 setOperationAction(ISD::FMAD, MVT::f16, Legal); 308 } 309 310 setTargetDAGCombine(ISD::FADD); 311 setTargetDAGCombine(ISD::FSUB); 312 setTargetDAGCombine(ISD::FMINNUM); 313 setTargetDAGCombine(ISD::FMAXNUM); 314 setTargetDAGCombine(ISD::SMIN); 315 setTargetDAGCombine(ISD::SMAX); 316 setTargetDAGCombine(ISD::UMIN); 317 setTargetDAGCombine(ISD::UMAX); 318 setTargetDAGCombine(ISD::SETCC); 319 setTargetDAGCombine(ISD::AND); 320 setTargetDAGCombine(ISD::OR); 321 setTargetDAGCombine(ISD::XOR); 322 setTargetDAGCombine(ISD::SINT_TO_FP); 323 setTargetDAGCombine(ISD::UINT_TO_FP); 324 setTargetDAGCombine(ISD::FCANONICALIZE); 325 326 // All memory operations. Some folding on the pointer operand is done to help 327 // matching the constant offsets in the addressing modes. 
328 setTargetDAGCombine(ISD::LOAD); 329 setTargetDAGCombine(ISD::STORE); 330 setTargetDAGCombine(ISD::ATOMIC_LOAD); 331 setTargetDAGCombine(ISD::ATOMIC_STORE); 332 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); 333 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); 334 setTargetDAGCombine(ISD::ATOMIC_SWAP); 335 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); 336 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); 337 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); 338 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); 339 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); 340 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); 341 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); 342 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); 343 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); 344 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); 345 346 setSchedulingPreference(Sched::RegPressure); 347 } 348 349 const SISubtarget *SITargetLowering::getSubtarget() const { 350 return static_cast<const SISubtarget *>(Subtarget); 351 } 352 353 //===----------------------------------------------------------------------===// 354 // TargetLowering queries 355 //===----------------------------------------------------------------------===// 356 357 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 358 const CallInst &CI, 359 unsigned IntrID) const { 360 switch (IntrID) { 361 case Intrinsic::amdgcn_atomic_inc: 362 case Intrinsic::amdgcn_atomic_dec: 363 Info.opc = ISD::INTRINSIC_W_CHAIN; 364 Info.memVT = MVT::getVT(CI.getType()); 365 Info.ptrVal = CI.getOperand(0); 366 Info.align = 0; 367 Info.vol = false; 368 Info.readMem = true; 369 Info.writeMem = true; 370 return true; 371 default: 372 return false; 373 } 374 } 375 376 bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, 377 EVT) const { 378 // SI has some legal vector types, but no legal vector operations. Say no 379 // shuffles are legal in order to prefer scalarizing some vector operations. 
380 return false; 381 } 382 383 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { 384 // Flat instructions do not have offsets, and only have the register 385 // address. 386 return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); 387 } 388 389 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { 390 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and 391 // additionally can do r + r + i with addr64. 32-bit has more addressing 392 // mode options. Depending on the resource constant, it can also do 393 // (i64 r0) + (i32 r1) * (i14 i). 394 // 395 // Private arrays end up using a scratch buffer most of the time, so also 396 // assume those use MUBUF instructions. Scratch loads / stores are currently 397 // implemented as mubuf instructions with offen bit set, so slightly 398 // different than the normal addr64. 399 if (!isUInt<12>(AM.BaseOffs)) 400 return false; 401 402 // FIXME: Since we can split immediate into soffset and immediate offset, 403 // would it make sense to allow any immediate? 404 405 switch (AM.Scale) { 406 case 0: // r + i or just i, depending on HasBaseReg. 407 return true; 408 case 1: 409 return true; // We have r + r or r + i. 410 case 2: 411 if (AM.HasBaseReg) { 412 // Reject 2 * r + r. 413 return false; 414 } 415 416 // Allow 2 * r as r + r 417 // Or 2 * r + i is allowed as r + r + i. 418 return true; 419 default: // Don't allow n * r 420 return false; 421 } 422 } 423 424 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, 425 const AddrMode &AM, Type *Ty, 426 unsigned AS) const { 427 // No global is ever allowed as a base. 428 if (AM.BaseGV) 429 return false; 430 431 switch (AS) { 432 case AMDGPUAS::GLOBAL_ADDRESS: { 433 if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 434 // Assume the we will use FLAT for all global memory accesses 435 // on VI. 436 // FIXME: This assumption is currently wrong. 
On VI we still use 437 // MUBUF instructions for the r + i addressing mode. As currently 438 // implemented, the MUBUF instructions only work on buffer < 4GB. 439 // It may be possible to support > 4GB buffers with MUBUF instructions, 440 // by setting the stride value in the resource descriptor which would 441 // increase the size limit to (stride * 4GB). However, this is risky, 442 // because it has never been validated. 443 return isLegalFlatAddressingMode(AM); 444 } 445 446 return isLegalMUBUFAddressingMode(AM); 447 } 448 case AMDGPUAS::CONSTANT_ADDRESS: { 449 // If the offset isn't a multiple of 4, it probably isn't going to be 450 // correctly aligned. 451 // FIXME: Can we get the real alignment here? 452 if (AM.BaseOffs % 4 != 0) 453 return isLegalMUBUFAddressingMode(AM); 454 455 // There are no SMRD extloads, so if we have to do a small type access we 456 // will use a MUBUF load. 457 // FIXME?: We also need to do this if unaligned, but we don't know the 458 // alignment here. 459 if (DL.getTypeStoreSize(Ty) < 4) 460 return isLegalMUBUFAddressingMode(AM); 461 462 if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { 463 // SMRD instructions have an 8-bit, dword offset on SI. 464 if (!isUInt<8>(AM.BaseOffs / 4)) 465 return false; 466 } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) { 467 // On CI+, this can also be a 32-bit literal constant offset. If it fits 468 // in 8-bits, it can use a smaller encoding. 469 if (!isUInt<32>(AM.BaseOffs / 4)) 470 return false; 471 } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) { 472 // On VI, these use the SMEM format and the offset is 20-bit in bytes. 473 if (!isUInt<20>(AM.BaseOffs)) 474 return false; 475 } else 476 llvm_unreachable("unhandled generation"); 477 478 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 
479 return true; 480 481 if (AM.Scale == 1 && AM.HasBaseReg) 482 return true; 483 484 return false; 485 } 486 487 case AMDGPUAS::PRIVATE_ADDRESS: 488 return isLegalMUBUFAddressingMode(AM); 489 490 case AMDGPUAS::LOCAL_ADDRESS: 491 case AMDGPUAS::REGION_ADDRESS: { 492 // Basic, single offset DS instructions allow a 16-bit unsigned immediate 493 // field. 494 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have 495 // an 8-bit dword offset but we don't know the alignment here. 496 if (!isUInt<16>(AM.BaseOffs)) 497 return false; 498 499 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 500 return true; 501 502 if (AM.Scale == 1 && AM.HasBaseReg) 503 return true; 504 505 return false; 506 } 507 case AMDGPUAS::FLAT_ADDRESS: 508 case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: 509 // For an unknown address space, this usually means that this is for some 510 // reason being used for pure arithmetic, and not based on some addressing 511 // computation. We don't have instructions that compute pointers with any 512 // addressing modes, so treat them as having no offset like flat 513 // instructions. 514 return isLegalFlatAddressingMode(AM); 515 516 default: 517 llvm_unreachable("unhandled address space"); 518 } 519 } 520 521 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 522 unsigned AddrSpace, 523 unsigned Align, 524 bool *IsFast) const { 525 if (IsFast) 526 *IsFast = false; 527 528 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, 529 // which isn't a simple VT. 530 // Until MVT is extended to handle this, simply check for the size and 531 // rely on the condition below: allow accesses if the size is a multiple of 4. 
532 if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 && 533 VT.getStoreSize() > 16)) { 534 return false; 535 } 536 537 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || 538 AddrSpace == AMDGPUAS::REGION_ADDRESS) { 539 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte 540 // aligned, 8 byte access in a single operation using ds_read2/write2_b32 541 // with adjacent offsets. 542 bool AlignedBy4 = (Align % 4 == 0); 543 if (IsFast) 544 *IsFast = AlignedBy4; 545 546 return AlignedBy4; 547 } 548 549 // FIXME: We have to be conservative here and assume that flat operations 550 // will access scratch. If we had access to the IR function, then we 551 // could determine if any private memory was used in the function. 552 if (!Subtarget->hasUnalignedScratchAccess() && 553 (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || 554 AddrSpace == AMDGPUAS::FLAT_ADDRESS)) { 555 return false; 556 } 557 558 if (Subtarget->hasUnalignedBufferAccess()) { 559 // If we have an uniform constant load, it still requires using a slow 560 // buffer instruction if unaligned. 561 if (IsFast) { 562 *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ? 563 (Align % 4 == 0) : true; 564 } 565 566 return true; 567 } 568 569 // Smaller than dword value must be aligned. 570 if (VT.bitsLT(MVT::i32)) 571 return false; 572 573 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the 574 // byte-address are ignored, thus forcing Dword alignment. 575 // This applies to private, global, and constant memory. 576 if (IsFast) 577 *IsFast = true; 578 579 return VT.bitsGT(MVT::i32) && Align % 4 == 0; 580 } 581 582 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, 583 unsigned SrcAlign, bool IsMemset, 584 bool ZeroMemset, 585 bool MemcpyStrSrc, 586 MachineFunction &MF) const { 587 // FIXME: Should account for address space here. 588 589 // The default fallback uses the private pointer size as a guess for a type to 590 // use. 
Make sure we switch these to 64-bit accesses. 591 592 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global 593 return MVT::v4i32; 594 595 if (Size >= 8 && DstAlign >= 4) 596 return MVT::v2i32; 597 598 // Use the default. 599 return MVT::Other; 600 } 601 602 static bool isFlatGlobalAddrSpace(unsigned AS) { 603 return AS == AMDGPUAS::GLOBAL_ADDRESS || 604 AS == AMDGPUAS::FLAT_ADDRESS || 605 AS == AMDGPUAS::CONSTANT_ADDRESS; 606 } 607 608 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, 609 unsigned DestAS) const { 610 return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); 611 } 612 613 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { 614 const MemSDNode *MemNode = cast<MemSDNode>(N); 615 const Value *Ptr = MemNode->getMemOperand()->getValue(); 616 const Instruction *I = dyn_cast<Instruction>(Ptr); 617 return I && I->getMetadata("amdgpu.noclobber"); 618 } 619 620 bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS, 621 unsigned DestAS) const { 622 // Flat -> private/local is a simple truncate. 623 // Flat -> global is no-op 624 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) 625 return true; 626 627 return isNoopAddrSpaceCast(SrcAS, DestAS); 628 } 629 630 bool SITargetLowering::isMemOpUniform(const SDNode *N) const { 631 const MemSDNode *MemNode = cast<MemSDNode>(N); 632 const Value *Ptr = MemNode->getMemOperand()->getValue(); 633 634 // UndefValue means this is a load of a kernel input. These are uniform. 635 // Sometimes LDS instructions have constant pointers. 636 // If Ptr is null, then that means this mem operand contains a 637 // PseudoSourceValue like GOT. 
// Pointers that are absent, undef, arguments, constants or globals are
// accepted without looking any further.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  // Any other pointer qualifies only if it is produced by an instruction that
  // carries "amdgpu.uniform" metadata.
  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

// Vectors whose element count is not 1 and whose scalar type is no wider than
// i16 are split by the type legalizer; every other type gets the
// TargetLoweringBase default.
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(EVT VT) const {
  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
    return TypeSplitVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

// Unconditionally reports that a constant load should be converted to an
// integer immediate; neither \p Imm nor \p Ty is inspected.
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {
  // FIXME: Could be smarter if called for vector constants.
  return true;
}

// Rejects i16 for everything but loads and stores and rejects i1 setcc; all
// other (Op, VT) pairs are decided by TargetLowering.
bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {

  // i16 is not desirable unless it is a load or a store.
  if (VT == MVT::i16 && Op != ISD::LOAD && Op != ISD::STORE)
    return false;

  // SimplifySetCC uses this function to determine whether or not it should
  // create setcc with i1 operands.  We don't have instructions for i1 setcc.
  if (VT == MVT::i1 && Op == ISD::SETCC)
    return false;

  return TargetLowering::isTypeDesirableForOp(Op, VT);
}

// Build the address of the kernel argument that lives \p Offset bytes past the
// kernarg segment base. The base is copied from the live-in virtual register
// of the preloaded KERNARG_SEGMENT_PTR value and the sum is formed in the
// pointer type of the constant address space.
SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
                                            const SDLoc &SL, SDValue Chain,
                                            unsigned Offset) const {
  const DataLayout &DL = DAG.getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);

  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
                                       MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
  return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                     DAG.getConstant(Offset, SL, PtrVT));
}

// Load a kernel argument of memory type \p MemVT found at \p Offset and
// convert it to \p VT. Floating point values go through getFPExtOrFPTrunc;
// everything else is sign- or zero-extended/truncated depending on \p Signed.
// Returns the merged pair {converted value, chain result of the load}.
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         const SDLoc &SL, SDValue Chain,
                                         unsigned Offset, bool Signed) const {
  const DataLayout &DL = DAG.getDataLayout();
  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  // The pointer info only records the type/address space; the pointer value
  // itself is an undef placeholder.
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));

  unsigned Align = DL.getABITypeAlignment(Ty);

  SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
                             MachineMemOperand::MONonTemporal |
                             MachineMemOperand::MODereferenceable |
                             MachineMemOperand::MOInvariant);

  SDValue Val;
  if (MemVT.isFloatingPoint())
    Val = getFPExtOrFPTrunc(DAG, Load, SL, VT);
  else if (Signed)
    Val = DAG.getSExtOrTrunc(Load, SL, VT);
  else
    Val = DAG.getZExtOrTrunc(Load, SL, VT);

  SDValue Ops[] = {
    Val,
    Load.getValue(1)
  };

  return DAG.getMergeValues(Ops, SL);
}

// Lower the incoming arguments of the current function.
//
// Outline of the steps below:
//   1. Diagnose shader calling conventions on HSA and bail out.
//   2. For pixel shaders, skip unused inputs and record the allocated/enabled
//      PS inputs; for all shaders, split vector arguments into elements.
//   3. Reserve the preloaded user SGPRs reported by SIMachineFunctionInfo.
//   4. Run the calling convention analysis and materialize each argument,
//      either as a kernarg load (memory location) or as a register copy,
//      pushing one value per entry of \p Ins onto \p InVals.
//   5. Reserve the system SGPRs, pick the scratch resource / wave offset
//      registers, and reserve the work-item ID VGPRs.
//
// Returns \p Chain when no argument was loaded from memory, otherwise a
// TokenFactor of the chains of all argument loads.
SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FType = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
    const Function *Fn = MF.getFunction();
    DiagnosticInfoUnsupported NoGraphicsHSA(
        *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    DAG.getContext()->diagnose(NoGraphicsHSA);
    return DAG.getEntryNode();
  }

  // Create stack objects that are used for emitting debugger prologue if
  // "amdgpu-debugger-emit-prologue" attribute was specified.
  if (ST.debuggerEmitPrologue())
    createDebuggerPrologueStackObjects(MF);

  // Splits receives the per-element argument list handed to the shader
  // calling convention analysis; Skipped marks the entries of Ins that get no
  // location at all.
  SmallVector<ISD::InputArg, 16> Splits;
  BitVector Skipped(Ins.size());

  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];

    // First check if it's a PS input addr
    if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
        !Arg.Flags.isByVal() && PSInputNum <= 15) {

      if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
        // We can safely skip PS inputs
        Skipped.set(i);
        ++PSInputNum;
        continue;
      }

      Info->markPSInputAllocated(PSInputNum);
      if (Arg.Used)
        Info->PSInputEna |= 1 << PSInputNum;

      ++PSInputNum;
    }

    if (AMDGPU::isShader(CallConv)) {
      // Second split vertices into their elements
      if (Arg.VT.isVector()) {
        ISD::InputArg NewArg = Arg;
        NewArg.Flags.setSplit();
        NewArg.VT = Arg.VT.getVectorElementType();

        // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
        // three or five element vertex only needs three or five registers,
        // NOT four or eight.
        Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
        unsigned NumElements = ParamType->getVectorNumElements();

        for (unsigned j = 0; j != NumElements; ++j) {
          Splits.push_back(NewArg);
          NewArg.PartOffset += NewArg.VT.getStoreSize();
        }
      } else {
        Splits.push_back(Arg);
      }
    }
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  // At least one interpolation mode must be enabled or else the GPU will hang.
  //
  // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
  // PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CallConv == CallingConv::AMDGPU_PS &&
      ((Info->getPSInputAddr() & 0x7F) == 0 ||
       ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
    // Force PS input 0 on and reserve VGPR0/VGPR1 for it.
    CCInfo.AllocateReg(AMDGPU::VGPR0);
    CCInfo.AllocateReg(AMDGPU::VGPR1);
    Info->markPSInputAllocated(0);
    Info->PSInputEna |= 1;
  }

  if (!AMDGPU::isShader(CallConv)) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
  } else {
    assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
           !Info->hasWorkItemIDZ());
  }

  // Each enabled user SGPR input below is registered with the function info,
  // added as a function live-in and marked allocated in CCInfo so that the
  // argument analysis does not hand it out again.
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info->hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info->hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info->hasQueuePtr()) {
    unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info->hasKernargSegmentPtr()) {
    unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
    MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info->hasDispatchID()) {
    unsigned DispatchIDReg = Info->addDispatchID(*TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info->hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // Non-shader functions analyze the original argument list; shaders analyze
  // the per-element list built above.
  if (!AMDGPU::isShader(CallConv))
    analyzeFormalArgumentsCompute(CCInfo, Ins);
  else
    AnalyzeFormalArguments(CCInfo, Splits);

  SmallVector<SDValue, 16> Chains;

  // ArgIdx walks ArgLocs independently of i: skipped inputs consume no
  // location and vector arguments consume one location per element.
  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    const ISD::InputArg &Arg = Ins[i];
    if (Skipped[i]) {
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    MVT VT = VA.getLocVT();

    if (VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = VA.getLocVT();
      const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
                              VA.getLocMemOffset();
      // The first 36 bytes of the input buffer contain information about
      // thread group and global sizes.
      SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
                                   Offset, Ins[i].Flags.isSExt());
      Chains.push_back(Arg.getValue(1));

      auto *ParamTy =
        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
      if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
        // On SI local pointers are just offsets into LDS, so they are always
        // less than 16-bits.  On CI and newer they could potentially be
        // real pointers, so we can't guarantee their size.
        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
                          DAG.getValueType(MVT::i16));
      }

      InVals.push_back(Arg);
      // Record the end of this argument as the current ABI argument offset.
      Info->setABIArgOffset(Offset + MemVT.getStoreSize());
      continue;
    }
    assert(VA.isRegLoc() && "Parameter must be in a register!");

    unsigned Reg = VA.getLocReg();

    if (VT == MVT::i64) {
      // For now assume it is a pointer
      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
                                     &AMDGPU::SGPR_64RegClass);
      Reg = MF.addLiveIn(Reg, &AMDGPU::SGPR_64RegClass);
      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Copy);
      continue;
    }

    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.VT.isVector()) {

      // Build a vector from the registers
      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
      unsigned NumElements = ParamType->getVectorNumElements();

      SmallVector<SDValue, 4> Regs;
      Regs.push_back(Val);
      // The first element was copied above; fetch the remaining ones from the
      // following argument locations.
      for (unsigned j = 1; j != NumElements; ++j) {
        Reg = ArgLocs[ArgIdx++].getLocReg();
        Reg = MF.addLiveIn(Reg, RC);

        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
        Regs.push_back(Copy);
      }

      // Fill up the missing vector elements
      NumElements = Arg.VT.getVectorNumElements() - NumElements;
      Regs.append(NumElements, DAG.getUNDEF(VT));

      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
      continue;
    }

    InVals.push_back(Val);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.

  // Start adding system SGPRs.
  if (Info->hasWorkGroupIDX()) {
    unsigned Reg = Info->addWorkGroupIDX();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupIDY()) {
    unsigned Reg = Info->addWorkGroupIDY();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupIDZ()) {
    unsigned Reg = Info->addWorkGroupIDZ();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkGroupInfo()) {
    unsigned Reg = Info->addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    // Shaders take the first SGPR that the argument analysis left free;
    // other functions get the register from the function info.
    if (AMDGPU::isShader(CallConv)) {
      PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
      Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
    } else
      PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  }

  // Now that we've figured out where the scratch register inputs are, see if
  // we should reserve the arguments and use them directly.
  bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
  // Record that we know we have non-spill stack objects so we don't need to
  // check all stack objects later.
  if (HasStackObjects)
    Info->setHasNonSpillStackObjects(true);

  // Everything live out of a block is spilled with fast regalloc, so it's
  // almost certain that spilling will be required.
  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
    HasStackObjects = true;

  if (ST.isAmdCodeObjectV2()) {
    if (HasStackObjects) {
      // If we have stack objects, we unquestionably need the private buffer
      // resource. For the Code Object V2 ABI, this will be the first 4 user
      // SGPR inputs. We can reserve those and use them directly.

      unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
      Info->setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
    } else {
      unsigned ReservedBufferReg
        = TRI->reservedPrivateSegmentBufferReg(MF);
      unsigned ReservedOffsetReg
        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);

      // We tentatively reserve the last registers (skipping the last two
      // which may contain VCC). After register allocation, we'll replace
      // these with the ones immediately after those which were really
      // allocated. In the prologue copies will be inserted from the argument
      // to these reserved registers.
      Info->setScratchRSrcReg(ReservedBufferReg);
      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  } else {
    unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);

    // Without HSA, relocations are used for the scratch pointer and the
    // buffer resource setup is always inserted in the prologue. Scratch wave
    // offset is still in an input SGPR.
    Info->setScratchRSrcReg(ReservedBufferReg);

    if (HasStackObjects) {
      unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
    } else {
      unsigned ReservedOffsetReg
        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  }

  // Reserve the VGPRs that hold the preloaded work-item IDs.
  if (Info->hasWorkItemIDX()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkItemIDY()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info->hasWorkItemIDZ()) {
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Chains.empty())
    return Chain;

  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}

// Lower a return. Non-shader calling conventions are forwarded to
// AMDGPUTargetLowering::LowerReturn. For shaders, vector outputs are split
// into their elements, every element is assigned a register by AnalyzeReturn
// and copied into it, and the resulting node is ENDPGM when there are no
// outputs and RETURN otherwise.
SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  if (!AMDGPU::isShader(CallConv))
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);

  Info->setIfReturnsVoid(Outs.size() == 0);

  // Splits and SplitVals are kept in lock-step: one output descriptor and one
  // value per scalar that is returned.
  SmallVector<ISD::OutputArg, 48> Splits;
  SmallVector<SDValue, 48> SplitVals;

  // Split vectors into their elements.
  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
    const ISD::OutputArg &Out = Outs[i];

    if (Out.VT.isVector()) {
      MVT VT = Out.VT.getVectorElementType();
      ISD::OutputArg NewOut = Out;
      NewOut.Flags.setSplit();
      NewOut.VT = VT;

      // We want the original number of vector elements here, e.g.
      // three or five, not four or eight.
      unsigned NumElements = Out.ArgVT.getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
                                   DAG.getConstant(j, DL, MVT::i32));
        SplitVals.push_back(Elem);
        Splits.push_back(NewOut);
        NewOut.PartOffset += NewOut.VT.getStoreSize();
      }
    } else {
      SplitVals.push_back(OutVals[i]);
      Splits.push_back(Out);
    }
  }

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  AnalyzeReturn(CCInfo, Splits);

  SDValue Flag;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

  // Copy the result values into the output registers.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = SplitVals[realRVLocIdx];

    // Copied from other backends.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    }

    // The copies are glued together through Flag so they stay adjacent.
    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}

// Resolve a register name to a physical register. Only m0, exec(_lo/_hi) and
// flat_scratch(_lo/_hi) are recognized. Reports a fatal error for an unknown
// name, for a flat_scratch register on SOUTHERN_ISLANDS, and when the size of
// \p VT does not match the register (32 bits for m0 and the _lo/_hi halves,
// 64 bits for exec and flat_scratch).
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                             SelectionDAG &DAG) const {
  unsigned Reg = StringSwitch<unsigned>(RegName)
    .Case("m0", AMDGPU::M0)
    .Case("exec", AMDGPU::EXEC)
    .Case("exec_lo", AMDGPU::EXEC_LO)
    .Case("exec_hi", AMDGPU::EXEC_HI)
    .Case("flat_scratch", AMDGPU::FLAT_SCR)
    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    .Default(AMDGPU::NoRegister);

  if (Reg == AMDGPU::NoRegister) {
    report_fatal_error(Twine("invalid register name \""
                             + StringRef(RegName)  + "\"."));

  }

  if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
    report_fatal_error(Twine("invalid register \""
                             + StringRef(RegName)  + "\" for subtarget."));
  }

  switch (Reg) {
  case AMDGPU::M0:
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
    if (VT.getSizeInBits() == 32)
      return Reg;
    break;
  case AMDGPU::EXEC:
  case AMDGPU::FLAT_SCR:
    if (VT.getSizeInBits() == 64)
      return Reg;
    break;
  default:
    llvm_unreachable("missing register type checking");
  }

  // Reached only when the register is known but VT has the wrong size.
  report_fatal_error(Twine("invalid type for register \""
                           + StringRef(RegName) + "\"."));
}

// If kill is not the last instruction, split the block so kill is always a
// proper terminator.
//
// Returns \p BB when the kill already ends the block, otherwise the newly
// created block that received everything after the kill.
MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
                                                    MachineBasicBlock *BB) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineBasicBlock::iterator SplitPoint(&MI);
  ++SplitPoint;

  if (SplitPoint == BB->end()) {
    // Don't bother with a new block.
    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
    return BB;
  }

  MachineFunction *MF = BB->getParent();
  MachineBasicBlock *SplitBB
    = MF->CreateMachineBasicBlock(BB->getBasicBlock());

  // Place the new block right after BB and move the tail of BB into it.
  MF->insert(++MachineFunction::iterator(BB), SplitBB);
  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());

  // The new block inherits BB's successors; BB now only falls into SplitBB.
  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(SplitBB);

  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
  return SplitBB;
}

// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
// wavefront. If the value is uniform and just happens to be in a VGPR, this
// will only do one iteration. In the worst case, this will loop 64 times.
//
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
1253 static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( 1254 const SIInstrInfo *TII, 1255 MachineRegisterInfo &MRI, 1256 MachineBasicBlock &OrigBB, 1257 MachineBasicBlock &LoopBB, 1258 const DebugLoc &DL, 1259 const MachineOperand &IdxReg, 1260 unsigned InitReg, 1261 unsigned ResultReg, 1262 unsigned PhiReg, 1263 unsigned InitSaveExecReg, 1264 int Offset, 1265 bool UseGPRIdxMode) { 1266 MachineBasicBlock::iterator I = LoopBB.begin(); 1267 1268 unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1269 unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1270 unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 1271 unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1272 1273 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) 1274 .addReg(InitReg) 1275 .addMBB(&OrigBB) 1276 .addReg(ResultReg) 1277 .addMBB(&LoopBB); 1278 1279 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec) 1280 .addReg(InitSaveExecReg) 1281 .addMBB(&OrigBB) 1282 .addReg(NewExec) 1283 .addMBB(&LoopBB); 1284 1285 // Read the next variant <- also loop target. 1286 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg) 1287 .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); 1288 1289 // Compare the just read M0 value to all possible Idx values. 
1290 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg) 1291 .addReg(CurrentIdxReg) 1292 .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg()); 1293 1294 if (UseGPRIdxMode) { 1295 unsigned IdxReg; 1296 if (Offset == 0) { 1297 IdxReg = CurrentIdxReg; 1298 } else { 1299 IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 1300 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg) 1301 .addReg(CurrentIdxReg, RegState::Kill) 1302 .addImm(Offset); 1303 } 1304 1305 MachineInstr *SetIdx = 1306 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX)) 1307 .addReg(IdxReg, RegState::Kill); 1308 SetIdx->getOperand(2).setIsUndef(); 1309 } else { 1310 // Move index from VCC into M0 1311 if (Offset == 0) { 1312 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1313 .addReg(CurrentIdxReg, RegState::Kill); 1314 } else { 1315 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) 1316 .addReg(CurrentIdxReg, RegState::Kill) 1317 .addImm(Offset); 1318 } 1319 } 1320 1321 // Update EXEC, save the original EXEC value to VCC. 1322 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec) 1323 .addReg(CondReg, RegState::Kill); 1324 1325 MRI.setSimpleHint(NewExec, CondReg); 1326 1327 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 1328 MachineInstr *InsertPt = 1329 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) 1330 .addReg(AMDGPU::EXEC) 1331 .addReg(NewExec); 1332 1333 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use 1334 // s_cbranch_scc0? 1335 1336 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. 1337 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) 1338 .addMBB(&LoopBB); 1339 1340 return InsertPt->getIterator(); 1341 } 1342 1343 // This has slightly sub-optimal regalloc when the source vector is killed by 1344 // the read. 
The register allocator does not understand that the kill is 1345 // per-workitem, so is kept alive for the whole loop so we end up not re-using a 1346 // subregister from it, using 1 more VGPR than necessary. This was saved when 1347 // this was expanded after register allocation. 1348 static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, 1349 MachineBasicBlock &MBB, 1350 MachineInstr &MI, 1351 unsigned InitResultReg, 1352 unsigned PhiReg, 1353 int Offset, 1354 bool UseGPRIdxMode) { 1355 MachineFunction *MF = MBB.getParent(); 1356 MachineRegisterInfo &MRI = MF->getRegInfo(); 1357 const DebugLoc &DL = MI.getDebugLoc(); 1358 MachineBasicBlock::iterator I(&MI); 1359 1360 unsigned DstReg = MI.getOperand(0).getReg(); 1361 unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1362 unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 1363 1364 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec); 1365 1366 // Save the EXEC mask 1367 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec) 1368 .addReg(AMDGPU::EXEC); 1369 1370 // To insert the loop we need to split the block. Move everything after this 1371 // point to a new block, and insert a new empty block between the two. 1372 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); 1373 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); 1374 MachineFunction::iterator MBBI(MBB); 1375 ++MBBI; 1376 1377 MF->insert(MBBI, LoopBB); 1378 MF->insert(MBBI, RemainderBB); 1379 1380 LoopBB->addSuccessor(LoopBB); 1381 LoopBB->addSuccessor(RemainderBB); 1382 1383 // Move the rest of the block into a new block. 
1384 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 1385 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); 1386 1387 MBB.addSuccessor(LoopBB); 1388 1389 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 1390 1391 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx, 1392 InitResultReg, DstReg, PhiReg, TmpExec, 1393 Offset, UseGPRIdxMode); 1394 1395 MachineBasicBlock::iterator First = RemainderBB->begin(); 1396 BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) 1397 .addReg(SaveExec); 1398 1399 return InsPt; 1400 } 1401 1402 // Returns subreg index, offset 1403 static std::pair<unsigned, int> 1404 computeIndirectRegAndOffset(const SIRegisterInfo &TRI, 1405 const TargetRegisterClass *SuperRC, 1406 unsigned VecReg, 1407 int Offset) { 1408 int NumElts = SuperRC->getSize() / 4; 1409 1410 // Skip out of bounds offsets, or else we would end up using an undefined 1411 // register. 1412 if (Offset >= NumElts || Offset < 0) 1413 return std::make_pair(AMDGPU::sub0, Offset); 1414 1415 return std::make_pair(AMDGPU::sub0 + Offset, 0); 1416 } 1417 1418 // Return true if the index is an SGPR and was set. 1419 static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, 1420 MachineRegisterInfo &MRI, 1421 MachineInstr &MI, 1422 int Offset, 1423 bool UseGPRIdxMode, 1424 bool IsIndirectSrc) { 1425 MachineBasicBlock *MBB = MI.getParent(); 1426 const DebugLoc &DL = MI.getDebugLoc(); 1427 MachineBasicBlock::iterator I(&MI); 1428 1429 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 1430 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); 1431 1432 assert(Idx->getReg() != AMDGPU::NoRegister); 1433 1434 if (!TII->getRegisterInfo().isSGPRClass(IdxRC)) 1435 return false; 1436 1437 if (UseGPRIdxMode) { 1438 unsigned IdxMode = IsIndirectSrc ? 
1439 VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; 1440 if (Offset == 0) { 1441 MachineInstr *SetOn = 1442 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) 1443 .addOperand(*Idx) 1444 .addImm(IdxMode); 1445 1446 SetOn->getOperand(3).setIsUndef(); 1447 } else { 1448 unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 1449 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) 1450 .addOperand(*Idx) 1451 .addImm(Offset); 1452 MachineInstr *SetOn = 1453 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) 1454 .addReg(Tmp, RegState::Kill) 1455 .addImm(IdxMode); 1456 1457 SetOn->getOperand(3).setIsUndef(); 1458 } 1459 1460 return true; 1461 } 1462 1463 if (Offset == 0) { 1464 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1465 .addOperand(*Idx); 1466 } else { 1467 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) 1468 .addOperand(*Idx) 1469 .addImm(Offset); 1470 } 1471 1472 return true; 1473 } 1474 1475 // Control flow needs to be inserted if indexing with a VGPR. 
// Expand an SI_INDIRECT_SRC pseudo: read the element of the `src` vector that
// is selected by the `idx` operand plus the constant `offset` into operand 0.
// Returns the block in which code following \p MI now lives: \p MBB itself
// for an SGPR index, or the block returned by loadM0FromVGPR for a VGPR index.
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const SISubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);

  // Fold an in-range constant offset into the sub-register index.
  unsigned SubReg;
  std::tie(SubReg, Offset)
    = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);

  bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;

  // SGPR index: no loop is required, the move is emitted in place.
  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      // TODO: Look at the uses to avoid the copy. This may require rescheduling
      // to avoid interfering with other uses, so probably requires a new
      // optimization pass.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
        .addReg(SrcReg, RegState::Undef, SubReg)
        .addReg(SrcReg, RegState::Implicit)
        .addReg(AMDGPU::M0, RegState::Implicit);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
        .addReg(SrcReg, RegState::Undef, SubReg)
        .addReg(SrcReg, RegState::Implicit);
    }

    MI.eraseFromParent();

    return &MBB;
  }


  // VGPR index: emit the loop and place the move inside it.
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);

  if (UseGPRIdxMode) {
    MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
      .addImm(0) // Reset inside loop.
      .addImm(VGPRIndexMode::SRC0_ENABLE);
    SetOn->getOperand(3).setIsUndef();

    // Disable again after the loop.
    BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
  }

  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
  MachineBasicBlock *LoopBB = InsPt->getParent();

  if (UseGPRIdxMode) {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
      .addReg(SrcReg, RegState::Undef, SubReg)
      .addReg(SrcReg, RegState::Implicit)
      .addReg(AMDGPU::M0, RegState::Implicit);
  } else {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
      .addReg(SrcReg, RegState::Undef, SubReg)
      .addReg(SrcReg, RegState::Implicit);
  }

  MI.eraseFromParent();

  return LoopBB;
}

// Select the V_MOVRELD_B32 pseudo that matches the byte size of \p VecRC
// (4, 8, 16, 32 or 64 bytes map to the V1, V2, V4, V8 and V16 variants).
static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC) {
  switch (VecRC->getSize()) {
  case 4:
    return AMDGPU::V_MOVRELD_B32_V1;
  case 8:
    return AMDGPU::V_MOVRELD_B32_V2;
  case 16:
    return AMDGPU::V_MOVRELD_B32_V4;
  case 32:
    return AMDGPU::V_MOVRELD_B32_V8;
  case 64:
    return AMDGPU::V_MOVRELD_B32_V16;
  default:
    llvm_unreachable("unsupported size for MOVRELD pseudos");
  }
}

// Expand an SI_INDIRECT_DST pseudo: operand 0 receives the `src` vector with
// the element selected by `idx` plus the constant `offset` replaced by `val`.
// Three cases are handled: no index register (plain INSERT_SUBREG), an SGPR
// index (single move in place) and a VGPR index (move inside a loop). Returns
// the block in which code following \p MI now lives.
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const SISubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned Dst = MI.getOperand(0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());

  // This can be an immediate, but will be folded later.
  assert(Val->getReg());

  unsigned SubReg;
  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
                                                         SrcVec->getReg(),
                                                         Offset);
  bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;

  // No index register: the element is fully described by SubReg.
  if (Idx->getReg() == AMDGPU::NoRegister) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    assert(Offset == 0);

    BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
      .addOperand(*SrcVec)
      .addOperand(*Val)
      .addImm(SubReg);

    MI.eraseFromParent();
    return &MBB;
  }

  // SGPR index: no loop is required, the move is emitted in place.
  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
        .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
        .addOperand(*Val)
        .addReg(Dst, RegState::ImplicitDefine)
        .addReg(SrcVec->getReg(), RegState::Implicit)
        .addReg(AMDGPU::M0, RegState::Implicit);

      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    } else {
      const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));

      BuildMI(MBB, I, DL, MovRelDesc)
        .addReg(Dst, RegState::Define)
        .addReg(SrcVec->getReg())
        .addOperand(*Val)
        .addImm(SubReg - AMDGPU::sub0);
    }

    MI.eraseFromParent();
    return &MBB;
  }

  // VGPR index: the value is used inside the loop emitted below, so its kill
  // flags are dropped.
  if (Val->isReg())
    MRI.clearKillFlags(Val->getReg());

  const DebugLoc &DL = MI.getDebugLoc();

  if (UseGPRIdxMode) {
    MachineBasicBlock::iterator I(&MI);

    MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
      .addImm(0) // Reset inside loop.
      .addImm(VGPRIndexMode::DST_ENABLE);
    SetOn->getOperand(3).setIsUndef();

    // Disable again after the loop.
    BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
  }

  unsigned PhiReg = MRI.createVirtualRegister(VecRC);

  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
                              Offset, UseGPRIdxMode);
  MachineBasicBlock *LoopBB = InsPt->getParent();

  if (UseGPRIdxMode) {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
      .addReg(PhiReg, RegState::Undef, SubReg) // vdst
      .addOperand(*Val)                        // src0
      .addReg(Dst, RegState::ImplicitDefine)
      .addReg(PhiReg, RegState::Implicit)
      .addReg(AMDGPU::M0, RegState::Implicit);
  } else {
    const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));

    BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
      .addReg(Dst, RegState::Define)
      .addReg(PhiReg)
      .addOperand(*Val)
      .addImm(SubReg - AMDGPU::sub0);
  }

  MI.eraseFromParent();

  return LoopBB;
}

// Expand the pseudo instructions handled by this target: SI_INIT_M0,
// GET_GROUPSTATICSIZE, the SI_INDIRECT_SRC/DST families, SI_KILL and
// V_CNDMASK_B64_PSEUDO. Every other opcode is forwarded to
// AMDGPUTargetLowering. Returns the block in which code following \p MI now
// lives.
MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
  MachineInstr &MI, MachineBasicBlock *BB) const {
  switch (MI.getOpcode()) {
  case AMDGPU::SI_INIT_M0: {
    // Becomes a plain S_MOV_B32 of operand 0 into M0.
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addOperand(MI.getOperand(0));
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::GET_GROUPSTATICSIZE: {
    // Becomes an S_MOV_B32 of the LDS size recorded in the function info.
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    MachineFunction *MF = BB->getParent();
    SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    DebugLoc DL = MI.getDebugLoc();
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
        .addOperand(MI.getOperand(0))
        .addImm(MFI->getLDSSize());
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V16:
    return emitIndirectSrc(MI, *BB, *getSubtarget());
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V16:
    return emitIndirectDst(MI, *BB, *getSubtarget());
  case AMDGPU::SI_KILL:
    return splitKillBlock(MI, BB);
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    // Split the 64-bit select into two V_CNDMASK_B32_e64 on the sub0/sub1
    // halves and recombine the results with a REG_SEQUENCE.
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    unsigned Dst = MI.getOperand(0).getReg();
    unsigned Src0 = MI.getOperand(1).getReg();
    unsigned Src1 = MI.getOperand(2).getReg();
    const DebugLoc &DL = MI.getDebugLoc();
    unsigned SrcCond = MI.getOperand(3).getReg();

    unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
      .addReg(Src0, 0, AMDGPU::sub0)
      .addReg(Src1, 0, AMDGPU::sub0)
      .addReg(SrcCond);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
      .addReg(Src0, 0, AMDGPU::sub1)
      .addReg(Src1, 0, AMDGPU::sub1)
      .addReg(SrcCond);

    BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
      .addReg(DstLo)
      .addImm(AMDGPU::sub0)
      .addReg(DstHi)
      .addImm(AMDGPU::sub1);
    MI.eraseFromParent();
    return BB;
  }
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  }
}

bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  // This currently forces unfolding various combinations of fsub into fma with
  // free fneg'd operands. As long as we have fast FMA (controlled by
  // isFMAFasterThanFMulAndFAdd), we should perform these.
1762 1763 // When fma is quarter rate, for f64 where add / sub are at best half rate, 1764 // most of these combines appear to be cycle neutral but save on instruction 1765 // count / code size. 1766 return true; 1767 } 1768 1769 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, 1770 EVT VT) const { 1771 if (!VT.isVector()) { 1772 return MVT::i1; 1773 } 1774 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); 1775 } 1776 1777 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const { 1778 return MVT::i32; 1779 } 1780 1781 // Answering this is somewhat tricky and depends on the specific device which 1782 // have different rates for fma or all f64 operations. 1783 // 1784 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other 1785 // regardless of which device (although the number of cycles differs between 1786 // devices), so it is always profitable for f64. 1787 // 1788 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable 1789 // only on full rate devices. Normally, we should prefer selecting v_mad_f32 1790 // which we can always do even without fused FP ops since it returns the same 1791 // result as the separate operations and since it is always full 1792 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 1793 // however does not support denormals, so we do report fma as faster if we have 1794 // a fast fma device and require denormals. 1795 // 1796 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 1797 VT = VT.getScalarType(); 1798 1799 if (!VT.isSimple()) 1800 return false; 1801 1802 switch (VT.getSimpleVT().SimpleTy) { 1803 case MVT::f32: 1804 // This is as fast on some subtargets. However, we always have full rate f32 1805 // mad available which returns the same result as the separate operations 1806 // which we should prefer over fma. 
We can't use this if we want to support 1807 // denormals, so only report this in these cases. 1808 return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); 1809 case MVT::f64: 1810 return true; 1811 default: 1812 break; 1813 } 1814 1815 return false; 1816 } 1817 1818 //===----------------------------------------------------------------------===// 1819 // Custom DAG Lowering Operations 1820 //===----------------------------------------------------------------------===// 1821 1822 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 1823 switch (Op.getOpcode()) { 1824 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 1825 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 1826 case ISD::LOAD: { 1827 SDValue Result = LowerLOAD(Op, DAG); 1828 assert((!Result.getNode() || 1829 Result.getNode()->getNumValues() == 2) && 1830 "Load should return a value and a chain"); 1831 return Result; 1832 } 1833 1834 case ISD::FSIN: 1835 case ISD::FCOS: 1836 return LowerTrig(Op, DAG); 1837 case ISD::SELECT: return LowerSELECT(Op, DAG); 1838 case ISD::FDIV: return LowerFDIV(Op, DAG); 1839 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG); 1840 case ISD::STORE: return LowerSTORE(Op, DAG); 1841 case ISD::GlobalAddress: { 1842 MachineFunction &MF = DAG.getMachineFunction(); 1843 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1844 return LowerGlobalAddress(MFI, Op, DAG); 1845 } 1846 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 1847 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); 1848 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); 1849 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); 1850 case ISD::TRAP: return lowerTRAP(Op, DAG); 1851 case ISD::FP_ROUND: 1852 return lowerFP_ROUND(Op, DAG); 1853 } 1854 return SDValue(); 1855 } 1856 1857 /// \brief Helper function for LowerBRCOND 1858 static SDNode *findUser(SDValue Value, unsigned Opcode) 
{ 1859 1860 SDNode *Parent = Value.getNode(); 1861 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); 1862 I != E; ++I) { 1863 1864 if (I.getUse().get() != Value) 1865 continue; 1866 1867 if (I->getOpcode() == Opcode) 1868 return *I; 1869 } 1870 return nullptr; 1871 } 1872 1873 bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { 1874 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) { 1875 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) { 1876 case AMDGPUIntrinsic::amdgcn_if: 1877 case AMDGPUIntrinsic::amdgcn_else: 1878 case AMDGPUIntrinsic::amdgcn_end_cf: 1879 case AMDGPUIntrinsic::amdgcn_loop: 1880 return true; 1881 default: 1882 return false; 1883 } 1884 } 1885 1886 if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) { 1887 switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) { 1888 case AMDGPUIntrinsic::amdgcn_break: 1889 case AMDGPUIntrinsic::amdgcn_if_break: 1890 case AMDGPUIntrinsic::amdgcn_else_break: 1891 return true; 1892 default: 1893 return false; 1894 } 1895 } 1896 1897 return false; 1898 } 1899 1900 void SITargetLowering::createDebuggerPrologueStackObjects( 1901 MachineFunction &MF) const { 1902 // Create stack objects that are used for emitting debugger prologue. 1903 // 1904 // Debugger prologue writes work group IDs and work item IDs to scratch memory 1905 // at fixed location in the following format: 1906 // offset 0: work group ID x 1907 // offset 4: work group ID y 1908 // offset 8: work group ID z 1909 // offset 16: work item ID x 1910 // offset 20: work item ID y 1911 // offset 24: work item ID z 1912 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1913 int ObjectIdx = 0; 1914 1915 // For each dimension: 1916 for (unsigned i = 0; i < 3; ++i) { 1917 // Create fixed stack object for work group ID. 
1918 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true); 1919 Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx); 1920 // Create fixed stack object for work item ID. 1921 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true); 1922 Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx); 1923 } 1924 } 1925 1926 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { 1927 const Triple &TT = getTargetMachine().getTargetTriple(); 1928 return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && 1929 AMDGPU::shouldEmitConstantsToTextSection(TT); 1930 } 1931 1932 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { 1933 return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || 1934 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) && 1935 !shouldEmitFixup(GV) && 1936 !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); 1937 } 1938 1939 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const { 1940 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV); 1941 } 1942 1943 /// This transforms the control flow intrinsics to get the branch destination as 1944 /// last parameter, also switches branch target with BR if the need arise 1945 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, 1946 SelectionDAG &DAG) const { 1947 1948 SDLoc DL(BRCOND); 1949 1950 SDNode *Intr = BRCOND.getOperand(1).getNode(); 1951 SDValue Target = BRCOND.getOperand(2); 1952 SDNode *BR = nullptr; 1953 SDNode *SetCC = nullptr; 1954 1955 if (Intr->getOpcode() == ISD::SETCC) { 1956 // As long as we negate the condition everything is fine 1957 SetCC = Intr; 1958 Intr = SetCC->getOperand(0).getNode(); 1959 1960 } else { 1961 // Get the target from BR if we don't negate the condition 1962 BR = findUser(BRCOND, ISD::BR); 1963 Target = BR->getOperand(1); 1964 } 1965 1966 // FIXME: This changes the types of the intrinsics instead of introducing new 1967 // nodes with the 
correct types. 1968 // e.g. llvm.amdgcn.loop 1969 1970 // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3 1971 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088> 1972 1973 if (!isCFIntrinsic(Intr)) { 1974 // This is a uniform branch so we don't need to legalize. 1975 return BRCOND; 1976 } 1977 1978 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID || 1979 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN; 1980 1981 assert(!SetCC || 1982 (SetCC->getConstantOperandVal(1) == 1 && 1983 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == 1984 ISD::SETNE)); 1985 1986 // operands of the new intrinsic call 1987 SmallVector<SDValue, 4> Ops; 1988 if (HaveChain) 1989 Ops.push_back(BRCOND.getOperand(0)); 1990 1991 Ops.append(Intr->op_begin() + (HaveChain ? 1 : 0), Intr->op_end()); 1992 Ops.push_back(Target); 1993 1994 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); 1995 1996 // build the new intrinsic call 1997 SDNode *Result = DAG.getNode( 1998 Res.size() > 1 ? 
ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, 1999 DAG.getVTList(Res), Ops).getNode(); 2000 2001 if (!HaveChain) { 2002 SDValue Ops[] = { 2003 SDValue(Result, 0), 2004 BRCOND.getOperand(0) 2005 }; 2006 2007 Result = DAG.getMergeValues(Ops, DL).getNode(); 2008 } 2009 2010 if (BR) { 2011 // Give the branch instruction our target 2012 SDValue Ops[] = { 2013 BR->getOperand(0), 2014 BRCOND.getOperand(2) 2015 }; 2016 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); 2017 DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); 2018 BR = NewBR.getNode(); 2019 } 2020 2021 SDValue Chain = SDValue(Result, Result->getNumValues() - 1); 2022 2023 // Copy the intrinsic results to registers 2024 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { 2025 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); 2026 if (!CopyToReg) 2027 continue; 2028 2029 Chain = DAG.getCopyToReg( 2030 Chain, DL, 2031 CopyToReg->getOperand(1), 2032 SDValue(Result, i - 1), 2033 SDValue()); 2034 2035 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); 2036 } 2037 2038 // Remove the old intrinsic from the chain 2039 DAG.ReplaceAllUsesOfValueWith( 2040 SDValue(Intr, Intr->getNumValues() - 1), 2041 Intr->getOperand(0)); 2042 2043 return Chain; 2044 } 2045 2046 SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG, 2047 SDValue Op, 2048 const SDLoc &DL, 2049 EVT VT) const { 2050 return Op.getValueType().bitsLE(VT) ? 
2051 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) : 2052 DAG.getNode(ISD::FTRUNC, DL, VT, Op); 2053 } 2054 2055 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 2056 assert(Op.getValueType() == MVT::f16 && 2057 "Do not know how to custom lower FP_ROUND for non-f16 type"); 2058 2059 SDValue Src = Op.getOperand(0); 2060 EVT SrcVT = Src.getValueType(); 2061 if (SrcVT != MVT::f64) 2062 return Op; 2063 2064 SDLoc DL(Op); 2065 2066 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src); 2067 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); 2068 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);; 2069 } 2070 2071 SDValue SITargetLowering::getSegmentAperture(unsigned AS, 2072 SelectionDAG &DAG) const { 2073 SDLoc SL; 2074 MachineFunction &MF = DAG.getMachineFunction(); 2075 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2076 unsigned UserSGPR = Info->getQueuePtrUserSGPR(); 2077 assert(UserSGPR != AMDGPU::NoRegister); 2078 2079 SDValue QueuePtr = CreateLiveInRegister( 2080 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); 2081 2082 // Offset into amd_queue_t for group_segment_aperture_base_hi / 2083 // private_segment_aperture_base_hi. 2084 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 2085 2086 SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr, 2087 DAG.getConstant(StructOffset, SL, MVT::i64)); 2088 2089 // TODO: Use custom target PseudoSourceValue. 2090 // TODO: We should use the value from the IR intrinsic call, but it might not 2091 // be available and how do we get it? 
2092 Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()), 2093 AMDGPUAS::CONSTANT_ADDRESS)); 2094 2095 MachinePointerInfo PtrInfo(V, StructOffset); 2096 return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo, 2097 MinAlign(64, StructOffset), 2098 MachineMemOperand::MODereferenceable | 2099 MachineMemOperand::MOInvariant); 2100 } 2101 2102 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, 2103 SelectionDAG &DAG) const { 2104 SDLoc SL(Op); 2105 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op); 2106 2107 SDValue Src = ASC->getOperand(0); 2108 2109 // FIXME: Really support non-0 null pointers. 2110 SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32); 2111 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); 2112 2113 // flat -> local/private 2114 if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { 2115 if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 2116 ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { 2117 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); 2118 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); 2119 2120 return DAG.getNode(ISD::SELECT, SL, MVT::i32, 2121 NonNull, Ptr, SegmentNullPtr); 2122 } 2123 } 2124 2125 // local/private -> flat 2126 if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { 2127 if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 2128 ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { 2129 SDValue NonNull 2130 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); 2131 2132 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG); 2133 SDValue CvtPtr 2134 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); 2135 2136 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, 2137 DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr), 2138 FlatNullPtr); 2139 } 2140 } 2141 2142 // global <-> flat are no-ops and never emitted. 
2143 2144 const MachineFunction &MF = DAG.getMachineFunction(); 2145 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 2146 *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); 2147 DAG.getContext()->diagnose(InvalidAddrSpaceCast); 2148 2149 return DAG.getUNDEF(ASC->getValueType(0)); 2150 } 2151 2152 bool 2153 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 2154 // We can fold offsets for anything that doesn't require a GOT relocation. 2155 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || 2156 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) && 2157 !shouldEmitGOTReloc(GA->getGlobal()); 2158 } 2159 2160 static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, 2161 SDLoc DL, unsigned Offset, EVT PtrVT, 2162 unsigned GAFlags = SIInstrInfo::MO_NONE) { 2163 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is 2164 // lowered to the following code sequence: 2165 // 2166 // For constant address space: 2167 // s_getpc_b64 s[0:1] 2168 // s_add_u32 s0, s0, $symbol 2169 // s_addc_u32 s1, s1, 0 2170 // 2171 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2172 // a fixup or relocation is emitted to replace $symbol with a literal 2173 // constant, which is a pc-relative offset from the encoding of the $symbol 2174 // operand to the global variable. 2175 // 2176 // For global address space: 2177 // s_getpc_b64 s[0:1] 2178 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2179 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2180 // 2181 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2182 // fixups or relocations are emitted to replace $symbol@*@lo and 2183 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2184 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2185 // operand to the global variable. 
2186 // 2187 // What we want here is an offset from the value returned by s_getpc 2188 // (which is the address of the s_add_u32 instruction) to the global 2189 // variable, but since the encoding of $symbol starts 4 bytes after the start 2190 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2191 // small. This requires us to add 4 to the global variable offset in order to 2192 // compute the correct address. 2193 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, 2194 GAFlags); 2195 SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, 2196 GAFlags == SIInstrInfo::MO_NONE ? 2197 GAFlags : GAFlags + 1); 2198 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi); 2199 } 2200 2201 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, 2202 SDValue Op, 2203 SelectionDAG &DAG) const { 2204 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); 2205 2206 if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && 2207 GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) 2208 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); 2209 2210 SDLoc DL(GSD); 2211 const GlobalValue *GV = GSD->getGlobal(); 2212 EVT PtrVT = Op.getValueType(); 2213 2214 if (shouldEmitFixup(GV)) 2215 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); 2216 else if (shouldEmitPCReloc(GV)) 2217 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT, 2218 SIInstrInfo::MO_REL32); 2219 2220 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT, 2221 SIInstrInfo::MO_GOTPCREL32); 2222 2223 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext()); 2224 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); 2225 const DataLayout &DataLayout = DAG.getDataLayout(); 2226 unsigned Align = DataLayout.getABITypeAlignment(PtrTy); 2227 // FIXME: Use a PseudoSourceValue once those can be assigned an address space. 
2228 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); 2229 2230 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align, 2231 MachineMemOperand::MODereferenceable | 2232 MachineMemOperand::MOInvariant); 2233 } 2234 2235 SDValue SITargetLowering::lowerTRAP(SDValue Op, 2236 SelectionDAG &DAG) const { 2237 const MachineFunction &MF = DAG.getMachineFunction(); 2238 DiagnosticInfoUnsupported NoTrap(*MF.getFunction(), 2239 "trap handler not supported", 2240 Op.getDebugLoc(), 2241 DS_Warning); 2242 DAG.getContext()->diagnose(NoTrap); 2243 2244 // Emit s_endpgm. 2245 2246 // FIXME: This should really be selected to s_trap, but that requires 2247 // setting up the trap handler for it o do anything. 2248 return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other, 2249 Op.getOperand(0)); 2250 } 2251 2252 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, 2253 const SDLoc &DL, SDValue V) const { 2254 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as 2255 // the destination register. 2256 // 2257 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, 2258 // so we will end up with redundant moves to m0. 2259 // 2260 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result. 2261 2262 // A Null SDValue creates a glue result. 2263 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue, 2264 V, Chain); 2265 return SDValue(M0, 0); 2266 } 2267 2268 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, 2269 SDValue Op, 2270 MVT VT, 2271 unsigned Offset) const { 2272 SDLoc SL(Op); 2273 SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, 2274 DAG.getEntryNode(), Offset, false); 2275 // The local size values will have the hi 16-bits as zero. 
2276 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, 2277 DAG.getValueType(VT)); 2278 } 2279 2280 static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { 2281 DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), 2282 "non-hsa intrinsic with hsa target", 2283 DL.getDebugLoc()); 2284 DAG.getContext()->diagnose(BadIntrin); 2285 return DAG.getUNDEF(VT); 2286 } 2287 2288 static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { 2289 DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), 2290 "intrinsic not supported on subtarget", 2291 DL.getDebugLoc()); 2292 DAG.getContext()->diagnose(BadIntrin); 2293 return DAG.getUNDEF(VT); 2294 } 2295 2296 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 2297 SelectionDAG &DAG) const { 2298 MachineFunction &MF = DAG.getMachineFunction(); 2299 auto MFI = MF.getInfo<SIMachineFunctionInfo>(); 2300 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 2301 2302 EVT VT = Op.getValueType(); 2303 SDLoc DL(Op); 2304 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 2305 2306 // TODO: Should this propagate fast-math-flags? 2307 2308 switch (IntrinsicID) { 2309 case Intrinsic::amdgcn_dispatch_ptr: 2310 case Intrinsic::amdgcn_queue_ptr: { 2311 if (!Subtarget->isAmdCodeObjectV2()) { 2312 DiagnosticInfoUnsupported BadIntrin( 2313 *MF.getFunction(), "unsupported hsa intrinsic without hsa target", 2314 DL.getDebugLoc()); 2315 DAG.getContext()->diagnose(BadIntrin); 2316 return DAG.getUNDEF(VT); 2317 } 2318 2319 auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? 
2320 SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR; 2321 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, 2322 TRI->getPreloadedValue(MF, Reg), VT); 2323 } 2324 case Intrinsic::amdgcn_implicitarg_ptr: { 2325 unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); 2326 return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset); 2327 } 2328 case Intrinsic::amdgcn_kernarg_segment_ptr: { 2329 unsigned Reg 2330 = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); 2331 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); 2332 } 2333 case Intrinsic::amdgcn_dispatch_id: { 2334 unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID); 2335 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); 2336 } 2337 case Intrinsic::amdgcn_rcp: 2338 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); 2339 case Intrinsic::amdgcn_rsq: 2340 case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name 2341 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 2342 case Intrinsic::amdgcn_rsq_legacy: { 2343 if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 2344 return emitRemovedIntrinsicError(DAG, DL, VT); 2345 2346 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); 2347 } 2348 case Intrinsic::amdgcn_rcp_legacy: { 2349 if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 2350 return emitRemovedIntrinsicError(DAG, DL, VT); 2351 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1)); 2352 } 2353 case Intrinsic::amdgcn_rsq_clamp: { 2354 if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) 2355 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); 2356 2357 Type *Type = VT.getTypeForEVT(*DAG.getContext()); 2358 APFloat Max = APFloat::getLargest(Type->getFltSemantics()); 2359 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); 2360 2361 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, 
Op.getOperand(1)); 2362 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, 2363 DAG.getConstantFP(Max, DL, VT)); 2364 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, 2365 DAG.getConstantFP(Min, DL, VT)); 2366 } 2367 case Intrinsic::r600_read_ngroups_x: 2368 if (Subtarget->isAmdHsaOS()) 2369 return emitNonHSAIntrinsicError(DAG, DL, VT); 2370 2371 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2372 SI::KernelInputOffsets::NGROUPS_X, false); 2373 case Intrinsic::r600_read_ngroups_y: 2374 if (Subtarget->isAmdHsaOS()) 2375 return emitNonHSAIntrinsicError(DAG, DL, VT); 2376 2377 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2378 SI::KernelInputOffsets::NGROUPS_Y, false); 2379 case Intrinsic::r600_read_ngroups_z: 2380 if (Subtarget->isAmdHsaOS()) 2381 return emitNonHSAIntrinsicError(DAG, DL, VT); 2382 2383 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2384 SI::KernelInputOffsets::NGROUPS_Z, false); 2385 case Intrinsic::r600_read_global_size_x: 2386 if (Subtarget->isAmdHsaOS()) 2387 return emitNonHSAIntrinsicError(DAG, DL, VT); 2388 2389 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2390 SI::KernelInputOffsets::GLOBAL_SIZE_X, false); 2391 case Intrinsic::r600_read_global_size_y: 2392 if (Subtarget->isAmdHsaOS()) 2393 return emitNonHSAIntrinsicError(DAG, DL, VT); 2394 2395 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2396 SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); 2397 case Intrinsic::r600_read_global_size_z: 2398 if (Subtarget->isAmdHsaOS()) 2399 return emitNonHSAIntrinsicError(DAG, DL, VT); 2400 2401 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 2402 SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); 2403 case Intrinsic::r600_read_local_size_x: 2404 if (Subtarget->isAmdHsaOS()) 2405 return emitNonHSAIntrinsicError(DAG, DL, VT); 2406 2407 return lowerImplicitZextParam(DAG, Op, MVT::i16, 2408 SI::KernelInputOffsets::LOCAL_SIZE_X); 2409 case Intrinsic::r600_read_local_size_y: 2410 if 
(Subtarget->isAmdHsaOS()) 2411 return emitNonHSAIntrinsicError(DAG, DL, VT); 2412 2413 return lowerImplicitZextParam(DAG, Op, MVT::i16, 2414 SI::KernelInputOffsets::LOCAL_SIZE_Y); 2415 case Intrinsic::r600_read_local_size_z: 2416 if (Subtarget->isAmdHsaOS()) 2417 return emitNonHSAIntrinsicError(DAG, DL, VT); 2418 2419 return lowerImplicitZextParam(DAG, Op, MVT::i16, 2420 SI::KernelInputOffsets::LOCAL_SIZE_Z); 2421 case Intrinsic::amdgcn_workgroup_id_x: 2422 case Intrinsic::r600_read_tgid_x: 2423 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, 2424 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); 2425 case Intrinsic::amdgcn_workgroup_id_y: 2426 case Intrinsic::r600_read_tgid_y: 2427 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, 2428 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); 2429 case Intrinsic::amdgcn_workgroup_id_z: 2430 case Intrinsic::r600_read_tgid_z: 2431 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, 2432 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); 2433 case Intrinsic::amdgcn_workitem_id_x: 2434 case Intrinsic::r600_read_tidig_x: 2435 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 2436 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); 2437 case Intrinsic::amdgcn_workitem_id_y: 2438 case Intrinsic::r600_read_tidig_y: 2439 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 2440 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); 2441 case Intrinsic::amdgcn_workitem_id_z: 2442 case Intrinsic::r600_read_tidig_z: 2443 return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, 2444 TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); 2445 case AMDGPUIntrinsic::SI_load_const: { 2446 SDValue Ops[] = { 2447 Op.getOperand(1), 2448 Op.getOperand(2) 2449 }; 2450 2451 MachineMemOperand *MMO = MF.getMachineMemOperand( 2452 MachinePointerInfo(), 2453 MachineMemOperand::MOLoad | 
MachineMemOperand::MODereferenceable | 2454 MachineMemOperand::MOInvariant, 2455 VT.getStoreSize(), 4); 2456 return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, 2457 Op->getVTList(), Ops, VT, MMO); 2458 } 2459 case AMDGPUIntrinsic::amdgcn_fdiv_fast: { 2460 return lowerFDIV_FAST(Op, DAG); 2461 } 2462 case AMDGPUIntrinsic::SI_vs_load_input: 2463 return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, 2464 Op.getOperand(1), 2465 Op.getOperand(2), 2466 Op.getOperand(3)); 2467 2468 case AMDGPUIntrinsic::SI_fs_constant: { 2469 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); 2470 SDValue Glue = M0.getValue(1); 2471 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, 2472 DAG.getConstant(2, DL, MVT::i32), // P0 2473 Op.getOperand(1), Op.getOperand(2), Glue); 2474 } 2475 case AMDGPUIntrinsic::SI_packf16: 2476 if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) 2477 return DAG.getUNDEF(MVT::i32); 2478 return Op; 2479 case AMDGPUIntrinsic::SI_fs_interp: { 2480 SDValue IJ = Op.getOperand(4); 2481 SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, 2482 DAG.getConstant(0, DL, MVT::i32)); 2483 SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, 2484 DAG.getConstant(1, DL, MVT::i32)); 2485 I = DAG.getNode(ISD::BITCAST, DL, MVT::f32, I); 2486 J = DAG.getNode(ISD::BITCAST, DL, MVT::f32, J); 2487 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); 2488 SDValue Glue = M0.getValue(1); 2489 SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, 2490 DAG.getVTList(MVT::f32, MVT::Glue), 2491 I, Op.getOperand(1), Op.getOperand(2), Glue); 2492 Glue = SDValue(P1.getNode(), 1); 2493 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, 2494 Op.getOperand(1), Op.getOperand(2), Glue); 2495 } 2496 case Intrinsic::amdgcn_interp_mov: { 2497 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); 2498 SDValue Glue = M0.getValue(1); 2499 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, 
Op.getOperand(1), 2500 Op.getOperand(2), Op.getOperand(3), Glue); 2501 } 2502 case Intrinsic::amdgcn_interp_p1: { 2503 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); 2504 SDValue Glue = M0.getValue(1); 2505 return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), 2506 Op.getOperand(2), Op.getOperand(3), Glue); 2507 } 2508 case Intrinsic::amdgcn_interp_p2: { 2509 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); 2510 SDValue Glue = SDValue(M0.getNode(), 1); 2511 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), 2512 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), 2513 Glue); 2514 } 2515 case Intrinsic::amdgcn_sin: 2516 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); 2517 2518 case Intrinsic::amdgcn_cos: 2519 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); 2520 2521 case Intrinsic::amdgcn_log_clamp: { 2522 if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) 2523 return SDValue(); 2524 2525 DiagnosticInfoUnsupported BadIntrin( 2526 *MF.getFunction(), "intrinsic not supported on subtarget", 2527 DL.getDebugLoc()); 2528 DAG.getContext()->diagnose(BadIntrin); 2529 return DAG.getUNDEF(VT); 2530 } 2531 case Intrinsic::amdgcn_ldexp: 2532 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, 2533 Op.getOperand(1), Op.getOperand(2)); 2534 2535 case Intrinsic::amdgcn_fract: 2536 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); 2537 2538 case Intrinsic::amdgcn_class: 2539 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, 2540 Op.getOperand(1), Op.getOperand(2)); 2541 case Intrinsic::amdgcn_div_fmas: 2542 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, 2543 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), 2544 Op.getOperand(4)); 2545 2546 case Intrinsic::amdgcn_div_fixup: 2547 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, 2548 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 2549 2550 case Intrinsic::amdgcn_trig_preop: 2551 
return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, 2552 Op.getOperand(1), Op.getOperand(2)); 2553 case Intrinsic::amdgcn_div_scale: { 2554 // 3rd parameter required to be a constant. 2555 const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 2556 if (!Param) 2557 return DAG.getUNDEF(VT); 2558 2559 // Translate to the operands expected by the machine instruction. The 2560 // first parameter must be the same as the first instruction. 2561 SDValue Numerator = Op.getOperand(1); 2562 SDValue Denominator = Op.getOperand(2); 2563 2564 // Note this order is opposite of the machine instruction's operations, 2565 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The 2566 // intrinsic has the numerator as the first operand to match a normal 2567 // division operation. 2568 2569 SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator; 2570 2571 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, 2572 Denominator, Numerator); 2573 } 2574 case Intrinsic::amdgcn_icmp: { 2575 const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 2576 int CondCode = CD->getSExtValue(); 2577 2578 if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE || 2579 CondCode >= ICmpInst::Predicate::BAD_ICMP_PREDICATE) 2580 return DAG.getUNDEF(VT); 2581 2582 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); 2583 ISD::CondCode CCOpcode = getICmpCondCode(IcInput); 2584 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1), 2585 Op.getOperand(2), DAG.getCondCode(CCOpcode)); 2586 } 2587 case Intrinsic::amdgcn_fcmp: { 2588 const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 2589 int CondCode = CD->getSExtValue(); 2590 2591 if (CondCode <= FCmpInst::Predicate::FCMP_FALSE || 2592 CondCode >= FCmpInst::Predicate::FCMP_TRUE) 2593 return DAG.getUNDEF(VT); 2594 2595 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode); 2596 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput); 2597 return 
DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1), 2598 Op.getOperand(2), DAG.getCondCode(CCOpcode)); 2599 } 2600 case Intrinsic::amdgcn_fmul_legacy: 2601 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, 2602 Op.getOperand(1), Op.getOperand(2)); 2603 case Intrinsic::amdgcn_sffbh: 2604 case AMDGPUIntrinsic::AMDGPU_flbit_i32: // Legacy name. 2605 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1)); 2606 default: 2607 return AMDGPUTargetLowering::LowerOperation(Op, DAG); 2608 } 2609 } 2610 2611 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, 2612 SelectionDAG &DAG) const { 2613 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 2614 switch (IntrID) { 2615 case Intrinsic::amdgcn_atomic_inc: 2616 case Intrinsic::amdgcn_atomic_dec: { 2617 MemSDNode *M = cast<MemSDNode>(Op); 2618 unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ? 2619 AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC; 2620 SDValue Ops[] = { 2621 M->getOperand(0), // Chain 2622 M->getOperand(2), // Ptr 2623 M->getOperand(3) // Value 2624 }; 2625 2626 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops, 2627 M->getMemoryVT(), M->getMemOperand()); 2628 } 2629 default: 2630 return SDValue(); 2631 } 2632 } 2633 2634 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, 2635 SelectionDAG &DAG) const { 2636 MachineFunction &MF = DAG.getMachineFunction(); 2637 SDLoc DL(Op); 2638 SDValue Chain = Op.getOperand(0); 2639 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 2640 2641 switch (IntrinsicID) { 2642 case AMDGPUIntrinsic::SI_sendmsg: { 2643 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); 2644 SDValue Glue = Chain.getValue(1); 2645 return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, 2646 Op.getOperand(2), Glue); 2647 } 2648 case AMDGPUIntrinsic::SI_tbuffer_store: { 2649 SDValue Ops[] = { 2650 Chain, 2651 Op.getOperand(2), 2652 Op.getOperand(3), 2653 Op.getOperand(4), 2654 Op.getOperand(5), 
2655 Op.getOperand(6), 2656 Op.getOperand(7), 2657 Op.getOperand(8), 2658 Op.getOperand(9), 2659 Op.getOperand(10), 2660 Op.getOperand(11), 2661 Op.getOperand(12), 2662 Op.getOperand(13), 2663 Op.getOperand(14) 2664 }; 2665 2666 EVT VT = Op.getOperand(3).getValueType(); 2667 2668 MachineMemOperand *MMO = MF.getMachineMemOperand( 2669 MachinePointerInfo(), 2670 MachineMemOperand::MOStore, 2671 VT.getStoreSize(), 4); 2672 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, 2673 Op->getVTList(), Ops, VT, MMO); 2674 } 2675 case AMDGPUIntrinsic::AMDGPU_kill: { 2676 SDValue Src = Op.getOperand(2); 2677 if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) { 2678 if (!K->isNegative()) 2679 return Chain; 2680 2681 SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32); 2682 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne); 2683 } 2684 2685 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src); 2686 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast); 2687 } 2688 case AMDGPUIntrinsic::SI_export: { 2689 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(2)); 2690 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(3)); 2691 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(4)); 2692 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(5)); 2693 const ConstantSDNode *Compr = cast<ConstantSDNode>(Op.getOperand(6)); 2694 2695 const SDValue Ops[] = { 2696 Chain, 2697 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), 2698 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1), 2699 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), 2700 DAG.getTargetConstant(Compr->getZExtValue(), DL, MVT::i1), 2701 Op.getOperand(7), // src0 2702 Op.getOperand(8), // src1 2703 Op.getOperand(9), // src2 2704 Op.getOperand(10) // src3 2705 }; 2706 2707 unsigned Opc = Done->isNullValue() ? 
2708 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; 2709 return DAG.getNode(Opc, DL, Op->getVTList(), Ops); 2710 } 2711 default: 2712 return SDValue(); 2713 } 2714 } 2715 2716 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 2717 SDLoc DL(Op); 2718 LoadSDNode *Load = cast<LoadSDNode>(Op); 2719 ISD::LoadExtType ExtType = Load->getExtensionType(); 2720 EVT MemVT = Load->getMemoryVT(); 2721 2722 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { 2723 // FIXME: Copied from PPC 2724 // First, load into 32 bits, then truncate to 1 bit. 2725 2726 SDValue Chain = Load->getChain(); 2727 SDValue BasePtr = Load->getBasePtr(); 2728 MachineMemOperand *MMO = Load->getMemOperand(); 2729 2730 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16; 2731 2732 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, 2733 BasePtr, RealMemVT, MMO); 2734 2735 SDValue Ops[] = { 2736 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), 2737 NewLD.getValue(1) 2738 }; 2739 2740 return DAG.getMergeValues(Ops, DL); 2741 } 2742 2743 if (!MemVT.isVector()) 2744 return SDValue(); 2745 2746 assert(Op.getValueType().getVectorElementType() == MVT::i32 && 2747 "Custom lowering for non-i32 vectors hasn't been implemented."); 2748 2749 unsigned AS = Load->getAddressSpace(); 2750 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, 2751 AS, Load->getAlignment())) { 2752 SDValue Ops[2]; 2753 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); 2754 return DAG.getMergeValues(Ops, DL); 2755 } 2756 2757 MachineFunction &MF = DAG.getMachineFunction(); 2758 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2759 // If there is a possibilty that flat instruction access scratch memory 2760 // then we need to use the same legalization rules we use for private. 2761 if (AS == AMDGPUAS::FLAT_ADDRESS) 2762 AS = MFI->hasFlatScratchInit() ? 
2763 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; 2764 2765 unsigned NumElements = MemVT.getVectorNumElements(); 2766 switch (AS) { 2767 case AMDGPUAS::CONSTANT_ADDRESS: 2768 if (isMemOpUniform(Load)) 2769 return SDValue(); 2770 // Non-uniform loads will be selected to MUBUF instructions, so they 2771 // have the same legalization requirements as global and private 2772 // loads. 2773 // 2774 LLVM_FALLTHROUGH; 2775 case AMDGPUAS::GLOBAL_ADDRESS: { 2776 if (isMemOpUniform(Load) && isMemOpHasNoClobberedMemOperand(Load)) 2777 return SDValue(); 2778 // Non-uniform loads will be selected to MUBUF instructions, so they 2779 // have the same legalization requirements as global and private 2780 // loads. 2781 // 2782 } 2783 LLVM_FALLTHROUGH; 2784 case AMDGPUAS::FLAT_ADDRESS: 2785 if (NumElements > 4) 2786 return SplitVectorLoad(Op, DAG); 2787 // v4 loads are supported for private and global memory. 2788 return SDValue(); 2789 case AMDGPUAS::PRIVATE_ADDRESS: { 2790 // Depending on the setting of the private_element_size field in the 2791 // resource descriptor, we can only make private accesses up to a certain 2792 // size. 2793 switch (Subtarget->getMaxPrivateElementSize()) { 2794 case 4: 2795 return scalarizeVectorLoad(Load, DAG); 2796 case 8: 2797 if (NumElements > 2) 2798 return SplitVectorLoad(Op, DAG); 2799 return SDValue(); 2800 case 16: 2801 // Same as global/flat 2802 if (NumElements > 4) 2803 return SplitVectorLoad(Op, DAG); 2804 return SDValue(); 2805 default: 2806 llvm_unreachable("unsupported private_element_size"); 2807 } 2808 } 2809 case AMDGPUAS::LOCAL_ADDRESS: { 2810 if (NumElements > 2) 2811 return SplitVectorLoad(Op, DAG); 2812 2813 if (NumElements == 2) 2814 return SDValue(); 2815 2816 // If properly aligned, if we split we might be able to use ds_read_b64. 
2817 return SplitVectorLoad(Op, DAG); 2818 } 2819 default: 2820 return SDValue(); 2821 } 2822 } 2823 2824 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 2825 if (Op.getValueType() != MVT::i64) 2826 return SDValue(); 2827 2828 SDLoc DL(Op); 2829 SDValue Cond = Op.getOperand(0); 2830 2831 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 2832 SDValue One = DAG.getConstant(1, DL, MVT::i32); 2833 2834 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); 2835 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); 2836 2837 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); 2838 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); 2839 2840 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); 2841 2842 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); 2843 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); 2844 2845 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); 2846 2847 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); 2848 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); 2849 } 2850 2851 // Catch division cases where we can use shortcuts with rcp and rsq 2852 // instructions. 2853 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, 2854 SelectionDAG &DAG) const { 2855 SDLoc SL(Op); 2856 SDValue LHS = Op.getOperand(0); 2857 SDValue RHS = Op.getOperand(1); 2858 EVT VT = Op.getValueType(); 2859 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; 2860 2861 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { 2862 if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()))) { 2863 2864 if (CLHS->isExactlyValue(1.0)) { 2865 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 2866 // the CI documentation has a worst case error of 1 ulp. 
2867 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 2868 // use it as long as we aren't trying to use denormals. 2869 2870 // 1.0 / sqrt(x) -> rsq(x) 2871 // 2872 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP 2873 // error seems really high at 2^29 ULP. 2874 if (RHS.getOpcode() == ISD::FSQRT) 2875 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); 2876 2877 // 1.0 / x -> rcp(x) 2878 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 2879 } 2880 2881 // Same as for 1.0, but expand the sign out of the constant. 2882 if (CLHS->isExactlyValue(-1.0)) { 2883 // -1.0 / x -> rcp (fneg x) 2884 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 2885 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS); 2886 } 2887 } 2888 } 2889 2890 const SDNodeFlags *Flags = Op->getFlags(); 2891 2892 if (Unsafe || Flags->hasAllowReciprocal()) { 2893 // Turn into multiply by the reciprocal. 2894 // x / y -> x * (1.0 / y) 2895 SDNodeFlags Flags; 2896 Flags.setUnsafeAlgebra(true); 2897 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 2898 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags); 2899 } 2900 2901 return SDValue(); 2902 } 2903 2904 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, 2905 EVT VT, SDValue A, SDValue B, SDValue GlueChain) { 2906 if (GlueChain->getNumValues() <= 1) { 2907 return DAG.getNode(Opcode, SL, VT, A, B); 2908 } 2909 2910 assert(GlueChain->getNumValues() == 3); 2911 2912 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); 2913 switch (Opcode) { 2914 default: llvm_unreachable("no chain equivalent for opcode"); 2915 case ISD::FMUL: 2916 Opcode = AMDGPUISD::FMUL_W_CHAIN; 2917 break; 2918 } 2919 2920 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, 2921 GlueChain.getValue(2)); 2922 } 2923 2924 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, 2925 EVT VT, SDValue A, SDValue B, SDValue C, 2926 SDValue 
GlueChain) { 2927 if (GlueChain->getNumValues() <= 1) { 2928 return DAG.getNode(Opcode, SL, VT, A, B, C); 2929 } 2930 2931 assert(GlueChain->getNumValues() == 3); 2932 2933 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); 2934 switch (Opcode) { 2935 default: llvm_unreachable("no chain equivalent for opcode"); 2936 case ISD::FMA: 2937 Opcode = AMDGPUISD::FMA_W_CHAIN; 2938 break; 2939 } 2940 2941 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C, 2942 GlueChain.getValue(2)); 2943 } 2944 2945 // Faster 2.5 ULP division that does not support denormals. 2946 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { 2947 SDLoc SL(Op); 2948 SDValue LHS = Op.getOperand(1); 2949 SDValue RHS = Op.getOperand(2); 2950 2951 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); 2952 2953 const APFloat K0Val(BitsToFloat(0x6f800000)); 2954 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); 2955 2956 const APFloat K1Val(BitsToFloat(0x2f800000)); 2957 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); 2958 2959 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 2960 2961 EVT SetCCVT = 2962 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); 2963 2964 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); 2965 2966 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); 2967 2968 // TODO: Should this propagate fast-math-flags? 2969 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); 2970 2971 // rcp does not support denormals. 
2972 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); 2973 2974 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); 2975 2976 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); 2977 } 2978 2979 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { 2980 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) 2981 return FastLowered; 2982 2983 SDLoc SL(Op); 2984 SDValue LHS = Op.getOperand(0); 2985 SDValue RHS = Op.getOperand(1); 2986 2987 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 2988 2989 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); 2990 2991 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, 2992 RHS, RHS, LHS); 2993 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, 2994 LHS, RHS, LHS); 2995 2996 // Denominator is scaled to not be denormal, so using rcp is ok. 2997 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, 2998 DenominatorScaled); 2999 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, 3000 DenominatorScaled); 3001 3002 const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE | 3003 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3004 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3005 3006 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16); 3007 3008 if (!Subtarget->hasFP32Denormals()) { 3009 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 3010 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, 3011 SL, MVT::i32); 3012 SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, 3013 DAG.getEntryNode(), 3014 EnableDenormValue, BitField); 3015 SDValue Ops[3] = { 3016 NegDivScale0, 3017 EnableDenorm.getValue(0), 3018 EnableDenorm.getValue(1) 3019 }; 3020 3021 NegDivScale0 = DAG.getMergeValues(Ops, SL); 3022 } 3023 3024 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, 3025 ApproxRcp, One, NegDivScale0); 3026 3027 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, 
SL, MVT::f32, Fma0, ApproxRcp,
                             ApproxRcp, Fma0);

  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
                           Fma1, Fma1);

  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
                             NumeratorScaled, Mul);

  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);

  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                             NumeratorScaled, Fma3);

  if (!Subtarget->hasFP32Denormals()) {
    // Write the flush-in/flush-out denormal mode back. The SETREG consumes
    // the chain and glue results of Fma4, and its chain is merged into the
    // DAG root so the node stays live.
    const SDValue DisableDenormValue =
        DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
    SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
                                        Fma4.getValue(1),
                                        DisableDenormValue,
                                        BitField,
                                        Fma4.getValue(2));

    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                      DisableDenorm, DAG.getRoot());
    DAG.setRoot(OutputChain);
  }

  // Result 1 of the numerator DIV_SCALE is the i1 scale flag for DIV_FMAS.
  SDValue Scale = NumeratorScaled.getValue(1);
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
                             Fma4, Fma1, Fma3, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
}

/// Lower an f64 fdiv. With UnsafeFPMath enabled the result of
/// lowerFastUnsafeFDIV is returned directly. Otherwise the operands are
/// scaled with DIV_SCALE, an RCP of the scaled denominator is refined through
/// a sequence of FMA nodes, and the quotient is finished with DIV_FMAS and
/// DIV_FIXUP.
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getTarget().Options.UnsafeFPMath)
    return lowerFastUnsafeFDIV(Op, DAG);

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0); // Numerator.
  SDValue Y = Op.getOperand(1); // Denominator.

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  // DIV_SCALE yields the scaled f64 value plus an i1 flag.
  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  // Scaled denominator (first operand is Y).
  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);

  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);

  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);

  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);

  // Scaled numerator (first operand is X).
  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);

  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);

  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
                             NegDivScale0, Mul, DivScale1);

  SDValue Scale;

  if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    // Index of the high 32-bit element of an f64 viewed as v2i32.
    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);

    // Figure out the scale to use for div_fmas.
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

    SDValue Scale0Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    SDValue Scale1Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

    // The flag is rebuilt by comparing the high words of the original and
    // scaled operands: set when exactly one of the two comparisons is equal.
    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
  } else {
    // Other generations use the i1 result of the numerator DIV_SCALE.
    Scale = DivScale1.getValue(1);
  }

  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
                             Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}

SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

3132 if (VT == MVT::f32) 3133 return LowerFDIV32(Op, DAG); 3134 3135 if (VT == MVT::f64) 3136 return LowerFDIV64(Op, DAG); 3137 3138 llvm_unreachable("Unexpected type for fdiv"); 3139 } 3140 3141 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 3142 SDLoc DL(Op); 3143 StoreSDNode *Store = cast<StoreSDNode>(Op); 3144 EVT VT = Store->getMemoryVT(); 3145 3146 if (VT == MVT::i1) { 3147 return DAG.getTruncStore(Store->getChain(), DL, 3148 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), 3149 Store->getBasePtr(), MVT::i1, Store->getMemOperand()); 3150 } 3151 3152 assert(VT.isVector() && 3153 Store->getValue().getValueType().getScalarType() == MVT::i32); 3154 3155 unsigned AS = Store->getAddressSpace(); 3156 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, 3157 AS, Store->getAlignment())) { 3158 return expandUnalignedStore(Store, DAG); 3159 } 3160 3161 MachineFunction &MF = DAG.getMachineFunction(); 3162 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 3163 // If there is a possibilty that flat instruction access scratch memory 3164 // then we need to use the same legalization rules we use for private. 3165 if (AS == AMDGPUAS::FLAT_ADDRESS) 3166 AS = MFI->hasFlatScratchInit() ? 
3167 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; 3168 3169 unsigned NumElements = VT.getVectorNumElements(); 3170 switch (AS) { 3171 case AMDGPUAS::GLOBAL_ADDRESS: 3172 case AMDGPUAS::FLAT_ADDRESS: 3173 if (NumElements > 4) 3174 return SplitVectorStore(Op, DAG); 3175 return SDValue(); 3176 case AMDGPUAS::PRIVATE_ADDRESS: { 3177 switch (Subtarget->getMaxPrivateElementSize()) { 3178 case 4: 3179 return scalarizeVectorStore(Store, DAG); 3180 case 8: 3181 if (NumElements > 2) 3182 return SplitVectorStore(Op, DAG); 3183 return SDValue(); 3184 case 16: 3185 if (NumElements > 4) 3186 return SplitVectorStore(Op, DAG); 3187 return SDValue(); 3188 default: 3189 llvm_unreachable("unsupported private_element_size"); 3190 } 3191 } 3192 case AMDGPUAS::LOCAL_ADDRESS: { 3193 if (NumElements > 2) 3194 return SplitVectorStore(Op, DAG); 3195 3196 if (NumElements == 2) 3197 return Op; 3198 3199 // If properly aligned, if we split we might be able to use ds_write_b64. 3200 return SplitVectorStore(Op, DAG); 3201 } 3202 default: 3203 llvm_unreachable("unhandled address space"); 3204 } 3205 } 3206 3207 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 3208 SDLoc DL(Op); 3209 EVT VT = Op.getValueType(); 3210 SDValue Arg = Op.getOperand(0); 3211 // TODO: Should this propagate fast-math-flags? 
3212 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, 3213 DAG.getNode(ISD::FMUL, DL, VT, Arg, 3214 DAG.getConstantFP(0.5/M_PI, DL, 3215 VT))); 3216 3217 switch (Op.getOpcode()) { 3218 case ISD::FCOS: 3219 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); 3220 case ISD::FSIN: 3221 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); 3222 default: 3223 llvm_unreachable("Wrong trig opcode"); 3224 } 3225 } 3226 3227 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 3228 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op); 3229 assert(AtomicNode->isCompareAndSwap()); 3230 unsigned AS = AtomicNode->getAddressSpace(); 3231 3232 // No custom lowering required for local address space 3233 if (!isFlatGlobalAddrSpace(AS)) 3234 return Op; 3235 3236 // Non-local address space requires custom lowering for atomic compare 3237 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2 3238 SDLoc DL(Op); 3239 SDValue ChainIn = Op.getOperand(0); 3240 SDValue Addr = Op.getOperand(1); 3241 SDValue Old = Op.getOperand(2); 3242 SDValue New = Op.getOperand(3); 3243 EVT VT = Op.getValueType(); 3244 MVT SimpleVT = VT.getSimpleVT(); 3245 MVT VecType = MVT::getVectorVT(SimpleVT, 2); 3246 3247 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old}); 3248 SDValue Ops[] = { ChainIn, Addr, NewOld }; 3249 3250 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(), 3251 Ops, VT, AtomicNode->getMemOperand()); 3252 } 3253 3254 //===----------------------------------------------------------------------===// 3255 // Custom DAG optimizations 3256 //===----------------------------------------------------------------------===// 3257 3258 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, 3259 DAGCombinerInfo &DCI) const { 3260 EVT VT = N->getValueType(0); 3261 EVT ScalarVT = VT.getScalarType(); 3262 if (ScalarVT != MVT::f32) 3263 return SDValue(); 3264 3265 SelectionDAG &DAG = 
DCI.DAG; 3266 SDLoc DL(N); 3267 3268 SDValue Src = N->getOperand(0); 3269 EVT SrcVT = Src.getValueType(); 3270 3271 // TODO: We could try to match extracting the higher bytes, which would be 3272 // easier if i8 vectors weren't promoted to i32 vectors, particularly after 3273 // types are legalized. v4i8 -> v4f32 is probably the only case to worry 3274 // about in practice. 3275 if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { 3276 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { 3277 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); 3278 DCI.AddToWorklist(Cvt.getNode()); 3279 return Cvt; 3280 } 3281 } 3282 3283 return SDValue(); 3284 } 3285 3286 /// \brief Return true if the given offset Size in bytes can be folded into 3287 /// the immediate offsets of a memory instruction for the given address space. 3288 static bool canFoldOffset(unsigned OffsetSize, unsigned AS, 3289 const SISubtarget &STI) { 3290 switch (AS) { 3291 case AMDGPUAS::GLOBAL_ADDRESS: { 3292 // MUBUF instructions a 12-bit offset in bytes. 3293 return isUInt<12>(OffsetSize); 3294 } 3295 case AMDGPUAS::CONSTANT_ADDRESS: { 3296 // SMRD instructions have an 8-bit offset in dwords on SI and 3297 // a 20-bit offset in bytes on VI. 3298 if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 3299 return isUInt<20>(OffsetSize); 3300 else 3301 return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); 3302 } 3303 case AMDGPUAS::LOCAL_ADDRESS: 3304 case AMDGPUAS::REGION_ADDRESS: { 3305 // The single offset versions have a 16-bit offset in bytes. 3306 return isUInt<16>(OffsetSize); 3307 } 3308 case AMDGPUAS::PRIVATE_ADDRESS: 3309 // Indirect register addressing does not use any offsets. 
3310 default: 3311 return 0; 3312 } 3313 } 3314 3315 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) 3316 3317 // This is a variant of 3318 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), 3319 // 3320 // The normal DAG combiner will do this, but only if the add has one use since 3321 // that would increase the number of instructions. 3322 // 3323 // This prevents us from seeing a constant offset that can be folded into a 3324 // memory instruction's addressing mode. If we know the resulting add offset of 3325 // a pointer can be folded into an addressing offset, we can replace the pointer 3326 // operand with the add of new constant offset. This eliminates one of the uses, 3327 // and may allow the remaining use to also be simplified. 3328 // 3329 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, 3330 unsigned AddrSpace, 3331 DAGCombinerInfo &DCI) const { 3332 SDValue N0 = N->getOperand(0); 3333 SDValue N1 = N->getOperand(1); 3334 3335 if (N0.getOpcode() != ISD::ADD) 3336 return SDValue(); 3337 3338 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1); 3339 if (!CN1) 3340 return SDValue(); 3341 3342 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 3343 if (!CAdd) 3344 return SDValue(); 3345 3346 // If the resulting offset is too large, we can't fold it into the addressing 3347 // mode offset. 
3348 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); 3349 if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget())) 3350 return SDValue(); 3351 3352 SelectionDAG &DAG = DCI.DAG; 3353 SDLoc SL(N); 3354 EVT VT = N->getValueType(0); 3355 3356 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); 3357 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); 3358 3359 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); 3360 } 3361 3362 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) { 3363 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) || 3364 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) || 3365 (Opc == ISD::XOR && Val == 0); 3366 } 3367 3368 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This 3369 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit 3370 // integer combine opportunities since most 64-bit operations are decomposed 3371 // this way. TODO: We won't want this for SALU especially if it is an inline 3372 // immediate. 3373 SDValue SITargetLowering::splitBinaryBitConstantOp( 3374 DAGCombinerInfo &DCI, 3375 const SDLoc &SL, 3376 unsigned Opc, SDValue LHS, 3377 const ConstantSDNode *CRHS) const { 3378 uint64_t Val = CRHS->getZExtValue(); 3379 uint32_t ValLo = Lo_32(Val); 3380 uint32_t ValHi = Hi_32(Val); 3381 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 3382 3383 if ((bitOpWithConstantIsReducible(Opc, ValLo) || 3384 bitOpWithConstantIsReducible(Opc, ValHi)) || 3385 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) { 3386 // If we need to materialize a 64-bit immediate, it will be split up later 3387 // anyway. Avoid creating the harder to understand 64-bit immediate 3388 // materialization. 
3389 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi); 3390 } 3391 3392 return SDValue(); 3393 } 3394 3395 SDValue SITargetLowering::performAndCombine(SDNode *N, 3396 DAGCombinerInfo &DCI) const { 3397 if (DCI.isBeforeLegalize()) 3398 return SDValue(); 3399 3400 SelectionDAG &DAG = DCI.DAG; 3401 EVT VT = N->getValueType(0); 3402 SDValue LHS = N->getOperand(0); 3403 SDValue RHS = N->getOperand(1); 3404 3405 3406 if (VT == MVT::i64) { 3407 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); 3408 if (CRHS) { 3409 if (SDValue Split 3410 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) 3411 return Split; 3412 } 3413 } 3414 3415 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> 3416 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) 3417 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) { 3418 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); 3419 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); 3420 3421 SDValue X = LHS.getOperand(0); 3422 SDValue Y = RHS.getOperand(0); 3423 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) 3424 return SDValue(); 3425 3426 if (LCC == ISD::SETO) { 3427 if (X != LHS.getOperand(1)) 3428 return SDValue(); 3429 3430 if (RCC == ISD::SETUNE) { 3431 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); 3432 if (!C1 || !C1->isInfinity() || C1->isNegative()) 3433 return SDValue(); 3434 3435 const uint32_t Mask = SIInstrFlags::N_NORMAL | 3436 SIInstrFlags::N_SUBNORMAL | 3437 SIInstrFlags::N_ZERO | 3438 SIInstrFlags::P_ZERO | 3439 SIInstrFlags::P_SUBNORMAL | 3440 SIInstrFlags::P_NORMAL; 3441 3442 static_assert(((~(SIInstrFlags::S_NAN | 3443 SIInstrFlags::Q_NAN | 3444 SIInstrFlags::N_INFINITY | 3445 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, 3446 "mask not equal"); 3447 3448 SDLoc DL(N); 3449 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 3450 X, DAG.getConstant(Mask, DL, MVT::i32)); 3451 } 3452 } 3453 } 
3454 3455 return SDValue(); 3456 } 3457 3458 SDValue SITargetLowering::performOrCombine(SDNode *N, 3459 DAGCombinerInfo &DCI) const { 3460 SelectionDAG &DAG = DCI.DAG; 3461 SDValue LHS = N->getOperand(0); 3462 SDValue RHS = N->getOperand(1); 3463 3464 EVT VT = N->getValueType(0); 3465 if (VT == MVT::i1) { 3466 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) 3467 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && 3468 RHS.getOpcode() == AMDGPUISD::FP_CLASS) { 3469 SDValue Src = LHS.getOperand(0); 3470 if (Src != RHS.getOperand(0)) 3471 return SDValue(); 3472 3473 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); 3474 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); 3475 if (!CLHS || !CRHS) 3476 return SDValue(); 3477 3478 // Only 10 bits are used. 3479 static const uint32_t MaxMask = 0x3ff; 3480 3481 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; 3482 SDLoc DL(N); 3483 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 3484 Src, DAG.getConstant(NewMask, DL, MVT::i32)); 3485 } 3486 3487 return SDValue(); 3488 } 3489 3490 if (VT != MVT::i64) 3491 return SDValue(); 3492 3493 // TODO: This could be a generic combine with a predicate for extracting the 3494 // high half of an integer being free. 
3495 3496 // (or i64:x, (zero_extend i32:y)) -> 3497 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x))) 3498 if (LHS.getOpcode() == ISD::ZERO_EXTEND && 3499 RHS.getOpcode() != ISD::ZERO_EXTEND) 3500 std::swap(LHS, RHS); 3501 3502 if (RHS.getOpcode() == ISD::ZERO_EXTEND) { 3503 SDValue ExtSrc = RHS.getOperand(0); 3504 EVT SrcVT = ExtSrc.getValueType(); 3505 if (SrcVT == MVT::i32) { 3506 SDLoc SL(N); 3507 SDValue LowLHS, HiBits; 3508 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG); 3509 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc); 3510 3511 DCI.AddToWorklist(LowOr.getNode()); 3512 DCI.AddToWorklist(HiBits.getNode()); 3513 3514 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 3515 LowOr, HiBits); 3516 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 3517 } 3518 } 3519 3520 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 3521 if (CRHS) { 3522 if (SDValue Split 3523 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS)) 3524 return Split; 3525 } 3526 3527 return SDValue(); 3528 } 3529 3530 SDValue SITargetLowering::performXorCombine(SDNode *N, 3531 DAGCombinerInfo &DCI) const { 3532 EVT VT = N->getValueType(0); 3533 if (VT != MVT::i64) 3534 return SDValue(); 3535 3536 SDValue LHS = N->getOperand(0); 3537 SDValue RHS = N->getOperand(1); 3538 3539 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); 3540 if (CRHS) { 3541 if (SDValue Split 3542 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS)) 3543 return Split; 3544 } 3545 3546 return SDValue(); 3547 } 3548 3549 SDValue SITargetLowering::performClassCombine(SDNode *N, 3550 DAGCombinerInfo &DCI) const { 3551 SelectionDAG &DAG = DCI.DAG; 3552 SDValue Mask = N->getOperand(1); 3553 3554 // fp_class x, 0 -> false 3555 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { 3556 if (CMask->isNullValue()) 3557 return DAG.getConstant(0, SDLoc(N), MVT::i1); 3558 } 3559 3560 if (N->getOperand(0).isUndef()) 
3561 return DAG.getUNDEF(MVT::i1); 3562 3563 return SDValue(); 3564 } 3565 3566 // Constant fold canonicalize. 3567 SDValue SITargetLowering::performFCanonicalizeCombine( 3568 SDNode *N, 3569 DAGCombinerInfo &DCI) const { 3570 ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); 3571 if (!CFP) 3572 return SDValue(); 3573 3574 SelectionDAG &DAG = DCI.DAG; 3575 const APFloat &C = CFP->getValueAPF(); 3576 3577 // Flush denormals to 0 if not enabled. 3578 if (C.isDenormal()) { 3579 EVT VT = N->getValueType(0); 3580 if (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) 3581 return DAG.getConstantFP(0.0, SDLoc(N), VT); 3582 3583 if (VT == MVT::f64 && !Subtarget->hasFP64Denormals()) 3584 return DAG.getConstantFP(0.0, SDLoc(N), VT); 3585 } 3586 3587 if (C.isNaN()) { 3588 EVT VT = N->getValueType(0); 3589 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics()); 3590 if (C.isSignaling()) { 3591 // Quiet a signaling NaN. 3592 return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); 3593 } 3594 3595 // Make sure it is the canonical NaN bitpattern. 3596 // 3597 // TODO: Can we use -1 as the canonical NaN value since it's an inline 3598 // immediate? 
3599 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt()) 3600 return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); 3601 } 3602 3603 return SDValue(CFP, 0); 3604 } 3605 3606 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { 3607 switch (Opc) { 3608 case ISD::FMAXNUM: 3609 return AMDGPUISD::FMAX3; 3610 case ISD::SMAX: 3611 return AMDGPUISD::SMAX3; 3612 case ISD::UMAX: 3613 return AMDGPUISD::UMAX3; 3614 case ISD::FMINNUM: 3615 return AMDGPUISD::FMIN3; 3616 case ISD::SMIN: 3617 return AMDGPUISD::SMIN3; 3618 case ISD::UMIN: 3619 return AMDGPUISD::UMIN3; 3620 default: 3621 llvm_unreachable("Not a min/max opcode"); 3622 } 3623 } 3624 3625 static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, 3626 SDValue Op0, SDValue Op1, bool Signed) { 3627 ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1); 3628 if (!K1) 3629 return SDValue(); 3630 3631 ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1)); 3632 if (!K0) 3633 return SDValue(); 3634 3635 if (Signed) { 3636 if (K0->getAPIntValue().sge(K1->getAPIntValue())) 3637 return SDValue(); 3638 } else { 3639 if (K0->getAPIntValue().uge(K1->getAPIntValue())) 3640 return SDValue(); 3641 } 3642 3643 EVT VT = K0->getValueType(0); 3644 3645 MVT NVT = MVT::i32; 3646 unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3647 3648 SDValue Tmp1, Tmp2, Tmp3; 3649 Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); 3650 Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); 3651 Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); 3652 3653 if (VT == MVT::i16) { 3654 Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT, 3655 Tmp1, Tmp2, Tmp3); 3656 3657 return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1); 3658 } else 3659 return DAG.getNode(Signed ? 
AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, 3660 Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); 3661 } 3662 3663 static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { 3664 if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) 3665 return true; 3666 3667 return DAG.isKnownNeverNaN(Op); 3668 } 3669 3670 static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, 3671 SDValue Op0, SDValue Op1) { 3672 ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1); 3673 if (!K1) 3674 return SDValue(); 3675 3676 ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1)); 3677 if (!K0) 3678 return SDValue(); 3679 3680 // Ordered >= (although NaN inputs should have folded away by now). 3681 APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF()); 3682 if (Cmp == APFloat::cmpGreaterThan) 3683 return SDValue(); 3684 3685 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a 3686 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then 3687 // give the other result, which is different from med3 with a NaN input. 3688 SDValue Var = Op0.getOperand(0); 3689 if (!isKnownNeverSNan(DAG, Var)) 3690 return SDValue(); 3691 3692 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), 3693 Var, SDValue(K0, 0), SDValue(K1, 0)); 3694 } 3695 3696 SDValue SITargetLowering::performMinMaxCombine(SDNode *N, 3697 DAGCombinerInfo &DCI) const { 3698 SelectionDAG &DAG = DCI.DAG; 3699 3700 unsigned Opc = N->getOpcode(); 3701 SDValue Op0 = N->getOperand(0); 3702 SDValue Op1 = N->getOperand(1); 3703 3704 // Only do this if the inner op has one use since this will just increases 3705 // register pressure for no benefit. 
3706 3707 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) { 3708 // max(max(a, b), c) -> max3(a, b, c) 3709 // min(min(a, b), c) -> min3(a, b, c) 3710 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { 3711 SDLoc DL(N); 3712 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 3713 DL, 3714 N->getValueType(0), 3715 Op0.getOperand(0), 3716 Op0.getOperand(1), 3717 Op1); 3718 } 3719 3720 // Try commuted. 3721 // max(a, max(b, c)) -> max3(a, b, c) 3722 // min(a, min(b, c)) -> min3(a, b, c) 3723 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { 3724 SDLoc DL(N); 3725 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), 3726 DL, 3727 N->getValueType(0), 3728 Op0, 3729 Op1.getOperand(0), 3730 Op1.getOperand(1)); 3731 } 3732 } 3733 3734 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) 3735 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { 3736 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true)) 3737 return Med3; 3738 } 3739 3740 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { 3741 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false)) 3742 return Med3; 3743 } 3744 3745 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) 3746 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || 3747 (Opc == AMDGPUISD::FMIN_LEGACY && 3748 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && 3749 N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) { 3750 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) 3751 return Res; 3752 } 3753 3754 return SDValue(); 3755 } 3756 3757 SDValue SITargetLowering::performSetCCCombine(SDNode *N, 3758 DAGCombinerInfo &DCI) const { 3759 SelectionDAG &DAG = DCI.DAG; 3760 SDLoc SL(N); 3761 3762 SDValue LHS = N->getOperand(0); 3763 SDValue RHS = N->getOperand(1); 3764 EVT VT = LHS.getValueType(); 3765 3766 if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() && 3767 VT != MVT::f16)) 3768 return 
SDValue(); 3769 3770 // Match isinf pattern 3771 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) 3772 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 3773 if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { 3774 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 3775 if (!CRHS) 3776 return SDValue(); 3777 3778 const APFloat &APF = CRHS->getValueAPF(); 3779 if (APF.isInfinity() && !APF.isNegative()) { 3780 unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; 3781 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), 3782 DAG.getConstant(Mask, SL, MVT::i32)); 3783 } 3784 } 3785 3786 return SDValue(); 3787 } 3788 3789 SDValue SITargetLowering::PerformDAGCombine(SDNode *N, 3790 DAGCombinerInfo &DCI) const { 3791 SelectionDAG &DAG = DCI.DAG; 3792 SDLoc DL(N); 3793 3794 switch (N->getOpcode()) { 3795 default: 3796 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 3797 case ISD::SETCC: 3798 return performSetCCCombine(N, DCI); 3799 case ISD::FMAXNUM: 3800 case ISD::FMINNUM: 3801 case ISD::SMAX: 3802 case ISD::SMIN: 3803 case ISD::UMAX: 3804 case ISD::UMIN: 3805 case AMDGPUISD::FMIN_LEGACY: 3806 case AMDGPUISD::FMAX_LEGACY: { 3807 if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && 3808 N->getValueType(0) != MVT::f64 && 3809 getTargetMachine().getOptLevel() > CodeGenOpt::None) 3810 return performMinMaxCombine(N, DCI); 3811 break; 3812 } 3813 3814 case AMDGPUISD::CVT_F32_UBYTE0: 3815 case AMDGPUISD::CVT_F32_UBYTE1: 3816 case AMDGPUISD::CVT_F32_UBYTE2: 3817 case AMDGPUISD::CVT_F32_UBYTE3: { 3818 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; 3819 3820 SDValue Src = N->getOperand(0); 3821 SDValue Srl = N->getOperand(0); 3822 if (Srl.getOpcode() == ISD::ZERO_EXTEND) 3823 Srl = Srl.getOperand(0); 3824 3825 // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero. 
3826 if (Srl.getOpcode() == ISD::SRL) { 3827 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x 3828 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x 3829 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x 3830 3831 if (const ConstantSDNode *C = 3832 dyn_cast<ConstantSDNode>(Srl.getOperand(1))) { 3833 Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)), 3834 EVT(MVT::i32)); 3835 3836 unsigned SrcOffset = C->getZExtValue() + 8 * Offset; 3837 if (SrcOffset < 32 && SrcOffset % 8 == 0) { 3838 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, DL, 3839 MVT::f32, Srl); 3840 } 3841 } 3842 } 3843 3844 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); 3845 3846 APInt KnownZero, KnownOne; 3847 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 3848 !DCI.isBeforeLegalizeOps()); 3849 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 3850 if (TLO.ShrinkDemandedConstant(Src, Demanded) || 3851 TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { 3852 DCI.CommitTargetLoweringOpt(TLO); 3853 } 3854 3855 break; 3856 } 3857 case ISD::SINT_TO_FP: 3858 case ISD::UINT_TO_FP: { 3859 return performUCharToFloatCombine(N, DCI); 3860 } 3861 case ISD::FADD: { 3862 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 3863 break; 3864 3865 EVT VT = N->getValueType(0); 3866 if (VT != MVT::f32) 3867 break; 3868 3869 // Only do this if we are not trying to support denormals. v_mad_f32 does 3870 // not support denormals ever. 3871 if (Subtarget->hasFP32Denormals()) 3872 break; 3873 3874 SDValue LHS = N->getOperand(0); 3875 SDValue RHS = N->getOperand(1); 3876 3877 // These should really be instruction patterns, but writing patterns with 3878 // source modiifiers is a pain. 
3879 3880 // fadd (fadd (a, a), b) -> mad 2.0, a, b 3881 if (LHS.getOpcode() == ISD::FADD) { 3882 SDValue A = LHS.getOperand(0); 3883 if (A == LHS.getOperand(1)) { 3884 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 3885 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); 3886 } 3887 } 3888 3889 // fadd (b, fadd (a, a)) -> mad 2.0, a, b 3890 if (RHS.getOpcode() == ISD::FADD) { 3891 SDValue A = RHS.getOperand(0); 3892 if (A == RHS.getOperand(1)) { 3893 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 3894 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); 3895 } 3896 } 3897 3898 return SDValue(); 3899 } 3900 case ISD::FSUB: { 3901 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 3902 break; 3903 3904 EVT VT = N->getValueType(0); 3905 3906 // Try to get the fneg to fold into the source modifier. This undoes generic 3907 // DAG combines and folds them into the mad. 3908 // 3909 // Only do this if we are not trying to support denormals. v_mad_f32 does 3910 // not support denormals ever. 
3911 if (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) { 3912 SDValue LHS = N->getOperand(0); 3913 SDValue RHS = N->getOperand(1); 3914 if (LHS.getOpcode() == ISD::FADD) { 3915 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) 3916 3917 SDValue A = LHS.getOperand(0); 3918 if (A == LHS.getOperand(1)) { 3919 const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); 3920 SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); 3921 3922 return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); 3923 } 3924 } 3925 3926 if (RHS.getOpcode() == ISD::FADD) { 3927 // (fsub c, (fadd a, a)) -> mad -2.0, a, c 3928 3929 SDValue A = RHS.getOperand(0); 3930 if (A == RHS.getOperand(1)) { 3931 const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32); 3932 return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); 3933 } 3934 } 3935 3936 return SDValue(); 3937 } 3938 3939 break; 3940 } 3941 case ISD::LOAD: 3942 case ISD::STORE: 3943 case ISD::ATOMIC_LOAD: 3944 case ISD::ATOMIC_STORE: 3945 case ISD::ATOMIC_CMP_SWAP: 3946 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: 3947 case ISD::ATOMIC_SWAP: 3948 case ISD::ATOMIC_LOAD_ADD: 3949 case ISD::ATOMIC_LOAD_SUB: 3950 case ISD::ATOMIC_LOAD_AND: 3951 case ISD::ATOMIC_LOAD_OR: 3952 case ISD::ATOMIC_LOAD_XOR: 3953 case ISD::ATOMIC_LOAD_NAND: 3954 case ISD::ATOMIC_LOAD_MIN: 3955 case ISD::ATOMIC_LOAD_MAX: 3956 case ISD::ATOMIC_LOAD_UMIN: 3957 case ISD::ATOMIC_LOAD_UMAX: 3958 case AMDGPUISD::ATOMIC_INC: 3959 case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics. 3960 if (DCI.isBeforeLegalize()) 3961 break; 3962 3963 MemSDNode *MemNode = cast<MemSDNode>(N); 3964 SDValue Ptr = MemNode->getBasePtr(); 3965 3966 // TODO: We could also do this for multiplies. 
3967 unsigned AS = MemNode->getAddressSpace(); 3968 if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { 3969 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); 3970 if (NewPtr) { 3971 SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end()); 3972 3973 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr; 3974 return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); 3975 } 3976 } 3977 break; 3978 } 3979 case ISD::AND: 3980 return performAndCombine(N, DCI); 3981 case ISD::OR: 3982 return performOrCombine(N, DCI); 3983 case ISD::XOR: 3984 return performXorCombine(N, DCI); 3985 case AMDGPUISD::FP_CLASS: 3986 return performClassCombine(N, DCI); 3987 case ISD::FCANONICALIZE: 3988 return performFCanonicalizeCombine(N, DCI); 3989 case AMDGPUISD::FRACT: 3990 case AMDGPUISD::RCP: 3991 case AMDGPUISD::RSQ: 3992 case AMDGPUISD::RCP_LEGACY: 3993 case AMDGPUISD::RSQ_LEGACY: 3994 case AMDGPUISD::RSQ_CLAMP: 3995 case AMDGPUISD::LDEXP: { 3996 SDValue Src = N->getOperand(0); 3997 if (Src.isUndef()) 3998 return Src; 3999 break; 4000 } 4001 } 4002 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 4003 } 4004 4005 /// \brief Helper function for adjustWritemask 4006 static unsigned SubIdx2Lane(unsigned Idx) { 4007 switch (Idx) { 4008 default: return 0; 4009 case AMDGPU::sub0: return 0; 4010 case AMDGPU::sub1: return 1; 4011 case AMDGPU::sub2: return 2; 4012 case AMDGPU::sub3: return 3; 4013 } 4014 } 4015 4016 /// \brief Adjust the writemask of MIMG instructions 4017 void SITargetLowering::adjustWritemask(MachineSDNode *&Node, 4018 SelectionDAG &DAG) const { 4019 SDNode *Users[4] = { }; 4020 unsigned Lane = 0; 4021 unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 
2 : 3; 4022 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); 4023 unsigned NewDmask = 0; 4024 4025 // Try to figure out the used register components 4026 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); 4027 I != E; ++I) { 4028 4029 // Abort if we can't understand the usage 4030 if (!I->isMachineOpcode() || 4031 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) 4032 return; 4033 4034 // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. 4035 // Note that subregs are packed, i.e. Lane==0 is the first bit set 4036 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit 4037 // set, etc. 4038 Lane = SubIdx2Lane(I->getConstantOperandVal(1)); 4039 4040 // Set which texture component corresponds to the lane. 4041 unsigned Comp; 4042 for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { 4043 assert(Dmask); 4044 Comp = countTrailingZeros(Dmask); 4045 Dmask &= ~(1 << Comp); 4046 } 4047 4048 // Abort if we have more than one user per component 4049 if (Users[Lane]) 4050 return; 4051 4052 Users[Lane] = *I; 4053 NewDmask |= 1 << Comp; 4054 } 4055 4056 // Abort if there's no change 4057 if (NewDmask == OldDmask) 4058 return; 4059 4060 // Adjust the writemask in the node 4061 std::vector<SDValue> Ops; 4062 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx); 4063 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); 4064 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end()); 4065 Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); 4066 4067 // If we only got one lane, replace it with a copy 4068 // (if NewDmask has only one bit set...) 
4069 if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { 4070 SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), 4071 MVT::i32); 4072 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 4073 SDLoc(), Users[Lane]->getValueType(0), 4074 SDValue(Node, 0), RC); 4075 DAG.ReplaceAllUsesWith(Users[Lane], Copy); 4076 return; 4077 } 4078 4079 // Update the users of the node with the new indices 4080 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { 4081 4082 SDNode *User = Users[i]; 4083 if (!User) 4084 continue; 4085 4086 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); 4087 DAG.UpdateNodeOperands(User, User->getOperand(0), Op); 4088 4089 switch (Idx) { 4090 default: break; 4091 case AMDGPU::sub0: Idx = AMDGPU::sub1; break; 4092 case AMDGPU::sub1: Idx = AMDGPU::sub2; break; 4093 case AMDGPU::sub2: Idx = AMDGPU::sub3; break; 4094 } 4095 } 4096 } 4097 4098 static bool isFrameIndexOp(SDValue Op) { 4099 if (Op.getOpcode() == ISD::AssertZext) 4100 Op = Op.getOperand(0); 4101 4102 return isa<FrameIndexSDNode>(Op); 4103 } 4104 4105 /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) 4106 /// with frame index operands. 4107 /// LLVM assumes that inputs are to these instructions are registers. 4108 void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, 4109 SelectionDAG &DAG) const { 4110 4111 SmallVector<SDValue, 8> Ops; 4112 for (unsigned i = 0; i < Node->getNumOperands(); ++i) { 4113 if (!isFrameIndexOp(Node->getOperand(i))) { 4114 Ops.push_back(Node->getOperand(i)); 4115 continue; 4116 } 4117 4118 SDLoc DL(Node); 4119 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, 4120 Node->getOperand(i).getValueType(), 4121 Node->getOperand(i)), 0)); 4122 } 4123 4124 DAG.UpdateNodeOperands(Node, Ops); 4125 } 4126 4127 /// \brief Fold the instructions after selecting them. 
4128 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, 4129 SelectionDAG &DAG) const { 4130 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4131 unsigned Opcode = Node->getMachineOpcode(); 4132 4133 if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && 4134 !TII->isGather4(Opcode)) 4135 adjustWritemask(Node, DAG); 4136 4137 if (Opcode == AMDGPU::INSERT_SUBREG || 4138 Opcode == AMDGPU::REG_SEQUENCE) { 4139 legalizeTargetIndependentNode(Node, DAG); 4140 return Node; 4141 } 4142 return Node; 4143 } 4144 4145 /// \brief Assign the register class depending on the number of 4146 /// bits set in the writemask 4147 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 4148 SDNode *Node) const { 4149 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4150 4151 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 4152 4153 if (TII->isVOP3(MI.getOpcode())) { 4154 // Make sure constant bus requirements are respected. 4155 TII->legalizeOperandsVOP3(MRI, MI); 4156 return; 4157 } 4158 4159 if (TII->isMIMG(MI)) { 4160 unsigned VReg = MI.getOperand(0).getReg(); 4161 const TargetRegisterClass *RC = MRI.getRegClass(VReg); 4162 // TODO: Need mapping tables to handle other cases (register classes). 4163 if (RC != &AMDGPU::VReg_128RegClass) 4164 return; 4165 4166 unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4; 4167 unsigned Writemask = MI.getOperand(DmaskIdx).getImm(); 4168 unsigned BitsSet = 0; 4169 for (unsigned i = 0; i < 4; ++i) 4170 BitsSet += Writemask & (1 << i) ? 1 : 0; 4171 switch (BitsSet) { 4172 default: return; 4173 case 1: RC = &AMDGPU::VGPR_32RegClass; break; 4174 case 2: RC = &AMDGPU::VReg_64RegClass; break; 4175 case 3: RC = &AMDGPU::VReg_96RegClass; break; 4176 } 4177 4178 unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet); 4179 MI.setDesc(TII->get(NewOpcode)); 4180 MRI.setRegClass(VReg, RC); 4181 return; 4182 } 4183 4184 // Replace unused atomics with the no return version. 
4185 int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode()); 4186 if (NoRetAtomicOp != -1) { 4187 if (!Node->hasAnyUseOfValue(0)) { 4188 MI.setDesc(TII->get(NoRetAtomicOp)); 4189 MI.RemoveOperand(0); 4190 return; 4191 } 4192 4193 // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg 4194 // instruction, because the return type of these instructions is a vec2 of 4195 // the memory type, so it can be tied to the input operand. 4196 // This means these instructions always have a use, so we need to add a 4197 // special case to check if the atomic has only one extract_subreg use, 4198 // which itself has no uses. 4199 if ((Node->hasNUsesOfValue(1, 0) && 4200 Node->use_begin()->isMachineOpcode() && 4201 Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG && 4202 !Node->use_begin()->hasAnyUseOfValue(0))) { 4203 unsigned Def = MI.getOperand(0).getReg(); 4204 4205 // Change this into a noret atomic. 4206 MI.setDesc(TII->get(NoRetAtomicOp)); 4207 MI.RemoveOperand(0); 4208 4209 // If we only remove the def operand from the atomic instruction, the 4210 // extract_subreg will be left with a use of a vreg without a def. 4211 // So we need to insert an implicit_def to avoid machine verifier 4212 // errors. 4213 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), 4214 TII->get(AMDGPU::IMPLICIT_DEF), Def); 4215 } 4216 return; 4217 } 4218 } 4219 4220 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, 4221 uint64_t Val) { 4222 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); 4223 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); 4224 } 4225 4226 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, 4227 const SDLoc &DL, 4228 SDValue Ptr) const { 4229 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4230 4231 // Build the half of the subregister with the constants before building the 4232 // full 128-bit register. 
If we are building multiple resource descriptors, 4233 // this will allow CSEing of the 2-component register. 4234 const SDValue Ops0[] = { 4235 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), 4236 buildSMovImm32(DAG, DL, 0), 4237 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 4238 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), 4239 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) 4240 }; 4241 4242 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, 4243 MVT::v2i32, Ops0), 0); 4244 4245 // Combine the constants and the pointer. 4246 const SDValue Ops1[] = { 4247 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), 4248 Ptr, 4249 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), 4250 SubRegHi, 4251 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) 4252 }; 4253 4254 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); 4255 } 4256 4257 /// \brief Return a resource descriptor with the 'Add TID' bit enabled 4258 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] 4259 /// of the resource descriptor) to create an offset, which is added to 4260 /// the resource pointer. 
4261 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, 4262 SDValue Ptr, uint32_t RsrcDword1, 4263 uint64_t RsrcDword2And3) const { 4264 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); 4265 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); 4266 if (RsrcDword1) { 4267 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, 4268 DAG.getConstant(RsrcDword1, DL, MVT::i32)), 4269 0); 4270 } 4271 4272 SDValue DataLo = buildSMovImm32(DAG, DL, 4273 RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); 4274 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); 4275 4276 const SDValue Ops[] = { 4277 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), 4278 PtrLo, 4279 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 4280 PtrHi, 4281 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), 4282 DataLo, 4283 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), 4284 DataHi, 4285 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) 4286 }; 4287 4288 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); 4289 } 4290 4291 SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 4292 const TargetRegisterClass *RC, 4293 unsigned Reg, EVT VT) const { 4294 SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); 4295 4296 return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), 4297 cast<RegisterSDNode>(VReg)->getReg(), VT); 4298 } 4299 4300 //===----------------------------------------------------------------------===// 4301 // SI Inline Assembly Support 4302 //===----------------------------------------------------------------------===// 4303 4304 std::pair<unsigned, const TargetRegisterClass *> 4305 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 4306 StringRef Constraint, 4307 MVT VT) const { 4308 if (!isTypeLegal(VT)) 4309 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, 
VT); 4310 4311 if (Constraint.size() == 1) { 4312 switch (Constraint[0]) { 4313 case 's': 4314 case 'r': 4315 switch (VT.getSizeInBits()) { 4316 default: 4317 return std::make_pair(0U, nullptr); 4318 case 32: 4319 return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass); 4320 case 64: 4321 return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); 4322 case 128: 4323 return std::make_pair(0U, &AMDGPU::SReg_128RegClass); 4324 case 256: 4325 return std::make_pair(0U, &AMDGPU::SReg_256RegClass); 4326 } 4327 4328 case 'v': 4329 switch (VT.getSizeInBits()) { 4330 default: 4331 return std::make_pair(0U, nullptr); 4332 case 32: 4333 return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); 4334 case 64: 4335 return std::make_pair(0U, &AMDGPU::VReg_64RegClass); 4336 case 96: 4337 return std::make_pair(0U, &AMDGPU::VReg_96RegClass); 4338 case 128: 4339 return std::make_pair(0U, &AMDGPU::VReg_128RegClass); 4340 case 256: 4341 return std::make_pair(0U, &AMDGPU::VReg_256RegClass); 4342 case 512: 4343 return std::make_pair(0U, &AMDGPU::VReg_512RegClass); 4344 } 4345 } 4346 } 4347 4348 if (Constraint.size() > 1) { 4349 const TargetRegisterClass *RC = nullptr; 4350 if (Constraint[1] == 'v') { 4351 RC = &AMDGPU::VGPR_32RegClass; 4352 } else if (Constraint[1] == 's') { 4353 RC = &AMDGPU::SGPR_32RegClass; 4354 } 4355 4356 if (RC) { 4357 uint32_t Idx; 4358 bool Failed = Constraint.substr(2).getAsInteger(10, Idx); 4359 if (!Failed && Idx < RC->getNumRegs()) 4360 return std::make_pair(RC->getRegister(Idx), RC); 4361 } 4362 } 4363 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 4364 } 4365 4366 SITargetLowering::ConstraintType 4367 SITargetLowering::getConstraintType(StringRef Constraint) const { 4368 if (Constraint.size() == 1) { 4369 switch (Constraint[0]) { 4370 default: break; 4371 case 's': 4372 case 'v': 4373 return C_RegisterClass; 4374 } 4375 } 4376 return TargetLowering::getConstraintType(Constraint); 4377 } 4378