1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Custom DAG lowering for SI 12 // 13 //===----------------------------------------------------------------------===// 14 15 #ifdef _MSC_VER 16 // Provide M_PI. 17 #define _USE_MATH_DEFINES 18 #endif 19 20 #include "SIISelLowering.h" 21 #include "AMDGPU.h" 22 #include "AMDGPUIntrinsicInfo.h" 23 #include "AMDGPUSubtarget.h" 24 #include "AMDGPUTargetMachine.h" 25 #include "SIDefines.h" 26 #include "SIInstrInfo.h" 27 #include "SIMachineFunctionInfo.h" 28 #include "SIRegisterInfo.h" 29 #include "Utils/AMDGPUBaseInfo.h" 30 #include "llvm/ADT/APFloat.h" 31 #include "llvm/ADT/APInt.h" 32 #include "llvm/ADT/ArrayRef.h" 33 #include "llvm/ADT/BitVector.h" 34 #include "llvm/ADT/SmallVector.h" 35 #include "llvm/ADT/Statistic.h" 36 #include "llvm/ADT/StringRef.h" 37 #include "llvm/ADT/StringSwitch.h" 38 #include "llvm/ADT/Twine.h" 39 #include "llvm/CodeGen/Analysis.h" 40 #include "llvm/CodeGen/CallingConvLower.h" 41 #include "llvm/CodeGen/DAGCombine.h" 42 #include "llvm/CodeGen/ISDOpcodes.h" 43 #include "llvm/CodeGen/MachineBasicBlock.h" 44 #include "llvm/CodeGen/MachineFrameInfo.h" 45 #include "llvm/CodeGen/MachineFunction.h" 46 #include "llvm/CodeGen/MachineInstr.h" 47 #include "llvm/CodeGen/MachineInstrBuilder.h" 48 #include "llvm/CodeGen/MachineMemOperand.h" 49 #include "llvm/CodeGen/MachineModuleInfo.h" 50 #include "llvm/CodeGen/MachineOperand.h" 51 #include "llvm/CodeGen/MachineRegisterInfo.h" 52 #include "llvm/CodeGen/SelectionDAG.h" 53 #include "llvm/CodeGen/SelectionDAGNodes.h" 54 #include "llvm/CodeGen/TargetCallingConv.h" 55 #include "llvm/CodeGen/TargetRegisterInfo.h" 56 #include "llvm/CodeGen/ValueTypes.h" 57 #include "llvm/IR/Constants.h" 58 #include "llvm/IR/DataLayout.h" 59 #include "llvm/IR/DebugLoc.h" 60 #include "llvm/IR/DerivedTypes.h" 61 #include "llvm/IR/DiagnosticInfo.h" 62 #include "llvm/IR/Function.h" 63 #include "llvm/IR/GlobalValue.h" 64 #include "llvm/IR/InstrTypes.h" 65 #include "llvm/IR/Instruction.h" 66 #include "llvm/IR/Instructions.h" 67 #include "llvm/IR/IntrinsicInst.h" 68 #include "llvm/IR/Type.h" 69 #include "llvm/Support/Casting.h" 70 #include "llvm/Support/CodeGen.h" 71 #include "llvm/Support/CommandLine.h" 72 #include "llvm/Support/Compiler.h" 73 #include "llvm/Support/ErrorHandling.h" 74 #include "llvm/Support/KnownBits.h" 75 #include "llvm/Support/MachineValueType.h" 76 #include "llvm/Support/MathExtras.h" 77 #include "llvm/Target/TargetOptions.h" 78 #include <cassert> 79 #include <cmath> 80 #include <cstdint> 81 #include <iterator> 82 #include <tuple> 83 #include <utility> 84 #include <vector> 85 86 using namespace llvm; 87 88 #define DEBUG_TYPE "si-lower" 89 90 STATISTIC(NumTailCalls, "Number of tail calls"); 91 92 static cl::opt<bool> EnableVGPRIndexMode( 93 "amdgpu-vgpr-index-mode", 94 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), 95 cl::init(false)); 96 97 static cl::opt<unsigned> AssumeFrameIndexHighZeroBits( 98 "amdgpu-frame-index-zero-bits", 99 cl::desc("High bits of frame index assumed to be zero"), 100 cl::init(5), 101 cl::ReallyHidden); 102 103 static unsigned findFirstFreeSGPR(CCState &CCInfo) { 104 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); 105 
for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { 106 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) { 107 return AMDGPU::SGPR0 + Reg; 108 } 109 } 110 llvm_unreachable("Cannot allocate sgpr"); 111 } 112 113 SITargetLowering::SITargetLowering(const TargetMachine &TM, 114 const SISubtarget &STI) 115 : AMDGPUTargetLowering(TM, STI) { 116 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); 117 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); 118 119 addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass); 120 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); 121 122 addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); 123 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); 124 addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); 125 126 addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass); 127 addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass); 128 129 addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); 130 addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); 131 132 addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); 133 addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); 134 135 addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); 136 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); 137 138 if (Subtarget->has16BitInsts()) { 139 addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass); 140 addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass); 141 } 142 143 if (Subtarget->hasVOP3PInsts()) { 144 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass); 145 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass); 146 } 147 148 computeRegisterProperties(STI.getRegisterInfo()); 149 150 // We need to custom lower vector stores from local memory 151 setOperationAction(ISD::LOAD, MVT::v2i32, Custom); 152 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 153 setOperationAction(ISD::LOAD, MVT::v8i32, Custom); 154 setOperationAction(ISD::LOAD, MVT::v16i32, Custom); 155 setOperationAction(ISD::LOAD, MVT::i1, Custom); 156 157 setOperationAction(ISD::STORE, MVT::v2i32, Custom); 158 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 159 setOperationAction(ISD::STORE, MVT::v8i32, Custom); 160 setOperationAction(ISD::STORE, MVT::v16i32, Custom); 161 setOperationAction(ISD::STORE, MVT::i1, Custom); 162 163 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); 164 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); 165 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); 166 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); 167 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand); 168 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand); 169 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand); 170 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand); 171 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); 172 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand); 173 174 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 175 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 176 setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); 177 178 setOperationAction(ISD::SELECT, MVT::i1, Promote); 179 setOperationAction(ISD::SELECT, MVT::i64, Custom); 180 setOperationAction(ISD::SELECT, MVT::f64, Promote); 181 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); 182 183 setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); 184 setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); 185 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); 186 setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); 187 setOperationAction(ISD::SELECT_CC, 
MVT::i1, Expand); 188 189 setOperationAction(ISD::SETCC, MVT::i1, Promote); 190 setOperationAction(ISD::SETCC, MVT::v2i1, Expand); 191 setOperationAction(ISD::SETCC, MVT::v4i1, Expand); 192 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32); 193 194 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); 195 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); 196 197 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); 198 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); 199 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); 200 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); 201 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); 202 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); 203 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); 204 205 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 206 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); 207 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); 208 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); 209 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); 210 211 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); 212 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); 213 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); 214 215 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 216 setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); 217 setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); 218 setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); 219 220 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 221 setOperationAction(ISD::BR_CC, MVT::i1, Expand); 222 setOperationAction(ISD::BR_CC, MVT::i32, Expand); 223 setOperationAction(ISD::BR_CC, MVT::i64, Expand); 224 setOperationAction(ISD::BR_CC, MVT::f32, Expand); 225 setOperationAction(ISD::BR_CC, MVT::f64, Expand); 226 227 setOperationAction(ISD::UADDO, MVT::i32, Legal); 228 setOperationAction(ISD::USUBO, MVT::i32, Legal); 229 230 setOperationAction(ISD::ADDCARRY, MVT::i32, Legal); 231 setOperationAction(ISD::SUBCARRY, MVT::i32, Legal); 232 233 #if 0 234 setOperationAction(ISD::ADDCARRY, MVT::i64, Legal); 235 setOperationAction(ISD::SUBCARRY, MVT::i64, Legal); 236 #endif 237 238 //setOperationAction(ISD::ADDC, MVT::i64, Expand); 239 //setOperationAction(ISD::SUBC, MVT::i64, Expand); 240 241 // We only support LOAD/STORE and vector manipulation ops for vectors 242 // with > 4 elements. 243 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, 244 MVT::v2i64, MVT::v2f64}) { 245 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { 246 switch (Op) { 247 case ISD::LOAD: 248 case ISD::STORE: 249 case ISD::BUILD_VECTOR: 250 case ISD::BITCAST: 251 case ISD::EXTRACT_VECTOR_ELT: 252 case ISD::INSERT_VECTOR_ELT: 253 case ISD::INSERT_SUBVECTOR: 254 case ISD::EXTRACT_SUBVECTOR: 255 case ISD::SCALAR_TO_VECTOR: 256 break; 257 case ISD::CONCAT_VECTORS: 258 setOperationAction(Op, VT, Custom); 259 break; 260 default: 261 setOperationAction(Op, VT, Expand); 262 break; 263 } 264 } 265 } 266 267 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that 268 // is expanded to avoid having two separate loops in case the index is a VGPR. 269 270 // Most operations are naturally 32-bit vector operations. We only support 271 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. 
272 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { 273 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); 274 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); 275 276 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); 277 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32); 278 279 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); 280 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32); 281 282 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); 283 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); 284 } 285 286 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); 287 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); 288 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); 289 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); 290 291 // Avoid stack access for these. 292 // TODO: Generalize to more vector types. 293 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom); 294 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom); 295 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); 296 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); 297 298 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, 299 // and output demarshalling 300 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); 301 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 302 303 // We can't return success/failure, only the old value, 304 // let LLVM add the comparison 305 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand); 306 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand); 307 308 if (getSubtarget()->hasFlatAddressSpace()) { 309 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); 310 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); 311 } 312 313 setOperationAction(ISD::BSWAP, MVT::i32, Legal); 314 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 315 316 // On SI this is s_memtime and s_memrealtime on VI. 
317 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); 318 setOperationAction(ISD::TRAP, MVT::Other, Custom); 319 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom); 320 321 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 322 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 323 324 if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) { 325 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 326 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 327 setOperationAction(ISD::FRINT, MVT::f64, Legal); 328 } 329 330 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 331 332 setOperationAction(ISD::FSIN, MVT::f32, Custom); 333 setOperationAction(ISD::FCOS, MVT::f32, Custom); 334 setOperationAction(ISD::FDIV, MVT::f32, Custom); 335 setOperationAction(ISD::FDIV, MVT::f64, Custom); 336 337 if (Subtarget->has16BitInsts()) { 338 setOperationAction(ISD::Constant, MVT::i16, Legal); 339 340 setOperationAction(ISD::SMIN, MVT::i16, Legal); 341 setOperationAction(ISD::SMAX, MVT::i16, Legal); 342 343 setOperationAction(ISD::UMIN, MVT::i16, Legal); 344 setOperationAction(ISD::UMAX, MVT::i16, Legal); 345 346 setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote); 347 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32); 348 349 setOperationAction(ISD::ROTR, MVT::i16, Promote); 350 setOperationAction(ISD::ROTL, MVT::i16, Promote); 351 352 setOperationAction(ISD::SDIV, MVT::i16, Promote); 353 setOperationAction(ISD::UDIV, MVT::i16, Promote); 354 setOperationAction(ISD::SREM, MVT::i16, Promote); 355 setOperationAction(ISD::UREM, MVT::i16, Promote); 356 357 setOperationAction(ISD::BSWAP, MVT::i16, Promote); 358 setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); 359 360 setOperationAction(ISD::CTTZ, MVT::i16, Promote); 361 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote); 362 setOperationAction(ISD::CTLZ, MVT::i16, Promote); 363 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote); 364 setOperationAction(ISD::CTPOP, MVT::i16, Promote); 365 366 setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); 367 368 setOperationAction(ISD::BR_CC, MVT::i16, Expand); 369 370 setOperationAction(ISD::LOAD, MVT::i16, Custom); 371 372 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 373 374 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote); 375 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32); 376 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); 377 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); 378 379 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); 380 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); 381 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); 382 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); 383 384 // F16 - Constant Actions. 385 setOperationAction(ISD::ConstantFP, MVT::f16, Legal); 386 387 // F16 - Load/Store Actions. 388 setOperationAction(ISD::LOAD, MVT::f16, Promote); 389 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16); 390 setOperationAction(ISD::STORE, MVT::f16, Promote); 391 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16); 392 393 // F16 - VOP1 Actions. 
394 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); 395 setOperationAction(ISD::FCOS, MVT::f16, Promote); 396 setOperationAction(ISD::FSIN, MVT::f16, Promote); 397 setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote); 398 setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote); 399 setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote); 400 setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote); 401 setOperationAction(ISD::FROUND, MVT::f16, Custom); 402 403 // F16 - VOP2 Actions. 404 setOperationAction(ISD::BR_CC, MVT::f16, Expand); 405 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); 406 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); 407 setOperationAction(ISD::FMINNUM, MVT::f16, Legal); 408 setOperationAction(ISD::FDIV, MVT::f16, Custom); 409 410 // F16 - VOP3 Actions. 411 setOperationAction(ISD::FMA, MVT::f16, Legal); 412 if (!Subtarget->hasFP16Denormals()) 413 setOperationAction(ISD::FMAD, MVT::f16, Legal); 414 } 415 416 if (Subtarget->hasVOP3PInsts()) { 417 for (MVT VT : {MVT::v2i16, MVT::v2f16}) { 418 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { 419 switch (Op) { 420 case ISD::LOAD: 421 case ISD::STORE: 422 case ISD::BUILD_VECTOR: 423 case ISD::BITCAST: 424 case ISD::EXTRACT_VECTOR_ELT: 425 case ISD::INSERT_VECTOR_ELT: 426 case ISD::INSERT_SUBVECTOR: 427 case ISD::EXTRACT_SUBVECTOR: 428 case ISD::SCALAR_TO_VECTOR: 429 break; 430 case ISD::CONCAT_VECTORS: 431 setOperationAction(Op, VT, Custom); 432 break; 433 default: 434 setOperationAction(Op, VT, Expand); 435 break; 436 } 437 } 438 } 439 440 // XXX - Do these do anything? Vector constants turn into build_vector. 441 setOperationAction(ISD::Constant, MVT::v2i16, Legal); 442 setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal); 443 444 setOperationAction(ISD::STORE, MVT::v2i16, Promote); 445 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32); 446 setOperationAction(ISD::STORE, MVT::v2f16, Promote); 447 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32); 448 449 setOperationAction(ISD::LOAD, MVT::v2i16, Promote); 450 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32); 451 setOperationAction(ISD::LOAD, MVT::v2f16, Promote); 452 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32); 453 454 setOperationAction(ISD::AND, MVT::v2i16, Promote); 455 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32); 456 setOperationAction(ISD::OR, MVT::v2i16, Promote); 457 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32); 458 setOperationAction(ISD::XOR, MVT::v2i16, Promote); 459 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32); 460 setOperationAction(ISD::SELECT, MVT::v2i16, Promote); 461 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); 462 setOperationAction(ISD::SELECT, MVT::v2f16, Promote); 463 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); 464 465 setOperationAction(ISD::ADD, MVT::v2i16, Legal); 466 setOperationAction(ISD::SUB, MVT::v2i16, Legal); 467 setOperationAction(ISD::MUL, MVT::v2i16, Legal); 468 setOperationAction(ISD::SHL, MVT::v2i16, Legal); 469 setOperationAction(ISD::SRL, MVT::v2i16, Legal); 470 setOperationAction(ISD::SRA, MVT::v2i16, Legal); 471 setOperationAction(ISD::SMIN, MVT::v2i16, Legal); 472 setOperationAction(ISD::UMIN, MVT::v2i16, Legal); 473 setOperationAction(ISD::SMAX, MVT::v2i16, Legal); 474 setOperationAction(ISD::UMAX, MVT::v2i16, Legal); 475 476 setOperationAction(ISD::FADD, MVT::v2f16, Legal); 477 setOperationAction(ISD::FNEG, MVT::v2f16, Legal); 478 setOperationAction(ISD::FMUL, MVT::v2f16, Legal); 479 setOperationAction(ISD::FMA, MVT::v2f16, Legal); 480 
setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal); 481 setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal); 482 483 // This isn't really legal, but this avoids the legalizer unrolling it (and 484 // allows matching fneg (fabs x) patterns) 485 setOperationAction(ISD::FABS, MVT::v2f16, Legal); 486 487 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); 488 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); 489 490 setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand); 491 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); 492 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); 493 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); 494 } else { 495 setOperationAction(ISD::SELECT, MVT::v2i16, Custom); 496 setOperationAction(ISD::SELECT, MVT::v2f16, Custom); 497 } 498 499 for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) { 500 setOperationAction(ISD::SELECT, VT, Custom); 501 } 502 503 setTargetDAGCombine(ISD::ADD); 504 setTargetDAGCombine(ISD::ADDCARRY); 505 setTargetDAGCombine(ISD::SUB); 506 setTargetDAGCombine(ISD::SUBCARRY); 507 setTargetDAGCombine(ISD::FADD); 508 setTargetDAGCombine(ISD::FSUB); 509 setTargetDAGCombine(ISD::FMINNUM); 510 setTargetDAGCombine(ISD::FMAXNUM); 511 setTargetDAGCombine(ISD::SMIN); 512 setTargetDAGCombine(ISD::SMAX); 513 setTargetDAGCombine(ISD::UMIN); 514 setTargetDAGCombine(ISD::UMAX); 515 setTargetDAGCombine(ISD::SETCC); 516 setTargetDAGCombine(ISD::AND); 517 setTargetDAGCombine(ISD::OR); 518 setTargetDAGCombine(ISD::XOR); 519 setTargetDAGCombine(ISD::SINT_TO_FP); 520 setTargetDAGCombine(ISD::UINT_TO_FP); 521 setTargetDAGCombine(ISD::FCANONICALIZE); 522 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); 523 setTargetDAGCombine(ISD::ZERO_EXTEND); 524 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 525 setTargetDAGCombine(ISD::BUILD_VECTOR); 526 527 // All memory operations. Some folding on the pointer operand is done to help 528 // matching the constant offsets in the addressing modes. 529 setTargetDAGCombine(ISD::LOAD); 530 setTargetDAGCombine(ISD::STORE); 531 setTargetDAGCombine(ISD::ATOMIC_LOAD); 532 setTargetDAGCombine(ISD::ATOMIC_STORE); 533 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); 534 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); 535 setTargetDAGCombine(ISD::ATOMIC_SWAP); 536 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); 537 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); 538 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); 539 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); 540 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); 541 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); 542 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); 543 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); 544 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); 545 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); 546 547 setSchedulingPreference(Sched::RegPressure); 548 } 549 550 const SISubtarget *SITargetLowering::getSubtarget() const { 551 return static_cast<const SISubtarget *>(Subtarget); 552 } 553 554 //===----------------------------------------------------------------------===// 555 // TargetLowering queries 556 //===----------------------------------------------------------------------===// 557 558 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { 559 // SI has some legal vector types, but no legal vector operations. Say no 560 // shuffles are legal in order to prefer scalarizing some vector operations. 
561 return false; 562 } 563 564 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 565 const CallInst &CI, 566 MachineFunction &MF, 567 unsigned IntrID) const { 568 if (const AMDGPU::RsrcIntrinsic *RsrcIntr = 569 AMDGPU::lookupRsrcIntrinsicByIntr(IntrID)) { 570 AttributeList Attr = Intrinsic::getAttributes(CI.getContext(), 571 (Intrinsic::ID)IntrID); 572 if (Attr.hasFnAttribute(Attribute::ReadNone)) 573 return false; 574 575 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 576 577 if (RsrcIntr->IsImage) { 578 Info.ptrVal = MFI->getImagePSV( 579 *MF.getSubtarget<SISubtarget>().getInstrInfo(), 580 CI.getArgOperand(RsrcIntr->RsrcArg)); 581 Info.align = 0; 582 } else { 583 Info.ptrVal = MFI->getBufferPSV( 584 *MF.getSubtarget<SISubtarget>().getInstrInfo(), 585 CI.getArgOperand(RsrcIntr->RsrcArg)); 586 } 587 588 Info.flags = MachineMemOperand::MODereferenceable; 589 if (Attr.hasFnAttribute(Attribute::ReadOnly)) { 590 Info.opc = ISD::INTRINSIC_W_CHAIN; 591 Info.memVT = MVT::getVT(CI.getType()); 592 Info.flags |= MachineMemOperand::MOLoad; 593 } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) { 594 Info.opc = ISD::INTRINSIC_VOID; 595 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); 596 Info.flags |= MachineMemOperand::MOStore; 597 } else { 598 // Atomic 599 Info.opc = ISD::INTRINSIC_W_CHAIN; 600 Info.memVT = MVT::getVT(CI.getType()); 601 Info.flags = MachineMemOperand::MOLoad | 602 MachineMemOperand::MOStore | 603 MachineMemOperand::MODereferenceable; 604 605 // XXX - Should this be volatile without known ordering? 606 Info.flags |= MachineMemOperand::MOVolatile; 607 } 608 return true; 609 } 610 611 switch (IntrID) { 612 case Intrinsic::amdgcn_atomic_inc: 613 case Intrinsic::amdgcn_atomic_dec: 614 case Intrinsic::amdgcn_ds_fadd: 615 case Intrinsic::amdgcn_ds_fmin: 616 case Intrinsic::amdgcn_ds_fmax: { 617 Info.opc = ISD::INTRINSIC_W_CHAIN; 618 Info.memVT = MVT::getVT(CI.getType()); 619 Info.ptrVal = CI.getOperand(0); 620 Info.align = 0; 621 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 622 623 const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4)); 624 if (!Vol || !Vol->isZero()) 625 Info.flags |= MachineMemOperand::MOVolatile; 626 627 return true; 628 } 629 630 default: 631 return false; 632 } 633 } 634 635 bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, 636 SmallVectorImpl<Value*> &Ops, 637 Type *&AccessTy) const { 638 switch (II->getIntrinsicID()) { 639 case Intrinsic::amdgcn_atomic_inc: 640 case Intrinsic::amdgcn_atomic_dec: 641 case Intrinsic::amdgcn_ds_fadd: 642 case Intrinsic::amdgcn_ds_fmin: 643 case Intrinsic::amdgcn_ds_fmax: { 644 Value *Ptr = II->getArgOperand(0); 645 AccessTy = II->getType(); 646 Ops.push_back(Ptr); 647 return true; 648 } 649 default: 650 return false; 651 } 652 } 653 654 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { 655 if (!Subtarget->hasFlatInstOffsets()) { 656 // Flat instructions do not have offsets, and only have the register 657 // address. 658 return AM.BaseOffs == 0 && AM.Scale == 0; 659 } 660 661 // GFX9 added a 13-bit signed offset. When using regular flat instructions, 662 // the sign bit is ignored and is treated as a 12-bit unsigned offset. 
663 664 // Just r + i 665 return isUInt<12>(AM.BaseOffs) && AM.Scale == 0; 666 } 667 668 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { 669 if (Subtarget->hasFlatGlobalInsts()) 670 return isInt<13>(AM.BaseOffs) && AM.Scale == 0; 671 672 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { 673 // Assume the we will use FLAT for all global memory accesses 674 // on VI. 675 // FIXME: This assumption is currently wrong. On VI we still use 676 // MUBUF instructions for the r + i addressing mode. As currently 677 // implemented, the MUBUF instructions only work on buffer < 4GB. 678 // It may be possible to support > 4GB buffers with MUBUF instructions, 679 // by setting the stride value in the resource descriptor which would 680 // increase the size limit to (stride * 4GB). However, this is risky, 681 // because it has never been validated. 682 return isLegalFlatAddressingMode(AM); 683 } 684 685 return isLegalMUBUFAddressingMode(AM); 686 } 687 688 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { 689 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and 690 // additionally can do r + r + i with addr64. 32-bit has more addressing 691 // mode options. Depending on the resource constant, it can also do 692 // (i64 r0) + (i32 r1) * (i14 i). 693 // 694 // Private arrays end up using a scratch buffer most of the time, so also 695 // assume those use MUBUF instructions. Scratch loads / stores are currently 696 // implemented as mubuf instructions with offen bit set, so slightly 697 // different than the normal addr64. 698 if (!isUInt<12>(AM.BaseOffs)) 699 return false; 700 701 // FIXME: Since we can split immediate into soffset and immediate offset, 702 // would it make sense to allow any immediate? 703 704 switch (AM.Scale) { 705 case 0: // r + i or just i, depending on HasBaseReg. 706 return true; 707 case 1: 708 return true; // We have r + r or r + i. 709 case 2: 710 if (AM.HasBaseReg) { 711 // Reject 2 * r + r. 712 return false; 713 } 714 715 // Allow 2 * r as r + r 716 // Or 2 * r + i is allowed as r + r + i. 717 return true; 718 default: // Don't allow n * r 719 return false; 720 } 721 } 722 723 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, 724 const AddrMode &AM, Type *Ty, 725 unsigned AS, Instruction *I) const { 726 // No global is ever allowed as a base. 727 if (AM.BaseGV) 728 return false; 729 730 if (AS == AMDGPUASI.GLOBAL_ADDRESS) 731 return isLegalGlobalAddressingMode(AM); 732 733 if (AS == AMDGPUASI.CONSTANT_ADDRESS || 734 AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) { 735 // If the offset isn't a multiple of 4, it probably isn't going to be 736 // correctly aligned. 737 // FIXME: Can we get the real alignment here? 738 if (AM.BaseOffs % 4 != 0) 739 return isLegalMUBUFAddressingMode(AM); 740 741 // There are no SMRD extloads, so if we have to do a small type access we 742 // will use a MUBUF load. 743 // FIXME?: We also need to do this if unaligned, but we don't know the 744 // alignment here. 745 if (DL.getTypeStoreSize(Ty) < 4) 746 return isLegalGlobalAddressingMode(AM); 747 748 if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { 749 // SMRD instructions have an 8-bit, dword offset on SI. 750 if (!isUInt<8>(AM.BaseOffs / 4)) 751 return false; 752 } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) { 753 // On CI+, this can also be a 32-bit literal constant offset. If it fits 754 // in 8-bits, it can use a smaller encoding. 
755 if (!isUInt<32>(AM.BaseOffs / 4)) 756 return false; 757 } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 758 // On VI, these use the SMEM format and the offset is 20-bit in bytes. 759 if (!isUInt<20>(AM.BaseOffs)) 760 return false; 761 } else 762 llvm_unreachable("unhandled generation"); 763 764 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 765 return true; 766 767 if (AM.Scale == 1 && AM.HasBaseReg) 768 return true; 769 770 return false; 771 772 } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { 773 return isLegalMUBUFAddressingMode(AM); 774 } else if (AS == AMDGPUASI.LOCAL_ADDRESS || 775 AS == AMDGPUASI.REGION_ADDRESS) { 776 // Basic, single offset DS instructions allow a 16-bit unsigned immediate 777 // field. 778 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have 779 // an 8-bit dword offset but we don't know the alignment here. 780 if (!isUInt<16>(AM.BaseOffs)) 781 return false; 782 783 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 784 return true; 785 786 if (AM.Scale == 1 && AM.HasBaseReg) 787 return true; 788 789 return false; 790 } else if (AS == AMDGPUASI.FLAT_ADDRESS || 791 AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) { 792 // For an unknown address space, this usually means that this is for some 793 // reason being used for pure arithmetic, and not based on some addressing 794 // computation. We don't have instructions that compute pointers with any 795 // addressing modes, so treat them as having no offset like flat 796 // instructions. 797 return isLegalFlatAddressingMode(AM); 798 } else { 799 llvm_unreachable("unhandled address space"); 800 } 801 } 802 803 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, 804 const SelectionDAG &DAG) const { 805 if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) { 806 return (MemVT.getSizeInBits() <= 4 * 32); 807 } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { 808 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize(); 809 return (MemVT.getSizeInBits() <= MaxPrivateBits); 810 } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { 811 return (MemVT.getSizeInBits() <= 2 * 32); 812 } 813 return true; 814 } 815 816 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 817 unsigned AddrSpace, 818 unsigned Align, 819 bool *IsFast) const { 820 if (IsFast) 821 *IsFast = false; 822 823 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, 824 // which isn't a simple VT. 825 // Until MVT is extended to handle this, simply check for the size and 826 // rely on the condition below: allow accesses if the size is a multiple of 4. 827 if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 && 828 VT.getStoreSize() > 16)) { 829 return false; 830 } 831 832 if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS || 833 AddrSpace == AMDGPUASI.REGION_ADDRESS) { 834 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte 835 // aligned, 8 byte access in a single operation using ds_read2/write2_b32 836 // with adjacent offsets. 837 bool AlignedBy4 = (Align % 4 == 0); 838 if (IsFast) 839 *IsFast = AlignedBy4; 840 841 return AlignedBy4; 842 } 843 844 // FIXME: We have to be conservative here and assume that flat operations 845 // will access scratch. If we had access to the IR function, then we 846 // could determine if any private memory was used in the function. 
847 if (!Subtarget->hasUnalignedScratchAccess() && 848 (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS || 849 AddrSpace == AMDGPUASI.FLAT_ADDRESS)) { 850 return false; 851 } 852 853 if (Subtarget->hasUnalignedBufferAccess()) { 854 // If we have an uniform constant load, it still requires using a slow 855 // buffer instruction if unaligned. 856 if (IsFast) { 857 *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS || 858 AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ? 859 (Align % 4 == 0) : true; 860 } 861 862 return true; 863 } 864 865 // Smaller than dword value must be aligned. 866 if (VT.bitsLT(MVT::i32)) 867 return false; 868 869 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the 870 // byte-address are ignored, thus forcing Dword alignment. 871 // This applies to private, global, and constant memory. 872 if (IsFast) 873 *IsFast = true; 874 875 return VT.bitsGT(MVT::i32) && Align % 4 == 0; 876 } 877 878 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, 879 unsigned SrcAlign, bool IsMemset, 880 bool ZeroMemset, 881 bool MemcpyStrSrc, 882 MachineFunction &MF) const { 883 // FIXME: Should account for address space here. 884 885 // The default fallback uses the private pointer size as a guess for a type to 886 // use. Make sure we switch these to 64-bit accesses. 887 888 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global 889 return MVT::v4i32; 890 891 if (Size >= 8 && DstAlign >= 4) 892 return MVT::v2i32; 893 894 // Use the default. 895 return MVT::Other; 896 } 897 898 static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) { 899 return AS == AMDGPUASI.GLOBAL_ADDRESS || 900 AS == AMDGPUASI.FLAT_ADDRESS || 901 AS == AMDGPUASI.CONSTANT_ADDRESS || 902 AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT; 903 } 904 905 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, 906 unsigned DestAS) const { 907 return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) && 908 isFlatGlobalAddrSpace(DestAS, AMDGPUASI); 909 } 910 911 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { 912 const MemSDNode *MemNode = cast<MemSDNode>(N); 913 const Value *Ptr = MemNode->getMemOperand()->getValue(); 914 const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); 915 return I && I->getMetadata("amdgpu.noclobber"); 916 } 917 918 bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS, 919 unsigned DestAS) const { 920 // Flat -> private/local is a simple truncate. 921 // Flat -> global is no-op 922 if (SrcAS == AMDGPUASI.FLAT_ADDRESS) 923 return true; 924 925 return isNoopAddrSpaceCast(SrcAS, DestAS); 926 } 927 928 bool SITargetLowering::isMemOpUniform(const SDNode *N) const { 929 const MemSDNode *MemNode = cast<MemSDNode>(N); 930 931 return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand()); 932 } 933 934 TargetLoweringBase::LegalizeTypeAction 935 SITargetLowering::getPreferredVectorAction(EVT VT) const { 936 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) 937 return TypeSplitVector; 938 939 return TargetLoweringBase::getPreferredVectorAction(VT); 940 } 941 942 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 943 Type *Ty) const { 944 // FIXME: Could be smarter if called for vector constants. 945 return true; 946 } 947 948 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { 949 if (Subtarget->has16BitInsts() && VT == MVT::i16) { 950 switch (Op) { 951 case ISD::LOAD: 952 case ISD::STORE: 953 954 // These operations are done with 32-bit instructions anyway. 
955 case ISD::AND: 956 case ISD::OR: 957 case ISD::XOR: 958 case ISD::SELECT: 959 // TODO: Extensions? 960 return true; 961 default: 962 return false; 963 } 964 } 965 966 // SimplifySetCC uses this function to determine whether or not it should 967 // create setcc with i1 operands. We don't have instructions for i1 setcc. 968 if (VT == MVT::i1 && Op == ISD::SETCC) 969 return false; 970 971 return TargetLowering::isTypeDesirableForOp(Op, VT); 972 } 973 974 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, 975 const SDLoc &SL, 976 SDValue Chain, 977 uint64_t Offset) const { 978 const DataLayout &DL = DAG.getDataLayout(); 979 MachineFunction &MF = DAG.getMachineFunction(); 980 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 981 982 const ArgDescriptor *InputPtrReg; 983 const TargetRegisterClass *RC; 984 985 std::tie(InputPtrReg, RC) 986 = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 987 988 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 989 MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS); 990 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, 991 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); 992 993 return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, 994 DAG.getConstant(Offset, SL, PtrVT)); 995 } 996 997 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, 998 const SDLoc &SL) const { 999 auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>(); 1000 uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); 1001 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset); 1002 } 1003 1004 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, 1005 const SDLoc &SL, SDValue Val, 1006 bool Signed, 1007 const ISD::InputArg *Arg) const { 1008 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && 1009 VT.bitsLT(MemVT)) { 1010 unsigned Opc = Arg->Flags.isZExt() ? 
ISD::AssertZext : ISD::AssertSext; 1011 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT)); 1012 } 1013 1014 if (MemVT.isFloatingPoint()) 1015 Val = getFPExtOrFPTrunc(DAG, Val, SL, VT); 1016 else if (Signed) 1017 Val = DAG.getSExtOrTrunc(Val, SL, VT); 1018 else 1019 Val = DAG.getZExtOrTrunc(Val, SL, VT); 1020 1021 return Val; 1022 } 1023 1024 SDValue SITargetLowering::lowerKernargMemParameter( 1025 SelectionDAG &DAG, EVT VT, EVT MemVT, 1026 const SDLoc &SL, SDValue Chain, 1027 uint64_t Offset, bool Signed, 1028 const ISD::InputArg *Arg) const { 1029 const DataLayout &DL = DAG.getDataLayout(); 1030 Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); 1031 PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS); 1032 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); 1033 1034 unsigned Align = DL.getABITypeAlignment(Ty); 1035 1036 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); 1037 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align, 1038 MachineMemOperand::MODereferenceable | 1039 MachineMemOperand::MOInvariant); 1040 1041 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg); 1042 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); 1043 } 1044 1045 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, 1046 const SDLoc &SL, SDValue Chain, 1047 const ISD::InputArg &Arg) const { 1048 MachineFunction &MF = DAG.getMachineFunction(); 1049 MachineFrameInfo &MFI = MF.getFrameInfo(); 1050 1051 if (Arg.Flags.isByVal()) { 1052 unsigned Size = Arg.Flags.getByValSize(); 1053 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false); 1054 return DAG.getFrameIndex(FrameIdx, MVT::i32); 1055 } 1056 1057 unsigned ArgOffset = VA.getLocMemOffset(); 1058 unsigned ArgSize = VA.getValVT().getStoreSize(); 1059 1060 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true); 1061 1062 // Create load nodes to retrieve arguments from the stack. 1063 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); 1064 SDValue ArgValue; 1065 1066 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) 1067 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; 1068 MVT MemVT = VA.getValVT(); 1069 1070 switch (VA.getLocInfo()) { 1071 default: 1072 break; 1073 case CCValAssign::BCvt: 1074 MemVT = VA.getLocVT(); 1075 break; 1076 case CCValAssign::SExt: 1077 ExtType = ISD::SEXTLOAD; 1078 break; 1079 case CCValAssign::ZExt: 1080 ExtType = ISD::ZEXTLOAD; 1081 break; 1082 case CCValAssign::AExt: 1083 ExtType = ISD::EXTLOAD; 1084 break; 1085 } 1086 1087 ArgValue = DAG.getExtLoad( 1088 ExtType, SL, VA.getLocVT(), Chain, FIN, 1089 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 1090 MemVT); 1091 return ArgValue; 1092 } 1093 1094 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, 1095 const SIMachineFunctionInfo &MFI, 1096 EVT VT, 1097 AMDGPUFunctionArgInfo::PreloadedValue PVID) const { 1098 const ArgDescriptor *Reg; 1099 const TargetRegisterClass *RC; 1100 1101 std::tie(Reg, RC) = MFI.getPreloadedValue(PVID); 1102 return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT); 1103 } 1104 1105 static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, 1106 CallingConv::ID CallConv, 1107 ArrayRef<ISD::InputArg> Ins, 1108 BitVector &Skipped, 1109 FunctionType *FType, 1110 SIMachineFunctionInfo *Info) { 1111 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { 1112 const ISD::InputArg &Arg = Ins[I]; 1113 1114 // First check if it's a PS input addr. 
1115 if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() && 1116 !Arg.Flags.isByVal() && PSInputNum <= 15) { 1117 1118 if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { 1119 // We can safely skip PS inputs. 1120 Skipped.set(I); 1121 ++PSInputNum; 1122 continue; 1123 } 1124 1125 Info->markPSInputAllocated(PSInputNum); 1126 if (Arg.Used) 1127 Info->markPSInputEnabled(PSInputNum); 1128 1129 ++PSInputNum; 1130 } 1131 1132 // Second split vertices into their elements. 1133 if (Arg.VT.isVector()) { 1134 ISD::InputArg NewArg = Arg; 1135 NewArg.Flags.setSplit(); 1136 NewArg.VT = Arg.VT.getVectorElementType(); 1137 1138 // We REALLY want the ORIGINAL number of vertex elements here, e.g. a 1139 // three or five element vertex only needs three or five registers, 1140 // NOT four or eight. 1141 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); 1142 unsigned NumElements = ParamType->getVectorNumElements(); 1143 1144 for (unsigned J = 0; J != NumElements; ++J) { 1145 Splits.push_back(NewArg); 1146 NewArg.PartOffset += NewArg.VT.getStoreSize(); 1147 } 1148 } else { 1149 Splits.push_back(Arg); 1150 } 1151 } 1152 } 1153 1154 // Allocate special inputs passed in VGPRs. 1155 static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, 1156 MachineFunction &MF, 1157 const SIRegisterInfo &TRI, 1158 SIMachineFunctionInfo &Info) { 1159 if (Info.hasWorkItemIDX()) { 1160 unsigned Reg = AMDGPU::VGPR0; 1161 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); 1162 1163 CCInfo.AllocateReg(Reg); 1164 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); 1165 } 1166 1167 if (Info.hasWorkItemIDY()) { 1168 unsigned Reg = AMDGPU::VGPR1; 1169 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); 1170 1171 CCInfo.AllocateReg(Reg); 1172 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); 1173 } 1174 1175 if (Info.hasWorkItemIDZ()) { 1176 unsigned Reg = AMDGPU::VGPR2; 1177 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); 1178 1179 CCInfo.AllocateReg(Reg); 1180 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); 1181 } 1182 } 1183 1184 // Try to allocate a VGPR at the end of the argument list, or if no argument 1185 // VGPRs are left allocating a stack slot. 1186 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) { 1187 ArrayRef<MCPhysReg> ArgVGPRs 1188 = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32); 1189 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs); 1190 if (RegIdx == ArgVGPRs.size()) { 1191 // Spill to stack required. 
1192 int64_t Offset = CCInfo.AllocateStack(4, 4); 1193 1194 return ArgDescriptor::createStack(Offset); 1195 } 1196 1197 unsigned Reg = ArgVGPRs[RegIdx]; 1198 Reg = CCInfo.AllocateReg(Reg); 1199 assert(Reg != AMDGPU::NoRegister); 1200 1201 MachineFunction &MF = CCInfo.getMachineFunction(); 1202 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); 1203 return ArgDescriptor::createRegister(Reg); 1204 } 1205 1206 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, 1207 const TargetRegisterClass *RC, 1208 unsigned NumArgRegs) { 1209 ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32); 1210 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs); 1211 if (RegIdx == ArgSGPRs.size()) 1212 report_fatal_error("ran out of SGPRs for arguments"); 1213 1214 unsigned Reg = ArgSGPRs[RegIdx]; 1215 Reg = CCInfo.AllocateReg(Reg); 1216 assert(Reg != AMDGPU::NoRegister); 1217 1218 MachineFunction &MF = CCInfo.getMachineFunction(); 1219 MF.addLiveIn(Reg, RC); 1220 return ArgDescriptor::createRegister(Reg); 1221 } 1222 1223 static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) { 1224 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32); 1225 } 1226 1227 static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) { 1228 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); 1229 } 1230 1231 static void allocateSpecialInputVGPRs(CCState &CCInfo, 1232 MachineFunction &MF, 1233 const SIRegisterInfo &TRI, 1234 SIMachineFunctionInfo &Info) { 1235 if (Info.hasWorkItemIDX()) 1236 Info.setWorkItemIDX(allocateVGPR32Input(CCInfo)); 1237 1238 if (Info.hasWorkItemIDY()) 1239 Info.setWorkItemIDY(allocateVGPR32Input(CCInfo)); 1240 1241 if (Info.hasWorkItemIDZ()) 1242 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo)); 1243 } 1244 1245 static void allocateSpecialInputSGPRs(CCState &CCInfo, 1246 MachineFunction &MF, 1247 const SIRegisterInfo &TRI, 1248 SIMachineFunctionInfo &Info) { 1249 auto &ArgInfo = Info.getArgInfo(); 1250 1251 // TODO: Unify handling with private memory pointers. 1252 1253 if (Info.hasDispatchPtr()) 1254 ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo); 1255 1256 if (Info.hasQueuePtr()) 1257 ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo); 1258 1259 if (Info.hasKernargSegmentPtr()) 1260 ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo); 1261 1262 if (Info.hasDispatchID()) 1263 ArgInfo.DispatchID = allocateSGPR64Input(CCInfo); 1264 1265 // flat_scratch_init is not applicable for non-kernel functions. 1266 1267 if (Info.hasWorkGroupIDX()) 1268 ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo); 1269 1270 if (Info.hasWorkGroupIDY()) 1271 ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo); 1272 1273 if (Info.hasWorkGroupIDZ()) 1274 ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo); 1275 1276 if (Info.hasImplicitArgPtr()) 1277 ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo); 1278 } 1279 1280 // Allocate special inputs passed in user SGPRs. 1281 static void allocateHSAUserSGPRs(CCState &CCInfo, 1282 MachineFunction &MF, 1283 const SIRegisterInfo &TRI, 1284 SIMachineFunctionInfo &Info) { 1285 if (Info.hasImplicitBufferPtr()) { 1286 unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); 1287 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); 1288 CCInfo.AllocateReg(ImplicitBufferPtrReg); 1289 } 1290 1291 // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
1292 if (Info.hasPrivateSegmentBuffer()) { 1293 unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); 1294 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass); 1295 CCInfo.AllocateReg(PrivateSegmentBufferReg); 1296 } 1297 1298 if (Info.hasDispatchPtr()) { 1299 unsigned DispatchPtrReg = Info.addDispatchPtr(TRI); 1300 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); 1301 CCInfo.AllocateReg(DispatchPtrReg); 1302 } 1303 1304 if (Info.hasQueuePtr()) { 1305 unsigned QueuePtrReg = Info.addQueuePtr(TRI); 1306 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); 1307 CCInfo.AllocateReg(QueuePtrReg); 1308 } 1309 1310 if (Info.hasKernargSegmentPtr()) { 1311 unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI); 1312 MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); 1313 CCInfo.AllocateReg(InputPtrReg); 1314 } 1315 1316 if (Info.hasDispatchID()) { 1317 unsigned DispatchIDReg = Info.addDispatchID(TRI); 1318 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); 1319 CCInfo.AllocateReg(DispatchIDReg); 1320 } 1321 1322 if (Info.hasFlatScratchInit()) { 1323 unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI); 1324 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); 1325 CCInfo.AllocateReg(FlatScratchInitReg); 1326 } 1327 1328 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read 1329 // these from the dispatch pointer. 1330 } 1331 1332 // Allocate special input registers that are initialized per-wave. 1333 static void allocateSystemSGPRs(CCState &CCInfo, 1334 MachineFunction &MF, 1335 SIMachineFunctionInfo &Info, 1336 CallingConv::ID CallConv, 1337 bool IsShader) { 1338 if (Info.hasWorkGroupIDX()) { 1339 unsigned Reg = Info.addWorkGroupIDX(); 1340 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); 1341 CCInfo.AllocateReg(Reg); 1342 } 1343 1344 if (Info.hasWorkGroupIDY()) { 1345 unsigned Reg = Info.addWorkGroupIDY(); 1346 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); 1347 CCInfo.AllocateReg(Reg); 1348 } 1349 1350 if (Info.hasWorkGroupIDZ()) { 1351 unsigned Reg = Info.addWorkGroupIDZ(); 1352 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); 1353 CCInfo.AllocateReg(Reg); 1354 } 1355 1356 if (Info.hasWorkGroupInfo()) { 1357 unsigned Reg = Info.addWorkGroupInfo(); 1358 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); 1359 CCInfo.AllocateReg(Reg); 1360 } 1361 1362 if (Info.hasPrivateSegmentWaveByteOffset()) { 1363 // Scratch wave offset passed in system SGPR. 1364 unsigned PrivateSegmentWaveByteOffsetReg; 1365 1366 if (IsShader) { 1367 PrivateSegmentWaveByteOffsetReg = 1368 Info.getPrivateSegmentWaveByteOffsetSystemSGPR(); 1369 1370 // This is true if the scratch wave byte offset doesn't have a fixed 1371 // location. 1372 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) { 1373 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); 1374 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); 1375 } 1376 } else 1377 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset(); 1378 1379 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); 1380 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); 1381 } 1382 } 1383 1384 static void reservePrivateMemoryRegs(const TargetMachine &TM, 1385 MachineFunction &MF, 1386 const SIRegisterInfo &TRI, 1387 SIMachineFunctionInfo &Info) { 1388 // Now that we've figured out where the scratch register inputs are, see if 1389 // should reserve the arguments and use them directly. 
1390 MachineFrameInfo &MFI = MF.getFrameInfo(); 1391 bool HasStackObjects = MFI.hasStackObjects(); 1392 1393 // Record that we know we have non-spill stack objects so we don't need to 1394 // check all stack objects later. 1395 if (HasStackObjects) 1396 Info.setHasNonSpillStackObjects(true); 1397 1398 // Everything live out of a block is spilled with fast regalloc, so it's 1399 // almost certain that spilling will be required. 1400 if (TM.getOptLevel() == CodeGenOpt::None) 1401 HasStackObjects = true; 1402 1403 // For now assume stack access is needed in any callee functions, so we need 1404 // the scratch registers to pass in. 1405 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); 1406 1407 const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); 1408 if (ST.isAmdCodeObjectV2(MF)) { 1409 if (RequiresStackAccess) { 1410 // If we have stack objects, we unquestionably need the private buffer 1411 // resource. For the Code Object V2 ABI, this will be the first 4 user 1412 // SGPR inputs. We can reserve those and use them directly. 1413 1414 unsigned PrivateSegmentBufferReg = Info.getPreloadedReg( 1415 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); 1416 Info.setScratchRSrcReg(PrivateSegmentBufferReg); 1417 1418 if (MFI.hasCalls()) { 1419 // If we have calls, we need to keep the frame register in a register 1420 // that won't be clobbered by a call, so ensure it is copied somewhere. 1421 1422 // This is not a problem for the scratch wave offset, because the same 1423 // registers are reserved in all functions. 1424 1425 // FIXME: Nothing is really ensuring this is a call preserved register, 1426 // it's just selected from the end so it happens to be. 1427 unsigned ReservedOffsetReg 1428 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); 1429 Info.setScratchWaveOffsetReg(ReservedOffsetReg); 1430 } else { 1431 unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg( 1432 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); 1433 Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); 1434 } 1435 } else { 1436 unsigned ReservedBufferReg 1437 = TRI.reservedPrivateSegmentBufferReg(MF); 1438 unsigned ReservedOffsetReg 1439 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); 1440 1441 // We tentatively reserve the last registers (skipping the last two 1442 // which may contain VCC). After register allocation, we'll replace 1443 // these with the ones immediately after those which were really 1444 // allocated. In the prologue copies will be inserted from the argument 1445 // to these reserved registers. 1446 Info.setScratchRSrcReg(ReservedBufferReg); 1447 Info.setScratchWaveOffsetReg(ReservedOffsetReg); 1448 } 1449 } else { 1450 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF); 1451 1452 // Without HSA, relocations are used for the scratch pointer and the 1453 // buffer resource setup is always inserted in the prologue. Scratch wave 1454 // offset is still in an input SGPR. 
1455 Info.setScratchRSrcReg(ReservedBufferReg); 1456 1457 if (HasStackObjects && !MFI.hasCalls()) { 1458 unsigned ScratchWaveOffsetReg = Info.getPreloadedReg( 1459 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); 1460 Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg); 1461 } else { 1462 unsigned ReservedOffsetReg 1463 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); 1464 Info.setScratchWaveOffsetReg(ReservedOffsetReg); 1465 } 1466 } 1467 } 1468 1469 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const { 1470 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 1471 return !Info->isEntryFunction(); 1472 } 1473 1474 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { 1475 1476 } 1477 1478 void SITargetLowering::insertCopiesSplitCSR( 1479 MachineBasicBlock *Entry, 1480 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 1481 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 1482 1483 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 1484 if (!IStart) 1485 return; 1486 1487 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 1488 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 1489 MachineBasicBlock::iterator MBBI = Entry->begin(); 1490 for (const MCPhysReg *I = IStart; *I; ++I) { 1491 const TargetRegisterClass *RC = nullptr; 1492 if (AMDGPU::SReg_64RegClass.contains(*I)) 1493 RC = &AMDGPU::SGPR_64RegClass; 1494 else if (AMDGPU::SReg_32RegClass.contains(*I)) 1495 RC = &AMDGPU::SGPR_32RegClass; 1496 else 1497 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 1498 1499 unsigned NewVR = MRI->createVirtualRegister(RC); 1500 // Create copy from CSR to a virtual register. 1501 Entry->addLiveIn(*I); 1502 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 1503 .addReg(*I); 1504 1505 // Insert the copy-back instructions right before the terminator. 1506 for (auto *Exit : Exits) 1507 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 1508 TII->get(TargetOpcode::COPY), *I) 1509 .addReg(NewVR); 1510 } 1511 } 1512 1513 SDValue SITargetLowering::LowerFormalArguments( 1514 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 1515 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 1516 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 1517 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 1518 1519 MachineFunction &MF = DAG.getMachineFunction(); 1520 FunctionType *FType = MF.getFunction().getFunctionType(); 1521 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1522 const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); 1523 1524 if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { 1525 const Function &Fn = MF.getFunction(); 1526 DiagnosticInfoUnsupported NoGraphicsHSA( 1527 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); 1528 DAG.getContext()->diagnose(NoGraphicsHSA); 1529 return DAG.getEntryNode(); 1530 } 1531 1532 // Create stack objects that are used for emitting debugger prologue if 1533 // "amdgpu-debugger-emit-prologue" attribute was specified. 
1534 if (ST.debuggerEmitPrologue())
1535 createDebuggerPrologueStackObjects(MF);
1536
1537 SmallVector<ISD::InputArg, 16> Splits;
1538 SmallVector<CCValAssign, 16> ArgLocs;
1539 BitVector Skipped(Ins.size());
1540 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1541 *DAG.getContext());
1542
1543 bool IsShader = AMDGPU::isShader(CallConv);
1544 bool IsKernel = AMDGPU::isKernel(CallConv);
1545 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
1546
1547 if (!IsEntryFunc) {
1548 // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1549 // this when allocating argument fixed offsets.
1550 CCInfo.AllocateStack(4, 4);
1551 }
1552
1553 if (IsShader) {
1554 processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1555
1556 // At least one interpolation mode must be enabled or else the GPU will
1557 // hang.
1558 //
1559 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1560 // set PSInputAddr, the user wants to enable some bits after the compilation
1561 // based on run-time states. Since we can't know what the final PSInputEna
1562 // will look like, we shouldn't do anything here and the user should take
1563 // responsibility for the correct programming.
1564 //
1565 // Otherwise, the following restrictions apply:
1566 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1567 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1568 // enabled too.
1569 if (CallConv == CallingConv::AMDGPU_PS) {
1570 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1571 ((Info->getPSInputAddr() & 0xF) == 0 &&
1572 Info->isPSInputAllocated(11))) {
1573 CCInfo.AllocateReg(AMDGPU::VGPR0);
1574 CCInfo.AllocateReg(AMDGPU::VGPR1);
1575 Info->markPSInputAllocated(0);
1576 Info->markPSInputEnabled(0);
1577 }
1578 if (Subtarget->isAmdPalOS()) {
1579 // For isAmdPalOS, the user does not enable some bits after compilation
1580 // based on run-time states; the register values being generated here are
1581 // the final ones set in hardware. Therefore we need to apply the
1582 // workaround to PSInputAddr and PSInputEnable together. (The case where
1583 // a bit is set in PSInputAddr but not PSInputEnable is where the
1584 // frontend set up an input arg for a particular interpolation mode, but
1585 // nothing uses that input arg. Really we should have an earlier pass
1586 // that removes such an arg.)
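//
// Illustrative example (hypothetical shader, not a real pipeline): the
// frontend may have added a LINEAR_CENTER input arg that the shader never
// reads, so its bit is set in PSInputAddr but not in PSInputEnable. The
// mask computed below intersects the two, so the "at least one mode
// enabled" rule is checked against what the hardware will actually see.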
1587 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); 1588 if ((PsInputBits & 0x7F) == 0 || 1589 ((PsInputBits & 0xF) == 0 && 1590 (PsInputBits >> 11 & 1))) 1591 Info->markPSInputEnabled( 1592 countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined)); 1593 } 1594 } 1595 1596 assert(!Info->hasDispatchPtr() && 1597 !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && 1598 !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && 1599 !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && 1600 !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && 1601 !Info->hasWorkItemIDZ()); 1602 } else if (IsKernel) { 1603 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); 1604 } else { 1605 Splits.append(Ins.begin(), Ins.end()); 1606 } 1607 1608 if (IsEntryFunc) { 1609 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); 1610 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); 1611 } 1612 1613 if (IsKernel) { 1614 analyzeFormalArgumentsCompute(CCInfo, Ins); 1615 } else { 1616 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); 1617 CCInfo.AnalyzeFormalArguments(Splits, AssignFn); 1618 } 1619 1620 SmallVector<SDValue, 16> Chains; 1621 1622 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { 1623 const ISD::InputArg &Arg = Ins[i]; 1624 if (Skipped[i]) { 1625 InVals.push_back(DAG.getUNDEF(Arg.VT)); 1626 continue; 1627 } 1628 1629 CCValAssign &VA = ArgLocs[ArgIdx++]; 1630 MVT VT = VA.getLocVT(); 1631 1632 if (IsEntryFunc && VA.isMemLoc()) { 1633 VT = Ins[i].VT; 1634 EVT MemVT = VA.getLocVT(); 1635 1636 const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) + 1637 VA.getLocMemOffset(); 1638 Info->setABIArgOffset(Offset + MemVT.getStoreSize()); 1639 1640 // The first 36 bytes of the input buffer contains information about 1641 // thread group and global sizes. 1642 SDValue Arg = lowerKernargMemParameter( 1643 DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]); 1644 Chains.push_back(Arg.getValue(1)); 1645 1646 auto *ParamTy = 1647 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); 1648 if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && 1649 ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { 1650 // On SI local pointers are just offsets into LDS, so they are always 1651 // less than 16-bits. On CI and newer they could potentially be 1652 // real pointers, so we can't guarantee their size. 1653 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, 1654 DAG.getValueType(MVT::i16)); 1655 } 1656 1657 InVals.push_back(Arg); 1658 continue; 1659 } else if (!IsEntryFunc && VA.isMemLoc()) { 1660 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg); 1661 InVals.push_back(Val); 1662 if (!Arg.Flags.isByVal()) 1663 Chains.push_back(Val.getValue(1)); 1664 continue; 1665 } 1666 1667 assert(VA.isRegLoc() && "Parameter must be in a register!"); 1668 1669 unsigned Reg = VA.getLocReg(); 1670 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); 1671 EVT ValVT = VA.getValVT(); 1672 1673 Reg = MF.addLiveIn(Reg, RC); 1674 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); 1675 1676 if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) { 1677 // The return object should be reasonably addressable. 1678 1679 // FIXME: This helps when the return is a real sret. If it is a 1680 // automatically inserted sret (i.e. CanLowerReturn returns false), an 1681 // extra copy is inserted in SelectionDAGBuilder which obscures this. 
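//
// Rough sketch of the effect (assuming, hypothetically, that N high bits of
// a frame index are known to be zero): the incoming sret pointer is wrapped
// as AssertZext(ptr, i(32 - N)), so later known-bits queries can treat it
// as a small positive offset into the private segment.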
1682 unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits; 1683 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, 1684 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits))); 1685 } 1686 1687 // If this is an 8 or 16-bit value, it is really passed promoted 1688 // to 32 bits. Insert an assert[sz]ext to capture this, then 1689 // truncate to the right size. 1690 switch (VA.getLocInfo()) { 1691 case CCValAssign::Full: 1692 break; 1693 case CCValAssign::BCvt: 1694 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val); 1695 break; 1696 case CCValAssign::SExt: 1697 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, 1698 DAG.getValueType(ValVT)); 1699 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); 1700 break; 1701 case CCValAssign::ZExt: 1702 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, 1703 DAG.getValueType(ValVT)); 1704 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); 1705 break; 1706 case CCValAssign::AExt: 1707 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); 1708 break; 1709 default: 1710 llvm_unreachable("Unknown loc info!"); 1711 } 1712 1713 if (IsShader && Arg.VT.isVector()) { 1714 // Build a vector from the registers 1715 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); 1716 unsigned NumElements = ParamType->getVectorNumElements(); 1717 1718 SmallVector<SDValue, 4> Regs; 1719 Regs.push_back(Val); 1720 for (unsigned j = 1; j != NumElements; ++j) { 1721 Reg = ArgLocs[ArgIdx++].getLocReg(); 1722 Reg = MF.addLiveIn(Reg, RC); 1723 1724 SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); 1725 Regs.push_back(Copy); 1726 } 1727 1728 // Fill up the missing vector elements 1729 NumElements = Arg.VT.getVectorNumElements() - NumElements; 1730 Regs.append(NumElements, DAG.getUNDEF(VT)); 1731 1732 InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs)); 1733 continue; 1734 } 1735 1736 InVals.push_back(Val); 1737 } 1738 1739 if (!IsEntryFunc) { 1740 // Special inputs come after user arguments. 1741 allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); 1742 } 1743 1744 // Start adding system SGPRs. 1745 if (IsEntryFunc) { 1746 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); 1747 } else { 1748 CCInfo.AllocateReg(Info->getScratchRSrcReg()); 1749 CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); 1750 CCInfo.AllocateReg(Info->getFrameOffsetReg()); 1751 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); 1752 } 1753 1754 auto &ArgUsageInfo = 1755 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); 1756 ArgUsageInfo.setFuncArgInfo(MF.getFunction(), Info->getArgInfo()); 1757 1758 unsigned StackArgSize = CCInfo.getNextStackOffset(); 1759 Info->setBytesInStackArgArea(StackArgSize); 1760 1761 return Chains.empty() ? Chain : 1762 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 1763 } 1764 1765 // TODO: If return values can't fit in registers, we should return as many as 1766 // possible in registers before passing on stack. 1767 bool SITargetLowering::CanLowerReturn( 1768 CallingConv::ID CallConv, 1769 MachineFunction &MF, bool IsVarArg, 1770 const SmallVectorImpl<ISD::OutputArg> &Outs, 1771 LLVMContext &Context) const { 1772 // Replacing returns with sret/stack usage doesn't make sense for shaders. 1773 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn 1774 // for shaders. Vector types should be explicitly handled by CC. 
1775 if (AMDGPU::isEntryFunctionCC(CallConv))
1776 return true;
1777
1778 SmallVector<CCValAssign, 16> RVLocs;
1779 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
1780 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
1781 }
1782
1783 SDValue
1784 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
1785 bool isVarArg,
1786 const SmallVectorImpl<ISD::OutputArg> &Outs,
1787 const SmallVectorImpl<SDValue> &OutVals,
1788 const SDLoc &DL, SelectionDAG &DAG) const {
1789 MachineFunction &MF = DAG.getMachineFunction();
1790 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1791
1792 if (AMDGPU::isKernel(CallConv)) {
1793 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
1794 OutVals, DL, DAG);
1795 }
1796
1797 bool IsShader = AMDGPU::isShader(CallConv);
1798
1799 Info->setIfReturnsVoid(Outs.size() == 0);
1800 bool IsWaveEnd = Info->returnsVoid() && IsShader;
1801
1802 SmallVector<ISD::OutputArg, 48> Splits;
1803 SmallVector<SDValue, 48> SplitVals;
1804
1805 // Split vectors into their elements.
1806 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
1807 const ISD::OutputArg &Out = Outs[i];
1808
1809 if (IsShader && Out.VT.isVector()) {
1810 MVT VT = Out.VT.getVectorElementType();
1811 ISD::OutputArg NewOut = Out;
1812 NewOut.Flags.setSplit();
1813 NewOut.VT = VT;
1814
1815 // We want the original number of vector elements here, e.g.
1816 // three or five, not four or eight.
1817 unsigned NumElements = Out.ArgVT.getVectorNumElements();
1818
1819 for (unsigned j = 0; j != NumElements; ++j) {
1820 SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
1821 DAG.getConstant(j, DL, MVT::i32));
1822 SplitVals.push_back(Elem);
1823 Splits.push_back(NewOut);
1824 NewOut.PartOffset += NewOut.VT.getStoreSize();
1825 }
1826 } else {
1827 SplitVals.push_back(OutVals[i]);
1828 Splits.push_back(Out);
1829 }
1830 }
1831
1832 // CCValAssign - represent the assignment of the return value to a location.
1833 SmallVector<CCValAssign, 48> RVLocs;
1834
1835 // CCState - Info about the registers and stack slots.
1836 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1837 *DAG.getContext());
1838
1839 // Analyze outgoing return values.
1840 CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
1841
1842 SDValue Flag;
1843 SmallVector<SDValue, 48> RetOps;
1844 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1845
1846 // Add return address for callable functions.
1847 if (!Info->isEntryFunction()) {
1848 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1849 SDValue ReturnAddrReg = CreateLiveInRegister(
1850 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
1851
1852 // FIXME: Should be able to use a vreg here, but need a way to prevent it
1853 // from being allocated to a CSR.
1854
1855 SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
1856 MVT::i64);
1857
1858 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
1859 Flag = Chain.getValue(1);
1860
1861 RetOps.push_back(PhysReturnAddrReg);
1862 }
1863
1864 // Copy the result values into the output registers.
1865 for (unsigned i = 0, realRVLocIdx = 0;
1866 i != RVLocs.size();
1867 ++i, ++realRVLocIdx) {
1868 CCValAssign &VA = RVLocs[i];
1869 assert(VA.isRegLoc() && "Can only return in registers!");
1870 // TODO: Partially return in registers if return values don't fit.
1871 1872 SDValue Arg = SplitVals[realRVLocIdx]; 1873 1874 // Copied from other backends. 1875 switch (VA.getLocInfo()) { 1876 case CCValAssign::Full: 1877 break; 1878 case CCValAssign::BCvt: 1879 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 1880 break; 1881 case CCValAssign::SExt: 1882 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 1883 break; 1884 case CCValAssign::ZExt: 1885 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 1886 break; 1887 case CCValAssign::AExt: 1888 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 1889 break; 1890 default: 1891 llvm_unreachable("Unknown loc info!"); 1892 } 1893 1894 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); 1895 Flag = Chain.getValue(1); 1896 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 1897 } 1898 1899 // FIXME: Does sret work properly? 1900 if (!Info->isEntryFunction()) { 1901 const SIRegisterInfo *TRI 1902 = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo(); 1903 const MCPhysReg *I = 1904 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 1905 if (I) { 1906 for (; *I; ++I) { 1907 if (AMDGPU::SReg_64RegClass.contains(*I)) 1908 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 1909 else if (AMDGPU::SReg_32RegClass.contains(*I)) 1910 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 1911 else 1912 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 1913 } 1914 } 1915 } 1916 1917 // Update chain and glue. 1918 RetOps[0] = Chain; 1919 if (Flag.getNode()) 1920 RetOps.push_back(Flag); 1921 1922 unsigned Opc = AMDGPUISD::ENDPGM; 1923 if (!IsWaveEnd) 1924 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG; 1925 return DAG.getNode(Opc, DL, MVT::Other, RetOps); 1926 } 1927 1928 SDValue SITargetLowering::LowerCallResult( 1929 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg, 1930 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 1931 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn, 1932 SDValue ThisVal) const { 1933 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg); 1934 1935 // Assign locations to each value returned by this call. 1936 SmallVector<CCValAssign, 16> RVLocs; 1937 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, 1938 *DAG.getContext()); 1939 CCInfo.AnalyzeCallResult(Ins, RetCC); 1940 1941 // Copy all of the result registers out of their specified physreg. 
1942 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1943 CCValAssign VA = RVLocs[i]; 1944 SDValue Val; 1945 1946 if (VA.isRegLoc()) { 1947 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); 1948 Chain = Val.getValue(1); 1949 InFlag = Val.getValue(2); 1950 } else if (VA.isMemLoc()) { 1951 report_fatal_error("TODO: return values in memory"); 1952 } else 1953 llvm_unreachable("unknown argument location type"); 1954 1955 switch (VA.getLocInfo()) { 1956 case CCValAssign::Full: 1957 break; 1958 case CCValAssign::BCvt: 1959 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); 1960 break; 1961 case CCValAssign::ZExt: 1962 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val, 1963 DAG.getValueType(VA.getValVT())); 1964 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); 1965 break; 1966 case CCValAssign::SExt: 1967 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val, 1968 DAG.getValueType(VA.getValVT())); 1969 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); 1970 break; 1971 case CCValAssign::AExt: 1972 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); 1973 break; 1974 default: 1975 llvm_unreachable("Unknown loc info!"); 1976 } 1977 1978 InVals.push_back(Val); 1979 } 1980 1981 return Chain; 1982 } 1983 1984 // Add code to pass special inputs required depending on used features separate 1985 // from the explicit user arguments present in the IR. 1986 void SITargetLowering::passSpecialInputs( 1987 CallLoweringInfo &CLI, 1988 const SIMachineFunctionInfo &Info, 1989 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, 1990 SmallVectorImpl<SDValue> &MemOpChains, 1991 SDValue Chain, 1992 SDValue StackPtr) const { 1993 // If we don't have a call site, this was a call inserted by 1994 // legalization. These can never use special inputs. 1995 if (!CLI.CS) 1996 return; 1997 1998 const Function *CalleeFunc = CLI.CS.getCalledFunction(); 1999 assert(CalleeFunc); 2000 2001 SelectionDAG &DAG = CLI.DAG; 2002 const SDLoc &DL = CLI.DL; 2003 2004 const SISubtarget *ST = getSubtarget(); 2005 const SIRegisterInfo *TRI = ST->getRegisterInfo(); 2006 2007 auto &ArgUsageInfo = 2008 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); 2009 const AMDGPUFunctionArgInfo &CalleeArgInfo 2010 = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); 2011 2012 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo(); 2013 2014 // TODO: Unify with private memory register handling. This is complicated by 2015 // the fact that at least in kernels, the input argument is not necessarily 2016 // in the same location as the input. 
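// Roughly, the loop below does the following for each special input the
// callee expects (see the list that follows): look up where the caller
// received the same value; if the caller has it, copy it out of the
// caller's register, otherwise rematerialize it (only the implicit arg
// pointer needs this), and then either pass it in the callee's expected
// SGPR or store it to the outgoing stack slot recorded for it.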
2017 AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = { 2018 AMDGPUFunctionArgInfo::DISPATCH_PTR, 2019 AMDGPUFunctionArgInfo::QUEUE_PTR, 2020 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR, 2021 AMDGPUFunctionArgInfo::DISPATCH_ID, 2022 AMDGPUFunctionArgInfo::WORKGROUP_ID_X, 2023 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, 2024 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, 2025 AMDGPUFunctionArgInfo::WORKITEM_ID_X, 2026 AMDGPUFunctionArgInfo::WORKITEM_ID_Y, 2027 AMDGPUFunctionArgInfo::WORKITEM_ID_Z, 2028 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR 2029 }; 2030 2031 for (auto InputID : InputRegs) { 2032 const ArgDescriptor *OutgoingArg; 2033 const TargetRegisterClass *ArgRC; 2034 2035 std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID); 2036 if (!OutgoingArg) 2037 continue; 2038 2039 const ArgDescriptor *IncomingArg; 2040 const TargetRegisterClass *IncomingArgRC; 2041 std::tie(IncomingArg, IncomingArgRC) 2042 = CallerArgInfo.getPreloadedValue(InputID); 2043 assert(IncomingArgRC == ArgRC); 2044 2045 // All special arguments are ints for now. 2046 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32; 2047 SDValue InputReg; 2048 2049 if (IncomingArg) { 2050 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg); 2051 } else { 2052 // The implicit arg ptr is special because it doesn't have a corresponding 2053 // input for kernels, and is computed from the kernarg segment pointer. 2054 assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2055 InputReg = getImplicitArgPtr(DAG, DL); 2056 } 2057 2058 if (OutgoingArg->isRegister()) { 2059 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); 2060 } else { 2061 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr, 2062 InputReg, 2063 OutgoingArg->getStackOffset()); 2064 MemOpChains.push_back(ArgStore); 2065 } 2066 } 2067 } 2068 2069 static bool canGuaranteeTCO(CallingConv::ID CC) { 2070 return CC == CallingConv::Fast; 2071 } 2072 2073 /// Return true if we might ever do TCO for calls with this calling convention. 2074 static bool mayTailCallThisCC(CallingConv::ID CC) { 2075 switch (CC) { 2076 case CallingConv::C: 2077 return true; 2078 default: 2079 return canGuaranteeTCO(CC); 2080 } 2081 } 2082 2083 bool SITargetLowering::isEligibleForTailCallOptimization( 2084 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg, 2085 const SmallVectorImpl<ISD::OutputArg> &Outs, 2086 const SmallVectorImpl<SDValue> &OutVals, 2087 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { 2088 if (!mayTailCallThisCC(CalleeCC)) 2089 return false; 2090 2091 MachineFunction &MF = DAG.getMachineFunction(); 2092 const Function &CallerF = MF.getFunction(); 2093 CallingConv::ID CallerCC = CallerF.getCallingConv(); 2094 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 2095 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 2096 2097 // Kernels aren't callable, and don't have a live in return address so it 2098 // doesn't make sense to do a tail call with entry functions. 2099 if (!CallerPreserved) 2100 return false; 2101 2102 bool CCMatch = CallerCC == CalleeCC; 2103 2104 if (DAG.getTarget().Options.GuaranteedTailCallOpt) { 2105 if (canGuaranteeTCO(CalleeCC) && CCMatch) 2106 return true; 2107 return false; 2108 } 2109 2110 // TODO: Can we handle var args? 
2111 if (IsVarArg) 2112 return false; 2113 2114 for (const Argument &Arg : CallerF.args()) { 2115 if (Arg.hasByValAttr()) 2116 return false; 2117 } 2118 2119 LLVMContext &Ctx = *DAG.getContext(); 2120 2121 // Check that the call results are passed in the same way. 2122 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins, 2123 CCAssignFnForCall(CalleeCC, IsVarArg), 2124 CCAssignFnForCall(CallerCC, IsVarArg))) 2125 return false; 2126 2127 // The callee has to preserve all registers the caller needs to preserve. 2128 if (!CCMatch) { 2129 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 2130 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 2131 return false; 2132 } 2133 2134 // Nothing more to check if the callee is taking no arguments. 2135 if (Outs.empty()) 2136 return true; 2137 2138 SmallVector<CCValAssign, 16> ArgLocs; 2139 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx); 2140 2141 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg)); 2142 2143 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 2144 // If the stack arguments for this call do not fit into our own save area then 2145 // the call cannot be made tail. 2146 // TODO: Is this really necessary? 2147 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) 2148 return false; 2149 2150 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2151 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals); 2152 } 2153 2154 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 2155 if (!CI->isTailCall()) 2156 return false; 2157 2158 const Function *ParentFn = CI->getParent()->getParent(); 2159 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv())) 2160 return false; 2161 2162 auto Attr = ParentFn->getFnAttribute("disable-tail-calls"); 2163 return (Attr.getValueAsString() != "true"); 2164 } 2165 2166 // The wave scratch offset register is used as the global base pointer. 2167 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, 2168 SmallVectorImpl<SDValue> &InVals) const { 2169 SelectionDAG &DAG = CLI.DAG; 2170 const SDLoc &DL = CLI.DL; 2171 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 2172 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 2173 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 2174 SDValue Chain = CLI.Chain; 2175 SDValue Callee = CLI.Callee; 2176 bool &IsTailCall = CLI.IsTailCall; 2177 CallingConv::ID CallConv = CLI.CallConv; 2178 bool IsVarArg = CLI.IsVarArg; 2179 bool IsSibCall = false; 2180 bool IsThisReturn = false; 2181 MachineFunction &MF = DAG.getMachineFunction(); 2182 2183 if (IsVarArg) { 2184 return lowerUnhandledCall(CLI, InVals, 2185 "unsupported call to variadic function "); 2186 } 2187 2188 if (!CLI.CS.getCalledFunction()) { 2189 return lowerUnhandledCall(CLI, InVals, 2190 "unsupported indirect call to function "); 2191 } 2192 2193 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) { 2194 return lowerUnhandledCall(CLI, InVals, 2195 "unsupported required tail call to function "); 2196 } 2197 2198 // The first 4 bytes are reserved for the callee's emergency stack slot. 
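//
// Illustrative outgoing-area layout implied by this (byte offsets relative
// to the callee's incoming stack pointer; a sketch, not an ABI statement):
//   [0, 4)  emergency stack slot
//   [4, ..) outgoing arguments, addressed below via CalleeUsableStackOffset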
2199 const unsigned CalleeUsableStackOffset = 4; 2200 2201 if (IsTailCall) { 2202 IsTailCall = isEligibleForTailCallOptimization( 2203 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); 2204 if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) { 2205 report_fatal_error("failed to perform tail call elimination on a call " 2206 "site marked musttail"); 2207 } 2208 2209 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 2210 2211 // A sibling call is one where we're under the usual C ABI and not planning 2212 // to change that but can still do a tail call: 2213 if (!TailCallOpt && IsTailCall) 2214 IsSibCall = true; 2215 2216 if (IsTailCall) 2217 ++NumTailCalls; 2218 } 2219 2220 if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) { 2221 // FIXME: Remove this hack for function pointer types after removing 2222 // support of old address space mapping. In the new address space 2223 // mapping the pointer in default address space is 64 bit, therefore 2224 // does not need this hack. 2225 if (Callee.getValueType() == MVT::i32) { 2226 const GlobalValue *GV = GA->getGlobal(); 2227 Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false, 2228 GA->getTargetFlags()); 2229 } 2230 } 2231 assert(Callee.getValueType() == MVT::i64); 2232 2233 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2234 2235 // Analyze operands of the call, assigning locations to each operand. 2236 SmallVector<CCValAssign, 16> ArgLocs; 2237 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); 2238 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); 2239 CCInfo.AnalyzeCallOperands(Outs, AssignFn); 2240 2241 // Get a count of how many bytes are to be pushed on the stack. 2242 unsigned NumBytes = CCInfo.getNextStackOffset(); 2243 2244 if (IsSibCall) { 2245 // Since we're not changing the ABI to make this a tail call, the memory 2246 // operands are already available in the caller's incoming argument space. 2247 NumBytes = 0; 2248 } 2249 2250 // FPDiff is the byte offset of the call's argument area from the callee's. 2251 // Stores to callee stack arguments will be placed in FixedStackSlots offset 2252 // by this amount for a tail call. In a sibling call it must be 0 because the 2253 // caller will deallocate the entire stack and the callee still expects its 2254 // arguments to begin at SP+0. Completely unused for non-tail calls. 2255 int32_t FPDiff = 0; 2256 MachineFrameInfo &MFI = MF.getFrameInfo(); 2257 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 2258 2259 SDValue CallerSavedFP; 2260 2261 // Adjust the stack pointer for the new arguments... 2262 // These operations are automatically eliminated by the prolog/epilog pass 2263 if (!IsSibCall) { 2264 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); 2265 2266 unsigned OffsetReg = Info->getScratchWaveOffsetReg(); 2267 2268 // In the HSA case, this should be an identity copy. 2269 SDValue ScratchRSrcReg 2270 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); 2271 RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); 2272 2273 // TODO: Don't hardcode these registers and get from the callee function. 2274 SDValue ScratchWaveOffsetReg 2275 = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32); 2276 RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg); 2277 2278 if (!Info->isEntryFunction()) { 2279 // Avoid clobbering this function's FP value. 
In the current convention 2280 // callee will overwrite this, so do save/restore around the call site. 2281 CallerSavedFP = DAG.getCopyFromReg(Chain, DL, 2282 Info->getFrameOffsetReg(), MVT::i32); 2283 } 2284 } 2285 2286 // Stack pointer relative accesses are done by changing the offset SGPR. This 2287 // is just the VGPR offset component. 2288 SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32); 2289 2290 SmallVector<SDValue, 8> MemOpChains; 2291 MVT PtrVT = MVT::i32; 2292 2293 // Walk the register/memloc assignments, inserting copies/loads. 2294 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; 2295 ++i, ++realArgIdx) { 2296 CCValAssign &VA = ArgLocs[i]; 2297 SDValue Arg = OutVals[realArgIdx]; 2298 2299 // Promote the value if needed. 2300 switch (VA.getLocInfo()) { 2301 case CCValAssign::Full: 2302 break; 2303 case CCValAssign::BCvt: 2304 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 2305 break; 2306 case CCValAssign::ZExt: 2307 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 2308 break; 2309 case CCValAssign::SExt: 2310 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 2311 break; 2312 case CCValAssign::AExt: 2313 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 2314 break; 2315 case CCValAssign::FPExt: 2316 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); 2317 break; 2318 default: 2319 llvm_unreachable("Unknown loc info!"); 2320 } 2321 2322 if (VA.isRegLoc()) { 2323 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2324 } else { 2325 assert(VA.isMemLoc()); 2326 2327 SDValue DstAddr; 2328 MachinePointerInfo DstInfo; 2329 2330 unsigned LocMemOffset = VA.getLocMemOffset(); 2331 int32_t Offset = LocMemOffset; 2332 2333 SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset); 2334 2335 if (IsTailCall) { 2336 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2337 unsigned OpSize = Flags.isByVal() ? 2338 Flags.getByValSize() : VA.getValVT().getStoreSize(); 2339 2340 Offset = Offset + FPDiff; 2341 int FI = MFI.CreateFixedObject(OpSize, Offset, true); 2342 2343 DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT), 2344 StackPtr); 2345 DstInfo = MachinePointerInfo::getFixedStack(MF, FI); 2346 2347 // Make sure any stack arguments overlapping with where we're storing 2348 // are loaded before this eventual operation. Otherwise they'll be 2349 // clobbered. 2350 2351 // FIXME: Why is this really necessary? This seems to just result in a 2352 // lot of code to copy the stack and write them back to the same 2353 // locations, which are supposed to be immutable? 2354 Chain = addTokenForArgument(Chain, DAG, MFI, FI); 2355 } else { 2356 DstAddr = PtrOff; 2357 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); 2358 } 2359 2360 if (Outs[i].Flags.isByVal()) { 2361 SDValue SizeNode = 2362 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32); 2363 SDValue Cpy = DAG.getMemcpy( 2364 Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), 2365 /*isVol = */ false, /*AlwaysInline = */ true, 2366 /*isTailCall = */ false, DstInfo, 2367 MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy( 2368 *DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS)))); 2369 2370 MemOpChains.push_back(Cpy); 2371 } else { 2372 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); 2373 MemOpChains.push_back(Store); 2374 } 2375 } 2376 } 2377 2378 // Copy special input registers after user input arguments. 
2379 passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
2380
2381 if (!MemOpChains.empty())
2382 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2383
2384 // Build a sequence of copy-to-reg nodes chained together with token chain
2385 // and flag operands which copy the outgoing args into the appropriate regs.
2386 SDValue InFlag;
2387 for (auto &RegToPass : RegsToPass) {
2388 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2389 RegToPass.second, InFlag);
2390 InFlag = Chain.getValue(1);
2391 }
2392
2393
2394 SDValue PhysReturnAddrReg;
2395 if (IsTailCall) {
2396 // Since the return is being combined with the call, we need to pass on the
2397 // return address.
2398
2399 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2400 SDValue ReturnAddrReg = CreateLiveInRegister(
2401 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2402
2403 PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2404 MVT::i64);
2405 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2406 InFlag = Chain.getValue(1);
2407 }
2408
2409 // We don't usually want to end the call-sequence here because we would tidy
2410 // the frame up *after* the call, however in the ABI-changing tail-call case
2411 // we've carefully laid out the parameters so that when sp is reset they'll be
2412 // in the correct location.
2413 if (IsTailCall && !IsSibCall) {
2414 Chain = DAG.getCALLSEQ_END(Chain,
2415 DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2416 DAG.getTargetConstant(0, DL, MVT::i32),
2417 InFlag, DL);
2418 InFlag = Chain.getValue(1);
2419 }
2420
2421 std::vector<SDValue> Ops;
2422 Ops.push_back(Chain);
2423 Ops.push_back(Callee);
2424
2425 if (IsTailCall) {
2426 // Each tail call may have to adjust the stack by a different amount, so
2427 // this information must travel along with the operation for eventual
2428 // consumption by emitEpilogue.
2429 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
2430
2431 Ops.push_back(PhysReturnAddrReg);
2432 }
2433
2434 // Add argument registers to the end of the list so that they are known live
2435 // into the call.
2436 for (auto &RegToPass : RegsToPass) {
2437 Ops.push_back(DAG.getRegister(RegToPass.first,
2438 RegToPass.second.getValueType()));
2439 }
2440
2441 // Add a register mask operand representing the call-preserved registers.
2442
2443 const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
2444 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2445 assert(Mask && "Missing call preserved mask for calling convention");
2446 Ops.push_back(DAG.getRegisterMask(Mask));
2447
2448 if (InFlag.getNode())
2449 Ops.push_back(InFlag);
2450
2451 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2452
2453 // If we're doing a tail call, use a TC_RETURN here rather than an
2454 // actual call instruction.
2455 if (IsTailCall) {
2456 MFI.setHasTailCall();
2457 return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
2458 }
2459
2460 // Returns a chain and a flag for retval copy to use.
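// (The InFlag glue built above ties the outgoing argument copies to this
// call node; the call's own glue result is then what LowerCallResult uses
// when copying return values out of their physical registers.)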
2461 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops); 2462 Chain = Call.getValue(0); 2463 InFlag = Call.getValue(1); 2464 2465 if (CallerSavedFP) { 2466 SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32); 2467 Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag); 2468 InFlag = Chain.getValue(1); 2469 } 2470 2471 uint64_t CalleePopBytes = NumBytes; 2472 Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32), 2473 DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32), 2474 InFlag, DL); 2475 if (!Ins.empty()) 2476 InFlag = Chain.getValue(1); 2477 2478 // Handle result values, copying them out of physregs into vregs that we 2479 // return. 2480 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, 2481 InVals, IsThisReturn, 2482 IsThisReturn ? OutVals[0] : SDValue()); 2483 } 2484 2485 unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, 2486 SelectionDAG &DAG) const { 2487 unsigned Reg = StringSwitch<unsigned>(RegName) 2488 .Case("m0", AMDGPU::M0) 2489 .Case("exec", AMDGPU::EXEC) 2490 .Case("exec_lo", AMDGPU::EXEC_LO) 2491 .Case("exec_hi", AMDGPU::EXEC_HI) 2492 .Case("flat_scratch", AMDGPU::FLAT_SCR) 2493 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) 2494 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) 2495 .Default(AMDGPU::NoRegister); 2496 2497 if (Reg == AMDGPU::NoRegister) { 2498 report_fatal_error(Twine("invalid register name \"" 2499 + StringRef(RegName) + "\".")); 2500 2501 } 2502 2503 if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && 2504 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) { 2505 report_fatal_error(Twine("invalid register \"" 2506 + StringRef(RegName) + "\" for subtarget.")); 2507 } 2508 2509 switch (Reg) { 2510 case AMDGPU::M0: 2511 case AMDGPU::EXEC_LO: 2512 case AMDGPU::EXEC_HI: 2513 case AMDGPU::FLAT_SCR_LO: 2514 case AMDGPU::FLAT_SCR_HI: 2515 if (VT.getSizeInBits() == 32) 2516 return Reg; 2517 break; 2518 case AMDGPU::EXEC: 2519 case AMDGPU::FLAT_SCR: 2520 if (VT.getSizeInBits() == 64) 2521 return Reg; 2522 break; 2523 default: 2524 llvm_unreachable("missing register type checking"); 2525 } 2526 2527 report_fatal_error(Twine("invalid type for register \"" 2528 + StringRef(RegName) + "\".")); 2529 } 2530 2531 // If kill is not the last instruction, split the block so kill is always a 2532 // proper terminator. 2533 MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI, 2534 MachineBasicBlock *BB) const { 2535 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 2536 2537 MachineBasicBlock::iterator SplitPoint(&MI); 2538 ++SplitPoint; 2539 2540 if (SplitPoint == BB->end()) { 2541 // Don't bother with a new block. 2542 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode())); 2543 return BB; 2544 } 2545 2546 MachineFunction *MF = BB->getParent(); 2547 MachineBasicBlock *SplitBB 2548 = MF->CreateMachineBasicBlock(BB->getBasicBlock()); 2549 2550 MF->insert(++MachineFunction::iterator(BB), SplitBB); 2551 SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end()); 2552 2553 SplitBB->transferSuccessorsAndUpdatePHIs(BB); 2554 BB->addSuccessor(SplitBB); 2555 2556 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode())); 2557 return SplitBB; 2558 } 2559 2560 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the 2561 // wavefront. If the value is uniform and just happens to be in a VGPR, this 2562 // will only do one iteration. In the worst case, this will loop 64 times. 
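//
// The emitted loop looks roughly like this (illustrative sketch only; the
// real sequence is built instruction by instruction below, and the indexed
// v_movrels/v_movreld itself is inserted by the caller):
//
//   loop:
//     v_readfirstlane_b32 s_idx, v_idx
//     v_cmp_eq_u32        s_cond, s_idx, v_idx
//     s_and_saveexec_b64  s_save, s_cond
//     s_mov_b32           m0, s_idx     ; or s_add_i32 m0 / s_set_gpr_idx_on
//     <indexed move>
//     s_xor_b64           exec, exec, s_save
//     s_cbranch_execnz    loop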
2563 // 2564 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value. 2565 static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( 2566 const SIInstrInfo *TII, 2567 MachineRegisterInfo &MRI, 2568 MachineBasicBlock &OrigBB, 2569 MachineBasicBlock &LoopBB, 2570 const DebugLoc &DL, 2571 const MachineOperand &IdxReg, 2572 unsigned InitReg, 2573 unsigned ResultReg, 2574 unsigned PhiReg, 2575 unsigned InitSaveExecReg, 2576 int Offset, 2577 bool UseGPRIdxMode, 2578 bool IsIndirectSrc) { 2579 MachineBasicBlock::iterator I = LoopBB.begin(); 2580 2581 unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 2582 unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 2583 unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2584 unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 2585 2586 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) 2587 .addReg(InitReg) 2588 .addMBB(&OrigBB) 2589 .addReg(ResultReg) 2590 .addMBB(&LoopBB); 2591 2592 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec) 2593 .addReg(InitSaveExecReg) 2594 .addMBB(&OrigBB) 2595 .addReg(NewExec) 2596 .addMBB(&LoopBB); 2597 2598 // Read the next variant <- also loop target. 2599 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg) 2600 .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); 2601 2602 // Compare the just read M0 value to all possible Idx values. 2603 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg) 2604 .addReg(CurrentIdxReg) 2605 .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg()); 2606 2607 // Update EXEC, save the original EXEC value to VCC. 2608 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec) 2609 .addReg(CondReg, RegState::Kill); 2610 2611 MRI.setSimpleHint(NewExec, CondReg); 2612 2613 if (UseGPRIdxMode) { 2614 unsigned IdxReg; 2615 if (Offset == 0) { 2616 IdxReg = CurrentIdxReg; 2617 } else { 2618 IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 2619 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg) 2620 .addReg(CurrentIdxReg, RegState::Kill) 2621 .addImm(Offset); 2622 } 2623 unsigned IdxMode = IsIndirectSrc ? 2624 VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; 2625 MachineInstr *SetOn = 2626 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) 2627 .addReg(IdxReg, RegState::Kill) 2628 .addImm(IdxMode); 2629 SetOn->getOperand(3).setIsUndef(); 2630 } else { 2631 // Move index from VCC into M0 2632 if (Offset == 0) { 2633 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) 2634 .addReg(CurrentIdxReg, RegState::Kill); 2635 } else { 2636 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) 2637 .addReg(CurrentIdxReg, RegState::Kill) 2638 .addImm(Offset); 2639 } 2640 } 2641 2642 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 2643 MachineInstr *InsertPt = 2644 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) 2645 .addReg(AMDGPU::EXEC) 2646 .addReg(NewExec); 2647 2648 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use 2649 // s_cbranch_scc0? 2650 2651 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. 2652 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) 2653 .addMBB(&LoopBB); 2654 2655 return InsertPt->getIterator(); 2656 } 2657 2658 // This has slightly sub-optimal regalloc when the source vector is killed by 2659 // the read. 
The register allocator does not understand that the kill is 2660 // per-workitem, so is kept alive for the whole loop so we end up not re-using a 2661 // subregister from it, using 1 more VGPR than necessary. This was saved when 2662 // this was expanded after register allocation. 2663 static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, 2664 MachineBasicBlock &MBB, 2665 MachineInstr &MI, 2666 unsigned InitResultReg, 2667 unsigned PhiReg, 2668 int Offset, 2669 bool UseGPRIdxMode, 2670 bool IsIndirectSrc) { 2671 MachineFunction *MF = MBB.getParent(); 2672 MachineRegisterInfo &MRI = MF->getRegInfo(); 2673 const DebugLoc &DL = MI.getDebugLoc(); 2674 MachineBasicBlock::iterator I(&MI); 2675 2676 unsigned DstReg = MI.getOperand(0).getReg(); 2677 unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 2678 unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); 2679 2680 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec); 2681 2682 // Save the EXEC mask 2683 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec) 2684 .addReg(AMDGPU::EXEC); 2685 2686 // To insert the loop we need to split the block. Move everything after this 2687 // point to a new block, and insert a new empty block between the two. 2688 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); 2689 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); 2690 MachineFunction::iterator MBBI(MBB); 2691 ++MBBI; 2692 2693 MF->insert(MBBI, LoopBB); 2694 MF->insert(MBBI, RemainderBB); 2695 2696 LoopBB->addSuccessor(LoopBB); 2697 LoopBB->addSuccessor(RemainderBB); 2698 2699 // Move the rest of the block into a new block. 2700 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 2701 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); 2702 2703 MBB.addSuccessor(LoopBB); 2704 2705 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 2706 2707 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx, 2708 InitResultReg, DstReg, PhiReg, TmpExec, 2709 Offset, UseGPRIdxMode, IsIndirectSrc); 2710 2711 MachineBasicBlock::iterator First = RemainderBB->begin(); 2712 BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) 2713 .addReg(SaveExec); 2714 2715 return InsPt; 2716 } 2717 2718 // Returns subreg index, offset 2719 static std::pair<unsigned, int> 2720 computeIndirectRegAndOffset(const SIRegisterInfo &TRI, 2721 const TargetRegisterClass *SuperRC, 2722 unsigned VecReg, 2723 int Offset) { 2724 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32; 2725 2726 // Skip out of bounds offsets, or else we would end up using an undefined 2727 // register. 2728 if (Offset >= NumElts || Offset < 0) 2729 return std::make_pair(AMDGPU::sub0, Offset); 2730 2731 return std::make_pair(AMDGPU::sub0 + Offset, 0); 2732 } 2733 2734 // Return true if the index is an SGPR and was set. 
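//
// For example, with an SGPR index in s_idx and a non-zero constant
// subregister offset this emits roughly (illustrative):
//   s_add_i32 m0, s_idx, <offset>                  ; movrel path
// or, when GPR indexing mode is in use:
//   s_add_i32        s_tmp, s_idx, <offset>
//   s_set_gpr_idx_on s_tmp, <SRC0 or DST>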
2735 static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, 2736 MachineRegisterInfo &MRI, 2737 MachineInstr &MI, 2738 int Offset, 2739 bool UseGPRIdxMode, 2740 bool IsIndirectSrc) { 2741 MachineBasicBlock *MBB = MI.getParent(); 2742 const DebugLoc &DL = MI.getDebugLoc(); 2743 MachineBasicBlock::iterator I(&MI); 2744 2745 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 2746 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); 2747 2748 assert(Idx->getReg() != AMDGPU::NoRegister); 2749 2750 if (!TII->getRegisterInfo().isSGPRClass(IdxRC)) 2751 return false; 2752 2753 if (UseGPRIdxMode) { 2754 unsigned IdxMode = IsIndirectSrc ? 2755 VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; 2756 if (Offset == 0) { 2757 MachineInstr *SetOn = 2758 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) 2759 .add(*Idx) 2760 .addImm(IdxMode); 2761 2762 SetOn->getOperand(3).setIsUndef(); 2763 } else { 2764 unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 2765 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) 2766 .add(*Idx) 2767 .addImm(Offset); 2768 MachineInstr *SetOn = 2769 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) 2770 .addReg(Tmp, RegState::Kill) 2771 .addImm(IdxMode); 2772 2773 SetOn->getOperand(3).setIsUndef(); 2774 } 2775 2776 return true; 2777 } 2778 2779 if (Offset == 0) { 2780 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) 2781 .add(*Idx); 2782 } else { 2783 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) 2784 .add(*Idx) 2785 .addImm(Offset); 2786 } 2787 2788 return true; 2789 } 2790 2791 // Control flow needs to be inserted if indexing with a VGPR. 2792 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, 2793 MachineBasicBlock &MBB, 2794 const SISubtarget &ST) { 2795 const SIInstrInfo *TII = ST.getInstrInfo(); 2796 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 2797 MachineFunction *MF = MBB.getParent(); 2798 MachineRegisterInfo &MRI = MF->getRegInfo(); 2799 2800 unsigned Dst = MI.getOperand(0).getReg(); 2801 unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); 2802 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); 2803 2804 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg); 2805 2806 unsigned SubReg; 2807 std::tie(SubReg, Offset) 2808 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset); 2809 2810 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode); 2811 2812 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) { 2813 MachineBasicBlock::iterator I(&MI); 2814 const DebugLoc &DL = MI.getDebugLoc(); 2815 2816 if (UseGPRIdxMode) { 2817 // TODO: Look at the uses to avoid the copy. This may require rescheduling 2818 // to avoid interfering with other uses, so probably requires a new 2819 // optimization pass. 
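// The sequence built for this case is roughly (illustrative):
//   s_set_gpr_idx_on s_idx, SRC0       ; emitted by setM0ToIndexFromSGPR above
//   v_mov_b32        v_dst, v_vec[sub] ; actually reads v_vec[idx]
//   s_set_gpr_idx_off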
2820 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst) 2821 .addReg(SrcReg, RegState::Undef, SubReg) 2822 .addReg(SrcReg, RegState::Implicit) 2823 .addReg(AMDGPU::M0, RegState::Implicit); 2824 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); 2825 } else { 2826 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) 2827 .addReg(SrcReg, RegState::Undef, SubReg) 2828 .addReg(SrcReg, RegState::Implicit); 2829 } 2830 2831 MI.eraseFromParent(); 2832 2833 return &MBB; 2834 } 2835 2836 const DebugLoc &DL = MI.getDebugLoc(); 2837 MachineBasicBlock::iterator I(&MI); 2838 2839 unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2840 unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2841 2842 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); 2843 2844 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, 2845 Offset, UseGPRIdxMode, true); 2846 MachineBasicBlock *LoopBB = InsPt->getParent(); 2847 2848 if (UseGPRIdxMode) { 2849 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst) 2850 .addReg(SrcReg, RegState::Undef, SubReg) 2851 .addReg(SrcReg, RegState::Implicit) 2852 .addReg(AMDGPU::M0, RegState::Implicit); 2853 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); 2854 } else { 2855 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) 2856 .addReg(SrcReg, RegState::Undef, SubReg) 2857 .addReg(SrcReg, RegState::Implicit); 2858 } 2859 2860 MI.eraseFromParent(); 2861 2862 return LoopBB; 2863 } 2864 2865 static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI, 2866 const TargetRegisterClass *VecRC) { 2867 switch (TRI.getRegSizeInBits(*VecRC)) { 2868 case 32: // 4 bytes 2869 return AMDGPU::V_MOVRELD_B32_V1; 2870 case 64: // 8 bytes 2871 return AMDGPU::V_MOVRELD_B32_V2; 2872 case 128: // 16 bytes 2873 return AMDGPU::V_MOVRELD_B32_V4; 2874 case 256: // 32 bytes 2875 return AMDGPU::V_MOVRELD_B32_V8; 2876 case 512: // 64 bytes 2877 return AMDGPU::V_MOVRELD_B32_V16; 2878 default: 2879 llvm_unreachable("unsupported size for MOVRELD pseudos"); 2880 } 2881 } 2882 2883 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, 2884 MachineBasicBlock &MBB, 2885 const SISubtarget &ST) { 2886 const SIInstrInfo *TII = ST.getInstrInfo(); 2887 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 2888 MachineFunction *MF = MBB.getParent(); 2889 MachineRegisterInfo &MRI = MF->getRegInfo(); 2890 2891 unsigned Dst = MI.getOperand(0).getReg(); 2892 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); 2893 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 2894 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); 2895 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); 2896 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg()); 2897 2898 // This can be an immediate, but will be folded later. 
2899 assert(Val->getReg());
2900
2901 unsigned SubReg;
2902 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
2903 SrcVec->getReg(),
2904 Offset);
2905 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
2906
2907 if (Idx->getReg() == AMDGPU::NoRegister) {
2908 MachineBasicBlock::iterator I(&MI);
2909 const DebugLoc &DL = MI.getDebugLoc();
2910
2911 assert(Offset == 0);
2912
2913 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
2914 .add(*SrcVec)
2915 .add(*Val)
2916 .addImm(SubReg);
2917
2918 MI.eraseFromParent();
2919 return &MBB;
2920 }
2921
2922 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
2923 MachineBasicBlock::iterator I(&MI);
2924 const DebugLoc &DL = MI.getDebugLoc();
2925
2926 if (UseGPRIdxMode) {
2927 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
2928 .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
2929 .add(*Val)
2930 .addReg(Dst, RegState::ImplicitDefine)
2931 .addReg(SrcVec->getReg(), RegState::Implicit)
2932 .addReg(AMDGPU::M0, RegState::Implicit);
2933
2934 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2935 } else {
2936 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
2937
2938 BuildMI(MBB, I, DL, MovRelDesc)
2939 .addReg(Dst, RegState::Define)
2940 .addReg(SrcVec->getReg())
2941 .add(*Val)
2942 .addImm(SubReg - AMDGPU::sub0);
2943 }
2944
2945 MI.eraseFromParent();
2946 return &MBB;
2947 }
2948
2949 if (Val->isReg())
2950 MRI.clearKillFlags(Val->getReg());
2951
2952 const DebugLoc &DL = MI.getDebugLoc();
2953
2954 unsigned PhiReg = MRI.createVirtualRegister(VecRC);
2955
2956 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
2957 Offset, UseGPRIdxMode, false);
2958 MachineBasicBlock *LoopBB = InsPt->getParent();
2959
2960 if (UseGPRIdxMode) {
2961 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
2962 .addReg(PhiReg, RegState::Undef, SubReg) // vdst
2963 .add(*Val) // src0
2964 .addReg(Dst, RegState::ImplicitDefine)
2965 .addReg(PhiReg, RegState::Implicit)
2966 .addReg(AMDGPU::M0, RegState::Implicit);
2967 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
2968 } else {
2969 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
2970
2971 BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
2972 .addReg(Dst, RegState::Define)
2973 .addReg(PhiReg)
2974 .add(*Val)
2975 .addImm(SubReg - AMDGPU::sub0);
2976 }
2977
2978 MI.eraseFromParent();
2979
2980 return LoopBB;
2981 }
2982
2983 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
2984 MachineInstr &MI, MachineBasicBlock *BB) const {
2985
2986 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2987 MachineFunction *MF = BB->getParent();
2988 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2989
2990 if (TII->isMIMG(MI)) {
2991 if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
2992 report_fatal_error("missing mem operand from MIMG instruction");
2993 }
2994 // Add a memoperand for mimg instructions so that they aren't assumed to
2995 // be ordered memory instructions.
2996 2997 return BB; 2998 } 2999 3000 switch (MI.getOpcode()) { 3001 case AMDGPU::S_ADD_U64_PSEUDO: 3002 case AMDGPU::S_SUB_U64_PSEUDO: { 3003 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 3004 const DebugLoc &DL = MI.getDebugLoc(); 3005 3006 MachineOperand &Dest = MI.getOperand(0); 3007 MachineOperand &Src0 = MI.getOperand(1); 3008 MachineOperand &Src1 = MI.getOperand(2); 3009 3010 unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 3011 unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 3012 3013 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, 3014 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0, 3015 &AMDGPU::SReg_32_XM0RegClass); 3016 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, 3017 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1, 3018 &AMDGPU::SReg_32_XM0RegClass); 3019 3020 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, 3021 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0, 3022 &AMDGPU::SReg_32_XM0RegClass); 3023 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, 3024 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1, 3025 &AMDGPU::SReg_32_XM0RegClass); 3026 3027 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); 3028 3029 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 3030 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; 3031 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) 3032 .add(Src0Sub0) 3033 .add(Src1Sub0); 3034 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) 3035 .add(Src0Sub1) 3036 .add(Src1Sub1); 3037 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) 3038 .addReg(DestSub0) 3039 .addImm(AMDGPU::sub0) 3040 .addReg(DestSub1) 3041 .addImm(AMDGPU::sub1); 3042 MI.eraseFromParent(); 3043 return BB; 3044 } 3045 case AMDGPU::SI_INIT_M0: { 3046 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), 3047 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) 3048 .add(MI.getOperand(0)); 3049 MI.eraseFromParent(); 3050 return BB; 3051 } 3052 case AMDGPU::SI_INIT_EXEC: 3053 // This should be before all vector instructions. 3054 BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), 3055 AMDGPU::EXEC) 3056 .addImm(MI.getOperand(0).getImm()); 3057 MI.eraseFromParent(); 3058 return BB; 3059 3060 case AMDGPU::SI_INIT_EXEC_FROM_INPUT: { 3061 // Extract the thread count from an SGPR input and set EXEC accordingly. 3062 // Since BFM can't shift by 64, handle that case with CMP + CMOV. 3063 // 3064 // S_BFE_U32 count, input, {shift, 7} 3065 // S_BFM_B64 exec, count, 0 3066 // S_CMP_EQ_U32 count, 64 3067 // S_CMOV_B64 exec, -1 3068 MachineInstr *FirstMI = &*BB->begin(); 3069 MachineRegisterInfo &MRI = MF->getRegInfo(); 3070 unsigned InputReg = MI.getOperand(0).getReg(); 3071 unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 3072 bool Found = false; 3073 3074 // Move the COPY of the input reg to the beginning, so that we can use it. 3075 for (auto I = BB->begin(); I != &MI; I++) { 3076 if (I->getOpcode() != TargetOpcode::COPY || 3077 I->getOperand(0).getReg() != InputReg) 3078 continue; 3079 3080 if (I == FirstMI) { 3081 FirstMI = &*++BB->begin(); 3082 } else { 3083 I->removeFromParent(); 3084 BB->insert(FirstMI, &*I); 3085 } 3086 Found = true; 3087 break; 3088 } 3089 assert(Found); 3090 (void)Found; 3091 3092 // This should be before all vector instructions. 
3093 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3094 .addReg(InputReg)
3095 .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3096 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3097 AMDGPU::EXEC)
3098 .addReg(CountReg)
3099 .addImm(0);
3100 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3101 .addReg(CountReg, RegState::Kill)
3102 .addImm(64);
3103 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3104 AMDGPU::EXEC)
3105 .addImm(-1);
3106 MI.eraseFromParent();
3107 return BB;
3108 }
3109
3110 case AMDGPU::GET_GROUPSTATICSIZE: {
3111 DebugLoc DL = MI.getDebugLoc();
3112 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
3113 .add(MI.getOperand(0))
3114 .addImm(MFI->getLDSSize());
3115 MI.eraseFromParent();
3116 return BB;
3117 }
3118 case AMDGPU::SI_INDIRECT_SRC_V1:
3119 case AMDGPU::SI_INDIRECT_SRC_V2:
3120 case AMDGPU::SI_INDIRECT_SRC_V4:
3121 case AMDGPU::SI_INDIRECT_SRC_V8:
3122 case AMDGPU::SI_INDIRECT_SRC_V16:
3123 return emitIndirectSrc(MI, *BB, *getSubtarget());
3124 case AMDGPU::SI_INDIRECT_DST_V1:
3125 case AMDGPU::SI_INDIRECT_DST_V2:
3126 case AMDGPU::SI_INDIRECT_DST_V4:
3127 case AMDGPU::SI_INDIRECT_DST_V8:
3128 case AMDGPU::SI_INDIRECT_DST_V16:
3129 return emitIndirectDst(MI, *BB, *getSubtarget());
3130 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3131 case AMDGPU::SI_KILL_I1_PSEUDO:
3132 return splitKillBlock(MI, BB);
3133 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3134 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3135
3136 unsigned Dst = MI.getOperand(0).getReg();
3137 unsigned Src0 = MI.getOperand(1).getReg();
3138 unsigned Src1 = MI.getOperand(2).getReg();
3139 const DebugLoc &DL = MI.getDebugLoc();
3140 unsigned SrcCond = MI.getOperand(3).getReg();
3141
3142 unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3143 unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3144 unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3145
3146 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3147 .addReg(SrcCond);
3148 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3149 .addReg(Src0, 0, AMDGPU::sub0)
3150 .addReg(Src1, 0, AMDGPU::sub0)
3151 .addReg(SrcCondCopy);
3152 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3153 .addReg(Src0, 0, AMDGPU::sub1)
3154 .addReg(Src1, 0, AMDGPU::sub1)
3155 .addReg(SrcCondCopy);
3156
3157 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3158 .addReg(DstLo)
3159 .addImm(AMDGPU::sub0)
3160 .addReg(DstHi)
3161 .addImm(AMDGPU::sub1);
3162 MI.eraseFromParent();
3163 return BB;
3164 }
3165 case AMDGPU::SI_BR_UNDEF: {
3166 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3167 const DebugLoc &DL = MI.getDebugLoc();
3168 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3169 .add(MI.getOperand(0));
3170 Br->getOperand(1).setIsUndef(true); // read undef SCC
3171 MI.eraseFromParent();
3172 return BB;
3173 }
3174 case AMDGPU::ADJCALLSTACKUP:
3175 case AMDGPU::ADJCALLSTACKDOWN: {
3176 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3177 MachineInstrBuilder MIB(*MF, &MI);
3178
3179 // Add an implicit use of the frame offset reg to prevent the restore copy
3180 // inserted after the call from being reordered after stack operations in
3181 // the caller's frame.
3182 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine) 3183 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit) 3184 .addReg(Info->getFrameOffsetReg(), RegState::Implicit); 3185 return BB; 3186 } 3187 case AMDGPU::SI_CALL_ISEL: 3188 case AMDGPU::SI_TCRETURN_ISEL: { 3189 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 3190 const DebugLoc &DL = MI.getDebugLoc(); 3191 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF); 3192 3193 MachineRegisterInfo &MRI = MF->getRegInfo(); 3194 unsigned GlobalAddrReg = MI.getOperand(0).getReg(); 3195 MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg); 3196 assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET); 3197 3198 const GlobalValue *G = PCRel->getOperand(1).getGlobal(); 3199 3200 MachineInstrBuilder MIB; 3201 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { 3202 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg) 3203 .add(MI.getOperand(0)) 3204 .addGlobalAddress(G); 3205 } else { 3206 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN)) 3207 .add(MI.getOperand(0)) 3208 .addGlobalAddress(G); 3209 3210 // There is an additional imm operand for tcreturn, but it should be in the 3211 // right place already. 3212 } 3213 3214 for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) 3215 MIB.add(MI.getOperand(I)); 3216 3217 MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); 3218 MI.eraseFromParent(); 3219 return BB; 3220 } 3221 default: 3222 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 3223 } 3224 } 3225 3226 bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const { 3227 return isTypeLegal(VT.getScalarType()); 3228 } 3229 3230 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { 3231 // This currently forces unfolding various combinations of fsub into fma with 3232 // free fneg'd operands. As long as we have fast FMA (controlled by 3233 // isFMAFasterThanFMulAndFAdd), we should perform these. 3234 3235 // When fma is quarter rate, for f64 where add / sub are at best half rate, 3236 // most of these combines appear to be cycle neutral but save on instruction 3237 // count / code size. 3238 return true; 3239 } 3240 3241 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, 3242 EVT VT) const { 3243 if (!VT.isVector()) { 3244 return MVT::i1; 3245 } 3246 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); 3247 } 3248 3249 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const { 3250 // TODO: Should i16 be used always if legal? For now it would force VALU 3251 // shifts. 3252 return (VT == MVT::i16) ? MVT::i16 : MVT::i32; 3253 } 3254 3255 // Answering this is somewhat tricky and depends on the specific device which 3256 // have different rates for fma or all f64 operations. 3257 // 3258 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other 3259 // regardless of which device (although the number of cycles differs between 3260 // devices), so it is always profitable for f64. 3261 // 3262 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable 3263 // only on full rate devices. Normally, we should prefer selecting v_mad_f32 3264 // which we can always do even without fused FP ops since it returns the same 3265 // result as the separate operations and since it is always full 3266 // rate. Therefore, we lie and report that it is not faster for f32. 
v_mad_f32 3267 // however does not support denormals, so we do report fma as faster if we have 3268 // a fast fma device and require denormals. 3269 // 3270 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 3271 VT = VT.getScalarType(); 3272 3273 switch (VT.getSimpleVT().SimpleTy) { 3274 case MVT::f32: 3275 // This is as fast on some subtargets. However, we always have full rate f32 3276 // mad available which returns the same result as the separate operations 3277 // which we should prefer over fma. We can't use this if we want to support 3278 // denormals, so only report this in these cases. 3279 return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); 3280 case MVT::f64: 3281 return true; 3282 case MVT::f16: 3283 return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals(); 3284 default: 3285 break; 3286 } 3287 3288 return false; 3289 } 3290 3291 //===----------------------------------------------------------------------===// 3292 // Custom DAG Lowering Operations 3293 //===----------------------------------------------------------------------===// 3294 3295 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 3296 switch (Op.getOpcode()) { 3297 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 3298 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 3299 case ISD::LOAD: { 3300 SDValue Result = LowerLOAD(Op, DAG); 3301 assert((!Result.getNode() || 3302 Result.getNode()->getNumValues() == 2) && 3303 "Load should return a value and a chain"); 3304 return Result; 3305 } 3306 3307 case ISD::FSIN: 3308 case ISD::FCOS: 3309 return LowerTrig(Op, DAG); 3310 case ISD::SELECT: return LowerSELECT(Op, DAG); 3311 case ISD::FDIV: return LowerFDIV(Op, DAG); 3312 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG); 3313 case ISD::STORE: return LowerSTORE(Op, DAG); 3314 case ISD::GlobalAddress: { 3315 MachineFunction &MF = DAG.getMachineFunction(); 3316 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 3317 return LowerGlobalAddress(MFI, Op, DAG); 3318 } 3319 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 3320 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); 3321 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); 3322 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); 3323 case ISD::INSERT_VECTOR_ELT: 3324 return lowerINSERT_VECTOR_ELT(Op, DAG); 3325 case ISD::EXTRACT_VECTOR_ELT: 3326 return lowerEXTRACT_VECTOR_ELT(Op, DAG); 3327 case ISD::FP_ROUND: 3328 return lowerFP_ROUND(Op, DAG); 3329 case ISD::TRAP: 3330 case ISD::DEBUGTRAP: 3331 return lowerTRAP(Op, DAG); 3332 } 3333 return SDValue(); 3334 } 3335 3336 static unsigned getImageOpcode(unsigned IID) { 3337 switch (IID) { 3338 case Intrinsic::amdgcn_image_load: 3339 return AMDGPUISD::IMAGE_LOAD; 3340 case Intrinsic::amdgcn_image_load_mip: 3341 return AMDGPUISD::IMAGE_LOAD_MIP; 3342 3343 // Basic sample. 
3344 case Intrinsic::amdgcn_image_sample: 3345 return AMDGPUISD::IMAGE_SAMPLE; 3346 case Intrinsic::amdgcn_image_sample_cl: 3347 return AMDGPUISD::IMAGE_SAMPLE_CL; 3348 case Intrinsic::amdgcn_image_sample_d: 3349 return AMDGPUISD::IMAGE_SAMPLE_D; 3350 case Intrinsic::amdgcn_image_sample_d_cl: 3351 return AMDGPUISD::IMAGE_SAMPLE_D_CL; 3352 case Intrinsic::amdgcn_image_sample_l: 3353 return AMDGPUISD::IMAGE_SAMPLE_L; 3354 case Intrinsic::amdgcn_image_sample_b: 3355 return AMDGPUISD::IMAGE_SAMPLE_B; 3356 case Intrinsic::amdgcn_image_sample_b_cl: 3357 return AMDGPUISD::IMAGE_SAMPLE_B_CL; 3358 case Intrinsic::amdgcn_image_sample_lz: 3359 return AMDGPUISD::IMAGE_SAMPLE_LZ; 3360 case Intrinsic::amdgcn_image_sample_cd: 3361 return AMDGPUISD::IMAGE_SAMPLE_CD; 3362 case Intrinsic::amdgcn_image_sample_cd_cl: 3363 return AMDGPUISD::IMAGE_SAMPLE_CD_CL; 3364 3365 // Sample with comparison. 3366 case Intrinsic::amdgcn_image_sample_c: 3367 return AMDGPUISD::IMAGE_SAMPLE_C; 3368 case Intrinsic::amdgcn_image_sample_c_cl: 3369 return AMDGPUISD::IMAGE_SAMPLE_C_CL; 3370 case Intrinsic::amdgcn_image_sample_c_d: 3371 return AMDGPUISD::IMAGE_SAMPLE_C_D; 3372 case Intrinsic::amdgcn_image_sample_c_d_cl: 3373 return AMDGPUISD::IMAGE_SAMPLE_C_D_CL; 3374 case Intrinsic::amdgcn_image_sample_c_l: 3375 return AMDGPUISD::IMAGE_SAMPLE_C_L; 3376 case Intrinsic::amdgcn_image_sample_c_b: 3377 return AMDGPUISD::IMAGE_SAMPLE_C_B; 3378 case Intrinsic::amdgcn_image_sample_c_b_cl: 3379 return AMDGPUISD::IMAGE_SAMPLE_C_B_CL; 3380 case Intrinsic::amdgcn_image_sample_c_lz: 3381 return AMDGPUISD::IMAGE_SAMPLE_C_LZ; 3382 case Intrinsic::amdgcn_image_sample_c_cd: 3383 return AMDGPUISD::IMAGE_SAMPLE_C_CD; 3384 case Intrinsic::amdgcn_image_sample_c_cd_cl: 3385 return AMDGPUISD::IMAGE_SAMPLE_C_CD_CL; 3386 3387 // Sample with offsets. 3388 case Intrinsic::amdgcn_image_sample_o: 3389 return AMDGPUISD::IMAGE_SAMPLE_O; 3390 case Intrinsic::amdgcn_image_sample_cl_o: 3391 return AMDGPUISD::IMAGE_SAMPLE_CL_O; 3392 case Intrinsic::amdgcn_image_sample_d_o: 3393 return AMDGPUISD::IMAGE_SAMPLE_D_O; 3394 case Intrinsic::amdgcn_image_sample_d_cl_o: 3395 return AMDGPUISD::IMAGE_SAMPLE_D_CL_O; 3396 case Intrinsic::amdgcn_image_sample_l_o: 3397 return AMDGPUISD::IMAGE_SAMPLE_L_O; 3398 case Intrinsic::amdgcn_image_sample_b_o: 3399 return AMDGPUISD::IMAGE_SAMPLE_B_O; 3400 case Intrinsic::amdgcn_image_sample_b_cl_o: 3401 return AMDGPUISD::IMAGE_SAMPLE_B_CL_O; 3402 case Intrinsic::amdgcn_image_sample_lz_o: 3403 return AMDGPUISD::IMAGE_SAMPLE_LZ_O; 3404 case Intrinsic::amdgcn_image_sample_cd_o: 3405 return AMDGPUISD::IMAGE_SAMPLE_CD_O; 3406 case Intrinsic::amdgcn_image_sample_cd_cl_o: 3407 return AMDGPUISD::IMAGE_SAMPLE_CD_CL_O; 3408 3409 // Sample with comparison and offsets. 
3410 case Intrinsic::amdgcn_image_sample_c_o: 3411 return AMDGPUISD::IMAGE_SAMPLE_C_O; 3412 case Intrinsic::amdgcn_image_sample_c_cl_o: 3413 return AMDGPUISD::IMAGE_SAMPLE_C_CL_O; 3414 case Intrinsic::amdgcn_image_sample_c_d_o: 3415 return AMDGPUISD::IMAGE_SAMPLE_C_D_O; 3416 case Intrinsic::amdgcn_image_sample_c_d_cl_o: 3417 return AMDGPUISD::IMAGE_SAMPLE_C_D_CL_O; 3418 case Intrinsic::amdgcn_image_sample_c_l_o: 3419 return AMDGPUISD::IMAGE_SAMPLE_C_L_O; 3420 case Intrinsic::amdgcn_image_sample_c_b_o: 3421 return AMDGPUISD::IMAGE_SAMPLE_C_B_O; 3422 case Intrinsic::amdgcn_image_sample_c_b_cl_o: 3423 return AMDGPUISD::IMAGE_SAMPLE_C_B_CL_O; 3424 case Intrinsic::amdgcn_image_sample_c_lz_o: 3425 return AMDGPUISD::IMAGE_SAMPLE_C_LZ_O; 3426 case Intrinsic::amdgcn_image_sample_c_cd_o: 3427 return AMDGPUISD::IMAGE_SAMPLE_C_CD_O; 3428 case Intrinsic::amdgcn_image_sample_c_cd_cl_o: 3429 return AMDGPUISD::IMAGE_SAMPLE_C_CD_CL_O; 3430 3431 // Basic gather4. 3432 case Intrinsic::amdgcn_image_gather4: 3433 return AMDGPUISD::IMAGE_GATHER4; 3434 case Intrinsic::amdgcn_image_gather4_cl: 3435 return AMDGPUISD::IMAGE_GATHER4_CL; 3436 case Intrinsic::amdgcn_image_gather4_l: 3437 return AMDGPUISD::IMAGE_GATHER4_L; 3438 case Intrinsic::amdgcn_image_gather4_b: 3439 return AMDGPUISD::IMAGE_GATHER4_B; 3440 case Intrinsic::amdgcn_image_gather4_b_cl: 3441 return AMDGPUISD::IMAGE_GATHER4_B_CL; 3442 case Intrinsic::amdgcn_image_gather4_lz: 3443 return AMDGPUISD::IMAGE_GATHER4_LZ; 3444 3445 // Gather4 with comparison. 3446 case Intrinsic::amdgcn_image_gather4_c: 3447 return AMDGPUISD::IMAGE_GATHER4_C; 3448 case Intrinsic::amdgcn_image_gather4_c_cl: 3449 return AMDGPUISD::IMAGE_GATHER4_C_CL; 3450 case Intrinsic::amdgcn_image_gather4_c_l: 3451 return AMDGPUISD::IMAGE_GATHER4_C_L; 3452 case Intrinsic::amdgcn_image_gather4_c_b: 3453 return AMDGPUISD::IMAGE_GATHER4_C_B; 3454 case Intrinsic::amdgcn_image_gather4_c_b_cl: 3455 return AMDGPUISD::IMAGE_GATHER4_C_B_CL; 3456 case Intrinsic::amdgcn_image_gather4_c_lz: 3457 return AMDGPUISD::IMAGE_GATHER4_C_LZ; 3458 3459 // Gather4 with offsets. 3460 case Intrinsic::amdgcn_image_gather4_o: 3461 return AMDGPUISD::IMAGE_GATHER4_O; 3462 case Intrinsic::amdgcn_image_gather4_cl_o: 3463 return AMDGPUISD::IMAGE_GATHER4_CL_O; 3464 case Intrinsic::amdgcn_image_gather4_l_o: 3465 return AMDGPUISD::IMAGE_GATHER4_L_O; 3466 case Intrinsic::amdgcn_image_gather4_b_o: 3467 return AMDGPUISD::IMAGE_GATHER4_B_O; 3468 case Intrinsic::amdgcn_image_gather4_b_cl_o: 3469 return AMDGPUISD::IMAGE_GATHER4_B_CL_O; 3470 case Intrinsic::amdgcn_image_gather4_lz_o: 3471 return AMDGPUISD::IMAGE_GATHER4_LZ_O; 3472 3473 // Gather4 with comparison and offsets. 3474 case Intrinsic::amdgcn_image_gather4_c_o: 3475 return AMDGPUISD::IMAGE_GATHER4_C_O; 3476 case Intrinsic::amdgcn_image_gather4_c_cl_o: 3477 return AMDGPUISD::IMAGE_GATHER4_C_CL_O; 3478 case Intrinsic::amdgcn_image_gather4_c_l_o: 3479 return AMDGPUISD::IMAGE_GATHER4_C_L_O; 3480 case Intrinsic::amdgcn_image_gather4_c_b_o: 3481 return AMDGPUISD::IMAGE_GATHER4_C_B_O; 3482 case Intrinsic::amdgcn_image_gather4_c_b_cl_o: 3483 return AMDGPUISD::IMAGE_GATHER4_C_B_CL_O; 3484 case Intrinsic::amdgcn_image_gather4_c_lz_o: 3485 return AMDGPUISD::IMAGE_GATHER4_C_LZ_O; 3486 3487 default: 3488 break; 3489 } 3490 return 0; 3491 } 3492 3493 static SDValue adjustLoadValueType(SDValue Result, EVT LoadVT, SDLoc DL, 3494 SelectionDAG &DAG, bool Unpacked) { 3495 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16. 3496 // Truncate to v2i16/v4i16. 
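// With unpacked D16 memory instructions each 16-bit element comes back in
// its own 32-bit register, so the load result is a v2i32/v4i32 that has to
// be narrowed element-wise before it can be reinterpreted as v2f16/v4f16.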
3497 EVT IntLoadVT = LoadVT.changeTypeToInteger(); 3498 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntLoadVT, Result); 3499 // Bitcast to original type (v2f16/v4f16). 3500 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc); 3501 } 3502 // Cast back to the original packed type. 3503 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result); 3504 } 3505 3506 // This is to lower INTRINSIC_W_CHAIN with illegal result types. 3507 SDValue SITargetLowering::lowerIntrinsicWChain_IllegalReturnType(SDValue Op, 3508 SDValue &Chain, SelectionDAG &DAG) const { 3509 EVT LoadVT = Op.getValueType(); 3510 // TODO: handle v3f16. 3511 if (LoadVT != MVT::v2f16 && LoadVT != MVT::v4f16) 3512 return SDValue(); 3513 3514 bool Unpacked = Subtarget->hasUnpackedD16VMem(); 3515 EVT UnpackedLoadVT = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32; 3516 EVT EquivLoadVT = Unpacked ? UnpackedLoadVT : 3517 getEquivalentMemType(*DAG.getContext(), LoadVT); 3518 // Change from v4f16/v2f16 to EquivLoadVT. 3519 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other); 3520 3521 SDValue Res; 3522 SDLoc DL(Op); 3523 MemSDNode *M = cast<MemSDNode>(Op); 3524 unsigned IID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 3525 switch (IID) { 3526 case Intrinsic::amdgcn_tbuffer_load: { 3527 SDValue Ops[] = { 3528 Op.getOperand(0), // Chain 3529 Op.getOperand(2), // rsrc 3530 Op.getOperand(3), // vindex 3531 Op.getOperand(4), // voffset 3532 Op.getOperand(5), // soffset 3533 Op.getOperand(6), // offset 3534 Op.getOperand(7), // dfmt 3535 Op.getOperand(8), // nfmt 3536 Op.getOperand(9), // glc 3537 Op.getOperand(10) // slc 3538 }; 3539 Res = DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, DL, 3540 VTList, Ops, M->getMemoryVT(), 3541 M->getMemOperand()); 3542 Chain = Res.getValue(1); 3543 return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); 3544 } 3545 case Intrinsic::amdgcn_buffer_load_format: { 3546 SDValue Ops[] = { 3547 Op.getOperand(0), // Chain 3548 Op.getOperand(2), // rsrc 3549 Op.getOperand(3), // vindex 3550 Op.getOperand(4), // offset 3551 Op.getOperand(5), // glc 3552 Op.getOperand(6) // slc 3553 }; 3554 Res = DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, 3555 DL, VTList, Ops, M->getMemoryVT(), 3556 M->getMemOperand()); 3557 Chain = Res.getValue(1); 3558 return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); 3559 } 3560 case Intrinsic::amdgcn_image_load: 3561 case Intrinsic::amdgcn_image_load_mip: { 3562 SDValue Ops[] = { 3563 Op.getOperand(0), // Chain 3564 Op.getOperand(2), // vaddr 3565 Op.getOperand(3), // rsrc 3566 Op.getOperand(4), // dmask 3567 Op.getOperand(5), // glc 3568 Op.getOperand(6), // slc 3569 Op.getOperand(7), // lwe 3570 Op.getOperand(8) // da 3571 }; 3572 unsigned Opc = getImageOpcode(IID); 3573 Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(), 3574 M->getMemOperand()); 3575 Chain = Res.getValue(1); 3576 return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); 3577 } 3578 // Basic sample. 3579 case Intrinsic::amdgcn_image_sample: 3580 case Intrinsic::amdgcn_image_sample_cl: 3581 case Intrinsic::amdgcn_image_sample_d: 3582 case Intrinsic::amdgcn_image_sample_d_cl: 3583 case Intrinsic::amdgcn_image_sample_l: 3584 case Intrinsic::amdgcn_image_sample_b: 3585 case Intrinsic::amdgcn_image_sample_b_cl: 3586 case Intrinsic::amdgcn_image_sample_lz: 3587 case Intrinsic::amdgcn_image_sample_cd: 3588 case Intrinsic::amdgcn_image_sample_cd_cl: 3589 3590 // Sample with comparison. 
3591 case Intrinsic::amdgcn_image_sample_c: 3592 case Intrinsic::amdgcn_image_sample_c_cl: 3593 case Intrinsic::amdgcn_image_sample_c_d: 3594 case Intrinsic::amdgcn_image_sample_c_d_cl: 3595 case Intrinsic::amdgcn_image_sample_c_l: 3596 case Intrinsic::amdgcn_image_sample_c_b: 3597 case Intrinsic::amdgcn_image_sample_c_b_cl: 3598 case Intrinsic::amdgcn_image_sample_c_lz: 3599 case Intrinsic::amdgcn_image_sample_c_cd: 3600 case Intrinsic::amdgcn_image_sample_c_cd_cl: 3601 3602 // Sample with offsets. 3603 case Intrinsic::amdgcn_image_sample_o: 3604 case Intrinsic::amdgcn_image_sample_cl_o: 3605 case Intrinsic::amdgcn_image_sample_d_o: 3606 case Intrinsic::amdgcn_image_sample_d_cl_o: 3607 case Intrinsic::amdgcn_image_sample_l_o: 3608 case Intrinsic::amdgcn_image_sample_b_o: 3609 case Intrinsic::amdgcn_image_sample_b_cl_o: 3610 case Intrinsic::amdgcn_image_sample_lz_o: 3611 case Intrinsic::amdgcn_image_sample_cd_o: 3612 case Intrinsic::amdgcn_image_sample_cd_cl_o: 3613 3614 // Sample with comparison and offsets. 3615 case Intrinsic::amdgcn_image_sample_c_o: 3616 case Intrinsic::amdgcn_image_sample_c_cl_o: 3617 case Intrinsic::amdgcn_image_sample_c_d_o: 3618 case Intrinsic::amdgcn_image_sample_c_d_cl_o: 3619 case Intrinsic::amdgcn_image_sample_c_l_o: 3620 case Intrinsic::amdgcn_image_sample_c_b_o: 3621 case Intrinsic::amdgcn_image_sample_c_b_cl_o: 3622 case Intrinsic::amdgcn_image_sample_c_lz_o: 3623 case Intrinsic::amdgcn_image_sample_c_cd_o: 3624 case Intrinsic::amdgcn_image_sample_c_cd_cl_o: 3625 3626 // Basic gather4 3627 case Intrinsic::amdgcn_image_gather4: 3628 case Intrinsic::amdgcn_image_gather4_cl: 3629 case Intrinsic::amdgcn_image_gather4_l: 3630 case Intrinsic::amdgcn_image_gather4_b: 3631 case Intrinsic::amdgcn_image_gather4_b_cl: 3632 case Intrinsic::amdgcn_image_gather4_lz: 3633 3634 // Gather4 with comparison 3635 case Intrinsic::amdgcn_image_gather4_c: 3636 case Intrinsic::amdgcn_image_gather4_c_cl: 3637 case Intrinsic::amdgcn_image_gather4_c_l: 3638 case Intrinsic::amdgcn_image_gather4_c_b: 3639 case Intrinsic::amdgcn_image_gather4_c_b_cl: 3640 case Intrinsic::amdgcn_image_gather4_c_lz: 3641 3642 // Gather4 with offsets 3643 case Intrinsic::amdgcn_image_gather4_o: 3644 case Intrinsic::amdgcn_image_gather4_cl_o: 3645 case Intrinsic::amdgcn_image_gather4_l_o: 3646 case Intrinsic::amdgcn_image_gather4_b_o: 3647 case Intrinsic::amdgcn_image_gather4_b_cl_o: 3648 case Intrinsic::amdgcn_image_gather4_lz_o: 3649 3650 // Gather4 with comparison and offsets 3651 case Intrinsic::amdgcn_image_gather4_c_o: 3652 case Intrinsic::amdgcn_image_gather4_c_cl_o: 3653 case Intrinsic::amdgcn_image_gather4_c_l_o: 3654 case Intrinsic::amdgcn_image_gather4_c_b_o: 3655 case Intrinsic::amdgcn_image_gather4_c_b_cl_o: 3656 case Intrinsic::amdgcn_image_gather4_c_lz_o: { 3657 SDValue Ops[] = { 3658 Op.getOperand(0), // Chain 3659 Op.getOperand(2), // vaddr 3660 Op.getOperand(3), // rsrc 3661 Op.getOperand(4), // sampler 3662 Op.getOperand(5), // dmask 3663 Op.getOperand(6), // unorm 3664 Op.getOperand(7), // glc 3665 Op.getOperand(8), // slc 3666 Op.getOperand(9), // lwe 3667 Op.getOperand(10) // da 3668 }; 3669 unsigned Opc = getImageOpcode(IID); 3670 Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(), 3671 M->getMemOperand()); 3672 Chain = Res.getValue(1); 3673 return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); 3674 } 3675 default: { 3676 const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr = 3677 AMDGPU::lookupD16ImageDimIntrinsicByIntr(IID); 3678 if (D16ImageDimIntr) { 3679 
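// Rebuild the call using the matching D16 helper intrinsic: every operand
// is copied unchanged except operand 1 (the intrinsic ID), so the regular
// INTRINSIC_W_CHAIN path ends up selecting the D16 variant.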
SmallVector<SDValue, 20> Ops; 3680 for (auto Value : Op.getNode()->op_values()) 3681 Ops.push_back(Value); 3682 Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32); 3683 Res = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTList, Ops, 3684 M->getMemoryVT(), M->getMemOperand()); 3685 Chain = Res.getValue(1); 3686 return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); 3687 } 3688 3689 return SDValue(); 3690 } 3691 } 3692 } 3693 3694 void SITargetLowering::ReplaceNodeResults(SDNode *N, 3695 SmallVectorImpl<SDValue> &Results, 3696 SelectionDAG &DAG) const { 3697 switch (N->getOpcode()) { 3698 case ISD::INSERT_VECTOR_ELT: { 3699 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG)) 3700 Results.push_back(Res); 3701 return; 3702 } 3703 case ISD::EXTRACT_VECTOR_ELT: { 3704 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG)) 3705 Results.push_back(Res); 3706 return; 3707 } 3708 case ISD::INTRINSIC_WO_CHAIN: { 3709 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 3710 switch (IID) { 3711 case Intrinsic::amdgcn_cvt_pkrtz: { 3712 SDValue Src0 = N->getOperand(1); 3713 SDValue Src1 = N->getOperand(2); 3714 SDLoc SL(N); 3715 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, 3716 Src0, Src1); 3717 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt)); 3718 return; 3719 } 3720 case Intrinsic::amdgcn_cvt_pknorm_i16: 3721 case Intrinsic::amdgcn_cvt_pknorm_u16: 3722 case Intrinsic::amdgcn_cvt_pk_i16: 3723 case Intrinsic::amdgcn_cvt_pk_u16: { 3724 SDValue Src0 = N->getOperand(1); 3725 SDValue Src1 = N->getOperand(2); 3726 SDLoc SL(N); 3727 unsigned Opcode; 3728 3729 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16) 3730 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; 3731 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16) 3732 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; 3733 else if (IID == Intrinsic::amdgcn_cvt_pk_i16) 3734 Opcode = AMDGPUISD::CVT_PK_I16_I32; 3735 else 3736 Opcode = AMDGPUISD::CVT_PK_U16_U32; 3737 3738 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1); 3739 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt)); 3740 return; 3741 } 3742 } 3743 break; 3744 } 3745 case ISD::INTRINSIC_W_CHAIN: { 3746 SDValue Chain; 3747 if (SDValue Res = lowerIntrinsicWChain_IllegalReturnType(SDValue(N, 0), 3748 Chain, DAG)) { 3749 Results.push_back(Res); 3750 Results.push_back(Chain); 3751 return; 3752 } 3753 break; 3754 } 3755 case ISD::SELECT: { 3756 SDLoc SL(N); 3757 EVT VT = N->getValueType(0); 3758 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); 3759 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1)); 3760 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2)); 3761 3762 EVT SelectVT = NewVT; 3763 if (NewVT.bitsLT(MVT::i32)) { 3764 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS); 3765 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS); 3766 SelectVT = MVT::i32; 3767 } 3768 3769 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT, 3770 N->getOperand(0), LHS, RHS); 3771 3772 if (NewVT != SelectVT) 3773 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect); 3774 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect)); 3775 return; 3776 } 3777 default: 3778 break; 3779 } 3780 } 3781 3782 /// \brief Helper function for LowerBRCOND 3783 static SDNode *findUser(SDValue Value, unsigned Opcode) { 3784 3785 SDNode *Parent = Value.getNode(); 3786 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); 3787 I != E; ++I) { 
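// A node may produce several result values; only look at uses of the
// specific value we were asked about before checking the user's opcode.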
3788 3789 if (I.getUse().get() != Value) 3790 continue; 3791 3792 if (I->getOpcode() == Opcode) 3793 return *I; 3794 } 3795 return nullptr; 3796 } 3797 3798 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { 3799 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) { 3800 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) { 3801 case Intrinsic::amdgcn_if: 3802 return AMDGPUISD::IF; 3803 case Intrinsic::amdgcn_else: 3804 return AMDGPUISD::ELSE; 3805 case Intrinsic::amdgcn_loop: 3806 return AMDGPUISD::LOOP; 3807 case Intrinsic::amdgcn_end_cf: 3808 llvm_unreachable("should not occur"); 3809 default: 3810 return 0; 3811 } 3812 } 3813 3814 // break, if_break, else_break are all only used as inputs to loop, not 3815 // directly as branch conditions. 3816 return 0; 3817 } 3818 3819 void SITargetLowering::createDebuggerPrologueStackObjects( 3820 MachineFunction &MF) const { 3821 // Create stack objects that are used for emitting debugger prologue. 3822 // 3823 // Debugger prologue writes work group IDs and work item IDs to scratch memory 3824 // at fixed location in the following format: 3825 // offset 0: work group ID x 3826 // offset 4: work group ID y 3827 // offset 8: work group ID z 3828 // offset 16: work item ID x 3829 // offset 20: work item ID y 3830 // offset 24: work item ID z 3831 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 3832 int ObjectIdx = 0; 3833 3834 // For each dimension: 3835 for (unsigned i = 0; i < 3; ++i) { 3836 // Create fixed stack object for work group ID. 3837 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true); 3838 Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx); 3839 // Create fixed stack object for work item ID. 3840 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true); 3841 Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx); 3842 } 3843 } 3844 3845 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { 3846 const Triple &TT = getTargetMachine().getTargetTriple(); 3847 return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || 3848 GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && 3849 AMDGPU::shouldEmitConstantsToTextSection(TT); 3850 } 3851 3852 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { 3853 return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || 3854 GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || 3855 GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && 3856 !shouldEmitFixup(GV) && 3857 !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); 3858 } 3859 3860 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const { 3861 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV); 3862 } 3863 3864 /// This transforms the control flow intrinsics to get the branch destination as 3865 /// last parameter, also switches branch target with BR if the need arise 3866 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, 3867 SelectionDAG &DAG) const { 3868 SDLoc DL(BRCOND); 3869 3870 SDNode *Intr = BRCOND.getOperand(1).getNode(); 3871 SDValue Target = BRCOND.getOperand(2); 3872 SDNode *BR = nullptr; 3873 SDNode *SetCC = nullptr; 3874 3875 if (Intr->getOpcode() == ISD::SETCC) { 3876 // As long as we negate the condition everything is fine 3877 SetCC = Intr; 3878 Intr = SetCC->getOperand(0).getNode(); 3879 3880 } else { 3881 // Get the target from BR if we don't negate the condition 3882 BR = findUser(BRCOND, ISD::BR); 3883 
Target = BR->getOperand(1); 3884 } 3885 3886 // FIXME: This changes the types of the intrinsics instead of introducing new 3887 // nodes with the correct types. 3888 // e.g. llvm.amdgcn.loop 3889 3890 // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3 3891 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088> 3892 3893 unsigned CFNode = isCFIntrinsic(Intr); 3894 if (CFNode == 0) { 3895 // This is a uniform branch so we don't need to legalize. 3896 return BRCOND; 3897 } 3898 3899 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID || 3900 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN; 3901 3902 assert(!SetCC || 3903 (SetCC->getConstantOperandVal(1) == 1 && 3904 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == 3905 ISD::SETNE)); 3906 3907 // operands of the new intrinsic call 3908 SmallVector<SDValue, 4> Ops; 3909 if (HaveChain) 3910 Ops.push_back(BRCOND.getOperand(0)); 3911 3912 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end()); 3913 Ops.push_back(Target); 3914 3915 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); 3916 3917 // build the new intrinsic call 3918 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode(); 3919 3920 if (!HaveChain) { 3921 SDValue Ops[] = { 3922 SDValue(Result, 0), 3923 BRCOND.getOperand(0) 3924 }; 3925 3926 Result = DAG.getMergeValues(Ops, DL).getNode(); 3927 } 3928 3929 if (BR) { 3930 // Give the branch instruction our target 3931 SDValue Ops[] = { 3932 BR->getOperand(0), 3933 BRCOND.getOperand(2) 3934 }; 3935 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); 3936 DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); 3937 BR = NewBR.getNode(); 3938 } 3939 3940 SDValue Chain = SDValue(Result, Result->getNumValues() - 1); 3941 3942 // Copy the intrinsic results to registers 3943 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { 3944 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); 3945 if (!CopyToReg) 3946 continue; 3947 3948 Chain = DAG.getCopyToReg( 3949 Chain, DL, 3950 CopyToReg->getOperand(1), 3951 SDValue(Result, i - 1), 3952 SDValue()); 3953 3954 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); 3955 } 3956 3957 // Remove the old intrinsic from the chain 3958 DAG.ReplaceAllUsesOfValueWith( 3959 SDValue(Intr, Intr->getNumValues() - 1), 3960 Intr->getOperand(0)); 3961 3962 return Chain; 3963 } 3964 3965 SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG, 3966 SDValue Op, 3967 const SDLoc &DL, 3968 EVT VT) const { 3969 return Op.getValueType().bitsLE(VT) ? 3970 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) : 3971 DAG.getNode(ISD::FTRUNC, DL, VT, Op); 3972 } 3973 3974 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 3975 assert(Op.getValueType() == MVT::f16 && 3976 "Do not know how to custom lower FP_ROUND for non-f16 type"); 3977 3978 SDValue Src = Op.getOperand(0); 3979 EVT SrcVT = Src.getValueType(); 3980 if (SrcVT != MVT::f64) 3981 return Op; 3982 3983 SDLoc DL(Op); 3984 3985 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src); 3986 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); 3987 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); 3988 } 3989 3990 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { 3991 SDLoc SL(Op); 3992 MachineFunction &MF = DAG.getMachineFunction(); 3993 SDValue Chain = Op.getOperand(0); 3994 3995 unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ? 
3996 SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap; 3997 3998 if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa && 3999 Subtarget->isTrapHandlerEnabled()) { 4000 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 4001 unsigned UserSGPR = Info->getQueuePtrUserSGPR(); 4002 assert(UserSGPR != AMDGPU::NoRegister); 4003 4004 SDValue QueuePtr = CreateLiveInRegister( 4005 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); 4006 4007 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); 4008 4009 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, 4010 QueuePtr, SDValue()); 4011 4012 SDValue Ops[] = { 4013 ToReg, 4014 DAG.getTargetConstant(TrapID, SL, MVT::i16), 4015 SGPR01, 4016 ToReg.getValue(1) 4017 }; 4018 4019 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); 4020 } 4021 4022 switch (TrapID) { 4023 case SISubtarget::TrapIDLLVMTrap: 4024 return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); 4025 case SISubtarget::TrapIDLLVMDebugTrap: { 4026 DiagnosticInfoUnsupported NoTrap(MF.getFunction(), 4027 "debugtrap handler not supported", 4028 Op.getDebugLoc(), 4029 DS_Warning); 4030 LLVMContext &Ctx = MF.getFunction().getContext(); 4031 Ctx.diagnose(NoTrap); 4032 return Chain; 4033 } 4034 default: 4035 llvm_unreachable("unsupported trap handler type!"); 4036 } 4037 4038 return Chain; 4039 } 4040 4041 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, 4042 SelectionDAG &DAG) const { 4043 // FIXME: Use inline constants (src_{shared, private}_base) instead. 4044 if (Subtarget->hasApertureRegs()) { 4045 unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ? 4046 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 4047 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 4048 unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ? 4049 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 4050 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 4051 unsigned Encoding = 4052 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 4053 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 4054 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 4055 4056 SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16); 4057 SDValue ApertureReg = SDValue( 4058 DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0); 4059 SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32); 4060 return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount); 4061 } 4062 4063 MachineFunction &MF = DAG.getMachineFunction(); 4064 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 4065 unsigned UserSGPR = Info->getQueuePtrUserSGPR(); 4066 assert(UserSGPR != AMDGPU::NoRegister); 4067 4068 SDValue QueuePtr = CreateLiveInRegister( 4069 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); 4070 4071 // Offset into amd_queue_t for group_segment_aperture_base_hi / 4072 // private_segment_aperture_base_hi. 4073 uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44; 4074 4075 SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset); 4076 4077 // TODO: Use custom target PseudoSourceValue. 4078 // TODO: We should use the value from the IR intrinsic call, but it might not 4079 // be available and how do we get it? 
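// For now, an undef i8 pointer in the constant address space is used purely
// to give the load below a MachinePointerInfo; the dereferenceable and
// invariant flags are what let the aperture load be freely moved and reused.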
4080 Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()), 4081 AMDGPUASI.CONSTANT_ADDRESS)); 4082 4083 MachinePointerInfo PtrInfo(V, StructOffset); 4084 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo, 4085 MinAlign(64, StructOffset), 4086 MachineMemOperand::MODereferenceable | 4087 MachineMemOperand::MOInvariant); 4088 } 4089 4090 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, 4091 SelectionDAG &DAG) const { 4092 SDLoc SL(Op); 4093 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op); 4094 4095 SDValue Src = ASC->getOperand(0); 4096 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); 4097 4098 const AMDGPUTargetMachine &TM = 4099 static_cast<const AMDGPUTargetMachine &>(getTargetMachine()); 4100 4101 // flat -> local/private 4102 if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) { 4103 unsigned DestAS = ASC->getDestAddressSpace(); 4104 4105 if (DestAS == AMDGPUASI.LOCAL_ADDRESS || 4106 DestAS == AMDGPUASI.PRIVATE_ADDRESS) { 4107 unsigned NullVal = TM.getNullPointerValue(DestAS); 4108 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); 4109 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); 4110 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); 4111 4112 return DAG.getNode(ISD::SELECT, SL, MVT::i32, 4113 NonNull, Ptr, SegmentNullPtr); 4114 } 4115 } 4116 4117 // local/private -> flat 4118 if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) { 4119 unsigned SrcAS = ASC->getSrcAddressSpace(); 4120 4121 if (SrcAS == AMDGPUASI.LOCAL_ADDRESS || 4122 SrcAS == AMDGPUASI.PRIVATE_ADDRESS) { 4123 unsigned NullVal = TM.getNullPointerValue(SrcAS); 4124 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); 4125 4126 SDValue NonNull 4127 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); 4128 4129 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG); 4130 SDValue CvtPtr 4131 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); 4132 4133 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, 4134 DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr), 4135 FlatNullPtr); 4136 } 4137 } 4138 4139 // global <-> flat are no-ops and never emitted. 4140 4141 const MachineFunction &MF = DAG.getMachineFunction(); 4142 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 4143 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); 4144 DAG.getContext()->diagnose(InvalidAddrSpaceCast); 4145 4146 return DAG.getUNDEF(ASC->getValueType(0)); 4147 } 4148 4149 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, 4150 SelectionDAG &DAG) const { 4151 SDValue Idx = Op.getOperand(2); 4152 if (isa<ConstantSDNode>(Idx)) 4153 return SDValue(); 4154 4155 // Avoid stack access for dynamic indexing. 4156 SDLoc SL(Op); 4157 SDValue Vec = Op.getOperand(0); 4158 SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1)); 4159 4160 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec 4161 SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val); 4162 4163 // Convert vector index to bit-index. 
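// The mask and AND/OR sequence that follow implement the BFI pattern
// (mask & new) | (~mask & old), where the mask covers the 16-bit lane
// being written.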
4164 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
4165 DAG.getConstant(16, SL, MVT::i32));
4166
4167 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4168
4169 SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
4170 DAG.getConstant(0xffff, SL, MVT::i32),
4171 ScaledIdx);
4172
4173 SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal);
4174 SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
4175 DAG.getNOT(SL, BFM, MVT::i32), BCVec);
4176
4177 SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
4178 return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
4179 }
4180
4181 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4182 SelectionDAG &DAG) const {
4183 SDLoc SL(Op);
4184
4185 EVT ResultVT = Op.getValueType();
4186 SDValue Vec = Op.getOperand(0);
4187 SDValue Idx = Op.getOperand(1);
4188
4189 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4190
4191 // Make sure we do any optimizations that will make it easier to fold
4192 // source modifiers before obscuring it with bit operations.
4193
4194 // XXX - Why doesn't this get called when vector_shuffle is expanded?
4195 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4196 return Combined;
4197
4198 if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
4199 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4200
4201 if (CIdx->getZExtValue() == 1) {
4202 Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
4203 DAG.getConstant(16, SL, MVT::i32));
4204 } else {
4205 assert(CIdx->getZExtValue() == 0);
4206 }
4207
4208 if (ResultVT.bitsLT(MVT::i32))
4209 Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
4210 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4211 }
4212
4213 SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32);
4214
4215 // Convert vector index to bit-index.
4216 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen);
4217
4218 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4219 SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);
4220
4221 SDValue Result = Elt;
4222 if (ResultVT.bitsLT(MVT::i32))
4223 Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
4224
4225 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4226 }
4227
4228 bool
4229 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
4230 // We can fold offsets for anything that doesn't require a GOT relocation.
4231 return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
4232 GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
4233 GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
4234 !shouldEmitGOTReloc(GA->getGlobal());
4235 }
4236
4237 static SDValue
4238 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
4239 const SDLoc &DL, unsigned Offset, EVT PtrVT,
4240 unsigned GAFlags = SIInstrInfo::MO_NONE) {
4241 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
4242 // lowered to the following code sequence:
4243 //
4244 // For constant address space:
4245 // s_getpc_b64 s[0:1]
4246 // s_add_u32 s0, s0, $symbol
4247 // s_addc_u32 s1, s1, 0
4248 //
4249 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4250 // a fixup or relocation is emitted to replace $symbol with a literal
4251 // constant, which is a pc-relative offset from the encoding of the $symbol
4252 // operand to the global variable.
4253 // 4254 // For global address space: 4255 // s_getpc_b64 s[0:1] 4256 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 4257 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 4258 // 4259 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 4260 // fixups or relocations are emitted to replace $symbol@*@lo and 4261 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 4262 // which is a 64-bit pc-relative offset from the encoding of the $symbol 4263 // operand to the global variable. 4264 // 4265 // What we want here is an offset from the value returned by s_getpc 4266 // (which is the address of the s_add_u32 instruction) to the global 4267 // variable, but since the encoding of $symbol starts 4 bytes after the start 4268 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 4269 // small. This requires us to add 4 to the global variable offset in order to 4270 // compute the correct address. 4271 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, 4272 GAFlags); 4273 SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, 4274 GAFlags == SIInstrInfo::MO_NONE ? 4275 GAFlags : GAFlags + 1); 4276 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi); 4277 } 4278 4279 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, 4280 SDValue Op, 4281 SelectionDAG &DAG) const { 4282 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); 4283 const GlobalValue *GV = GSD->getGlobal(); 4284 4285 if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS && 4286 GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT && 4287 GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS && 4288 // FIXME: It isn't correct to rely on the type of the pointer. This should 4289 // be removed when address space 0 is 64-bit. 4290 !GV->getType()->getElementType()->isFunctionTy()) 4291 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); 4292 4293 SDLoc DL(GSD); 4294 EVT PtrVT = Op.getValueType(); 4295 4296 if (shouldEmitFixup(GV)) 4297 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); 4298 else if (shouldEmitPCReloc(GV)) 4299 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT, 4300 SIInstrInfo::MO_REL32); 4301 4302 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT, 4303 SIInstrInfo::MO_GOTPCREL32); 4304 4305 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext()); 4306 PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS); 4307 const DataLayout &DataLayout = DAG.getDataLayout(); 4308 unsigned Align = DataLayout.getABITypeAlignment(PtrTy); 4309 // FIXME: Use a PseudoSourceValue once those can be assigned an address space. 4310 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); 4311 4312 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align, 4313 MachineMemOperand::MODereferenceable | 4314 MachineMemOperand::MOInvariant); 4315 } 4316 4317 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, 4318 const SDLoc &DL, SDValue V) const { 4319 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as 4320 // the destination register. 4321 // 4322 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, 4323 // so we will end up with redundant moves to m0. 4324 // 4325 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result. 4326 4327 // A Null SDValue creates a glue result. 
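// (SI_INIT_M0 is expanded to 's_mov_b32 m0, <src>' by
// EmitInstrWithCustomInserter above.)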
4328 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue, 4329 V, Chain); 4330 return SDValue(M0, 0); 4331 } 4332 4333 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, 4334 SDValue Op, 4335 MVT VT, 4336 unsigned Offset) const { 4337 SDLoc SL(Op); 4338 SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL, 4339 DAG.getEntryNode(), Offset, false); 4340 // The local size values will have the hi 16-bits as zero. 4341 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, 4342 DAG.getValueType(VT)); 4343 } 4344 4345 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, 4346 EVT VT) { 4347 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(), 4348 "non-hsa intrinsic with hsa target", 4349 DL.getDebugLoc()); 4350 DAG.getContext()->diagnose(BadIntrin); 4351 return DAG.getUNDEF(VT); 4352 } 4353 4354 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, 4355 EVT VT) { 4356 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(), 4357 "intrinsic not supported on subtarget", 4358 DL.getDebugLoc()); 4359 DAG.getContext()->diagnose(BadIntrin); 4360 return DAG.getUNDEF(VT); 4361 } 4362 4363 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 4364 SelectionDAG &DAG) const { 4365 MachineFunction &MF = DAG.getMachineFunction(); 4366 auto MFI = MF.getInfo<SIMachineFunctionInfo>(); 4367 4368 EVT VT = Op.getValueType(); 4369 SDLoc DL(Op); 4370 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4371 4372 // TODO: Should this propagate fast-math-flags? 4373 4374 switch (IntrinsicID) { 4375 case Intrinsic::amdgcn_implicit_buffer_ptr: { 4376 if (getSubtarget()->isAmdCodeObjectV2(MF)) 4377 return emitNonHSAIntrinsicError(DAG, DL, VT); 4378 return getPreloadedValue(DAG, *MFI, VT, 4379 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4380 } 4381 case Intrinsic::amdgcn_dispatch_ptr: 4382 case Intrinsic::amdgcn_queue_ptr: { 4383 if (!Subtarget->isAmdCodeObjectV2(MF)) { 4384 DiagnosticInfoUnsupported BadIntrin( 4385 MF.getFunction(), "unsupported hsa intrinsic without hsa target", 4386 DL.getDebugLoc()); 4387 DAG.getContext()->diagnose(BadIntrin); 4388 return DAG.getUNDEF(VT); 4389 } 4390 4391 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? 
4392 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR; 4393 return getPreloadedValue(DAG, *MFI, VT, RegID); 4394 } 4395 case Intrinsic::amdgcn_implicitarg_ptr: { 4396 if (MFI->isEntryFunction()) 4397 return getImplicitArgPtr(DAG, DL); 4398 return getPreloadedValue(DAG, *MFI, VT, 4399 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 4400 } 4401 case Intrinsic::amdgcn_kernarg_segment_ptr: { 4402 return getPreloadedValue(DAG, *MFI, VT, 4403 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4404 } 4405 case Intrinsic::amdgcn_dispatch_id: { 4406 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID); 4407 } 4408 case Intrinsic::amdgcn_rcp: 4409 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); 4410 case Intrinsic::amdgcn_rsq: 4411 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 4412 case Intrinsic::amdgcn_rsq_legacy: 4413 if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 4414 return emitRemovedIntrinsicError(DAG, DL, VT); 4415 4416 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); 4417 case Intrinsic::amdgcn_rcp_legacy: 4418 if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) 4419 return emitRemovedIntrinsicError(DAG, DL, VT); 4420 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1)); 4421 case Intrinsic::amdgcn_rsq_clamp: { 4422 if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) 4423 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); 4424 4425 Type *Type = VT.getTypeForEVT(*DAG.getContext()); 4426 APFloat Max = APFloat::getLargest(Type->getFltSemantics()); 4427 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); 4428 4429 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 4430 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, 4431 DAG.getConstantFP(Max, DL, VT)); 4432 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, 4433 DAG.getConstantFP(Min, DL, VT)); 4434 } 4435 case Intrinsic::r600_read_ngroups_x: 4436 if (Subtarget->isAmdHsaOS()) 4437 return emitNonHSAIntrinsicError(DAG, DL, VT); 4438 4439 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4440 SI::KernelInputOffsets::NGROUPS_X, false); 4441 case Intrinsic::r600_read_ngroups_y: 4442 if (Subtarget->isAmdHsaOS()) 4443 return emitNonHSAIntrinsicError(DAG, DL, VT); 4444 4445 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4446 SI::KernelInputOffsets::NGROUPS_Y, false); 4447 case Intrinsic::r600_read_ngroups_z: 4448 if (Subtarget->isAmdHsaOS()) 4449 return emitNonHSAIntrinsicError(DAG, DL, VT); 4450 4451 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4452 SI::KernelInputOffsets::NGROUPS_Z, false); 4453 case Intrinsic::r600_read_global_size_x: 4454 if (Subtarget->isAmdHsaOS()) 4455 return emitNonHSAIntrinsicError(DAG, DL, VT); 4456 4457 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4458 SI::KernelInputOffsets::GLOBAL_SIZE_X, false); 4459 case Intrinsic::r600_read_global_size_y: 4460 if (Subtarget->isAmdHsaOS()) 4461 return emitNonHSAIntrinsicError(DAG, DL, VT); 4462 4463 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4464 SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); 4465 case Intrinsic::r600_read_global_size_z: 4466 if (Subtarget->isAmdHsaOS()) 4467 return emitNonHSAIntrinsicError(DAG, DL, VT); 4468 4469 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4470 SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); 4471 case 
Intrinsic::r600_read_local_size_x: 4472 if (Subtarget->isAmdHsaOS()) 4473 return emitNonHSAIntrinsicError(DAG, DL, VT); 4474 4475 return lowerImplicitZextParam(DAG, Op, MVT::i16, 4476 SI::KernelInputOffsets::LOCAL_SIZE_X); 4477 case Intrinsic::r600_read_local_size_y: 4478 if (Subtarget->isAmdHsaOS()) 4479 return emitNonHSAIntrinsicError(DAG, DL, VT); 4480 4481 return lowerImplicitZextParam(DAG, Op, MVT::i16, 4482 SI::KernelInputOffsets::LOCAL_SIZE_Y); 4483 case Intrinsic::r600_read_local_size_z: 4484 if (Subtarget->isAmdHsaOS()) 4485 return emitNonHSAIntrinsicError(DAG, DL, VT); 4486 4487 return lowerImplicitZextParam(DAG, Op, MVT::i16, 4488 SI::KernelInputOffsets::LOCAL_SIZE_Z); 4489 case Intrinsic::amdgcn_workgroup_id_x: 4490 case Intrinsic::r600_read_tgid_x: 4491 return getPreloadedValue(DAG, *MFI, VT, 4492 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4493 case Intrinsic::amdgcn_workgroup_id_y: 4494 case Intrinsic::r600_read_tgid_y: 4495 return getPreloadedValue(DAG, *MFI, VT, 4496 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4497 case Intrinsic::amdgcn_workgroup_id_z: 4498 case Intrinsic::r600_read_tgid_z: 4499 return getPreloadedValue(DAG, *MFI, VT, 4500 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4501 case Intrinsic::amdgcn_workitem_id_x: { 4502 case Intrinsic::r600_read_tidig_x: 4503 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, 4504 SDLoc(DAG.getEntryNode()), 4505 MFI->getArgInfo().WorkItemIDX); 4506 } 4507 case Intrinsic::amdgcn_workitem_id_y: 4508 case Intrinsic::r600_read_tidig_y: 4509 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, 4510 SDLoc(DAG.getEntryNode()), 4511 MFI->getArgInfo().WorkItemIDY); 4512 case Intrinsic::amdgcn_workitem_id_z: 4513 case Intrinsic::r600_read_tidig_z: 4514 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, 4515 SDLoc(DAG.getEntryNode()), 4516 MFI->getArgInfo().WorkItemIDZ); 4517 case AMDGPUIntrinsic::SI_load_const: { 4518 SDValue Ops[] = { 4519 Op.getOperand(1), 4520 Op.getOperand(2) 4521 }; 4522 4523 MachineMemOperand *MMO = MF.getMachineMemOperand( 4524 MachinePointerInfo(), 4525 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4526 MachineMemOperand::MOInvariant, 4527 VT.getStoreSize(), 4); 4528 return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, 4529 Op->getVTList(), Ops, VT, MMO); 4530 } 4531 case Intrinsic::amdgcn_fdiv_fast: 4532 return lowerFDIV_FAST(Op, DAG); 4533 case Intrinsic::amdgcn_interp_mov: { 4534 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); 4535 SDValue Glue = M0.getValue(1); 4536 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1), 4537 Op.getOperand(2), Op.getOperand(3), Glue); 4538 } 4539 case Intrinsic::amdgcn_interp_p1: { 4540 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); 4541 SDValue Glue = M0.getValue(1); 4542 return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), 4543 Op.getOperand(2), Op.getOperand(3), Glue); 4544 } 4545 case Intrinsic::amdgcn_interp_p2: { 4546 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); 4547 SDValue Glue = SDValue(M0.getNode(), 1); 4548 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), 4549 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), 4550 Glue); 4551 } 4552 case Intrinsic::amdgcn_sin: 4553 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); 4554 4555 case Intrinsic::amdgcn_cos: 4556 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); 4557 4558 case Intrinsic::amdgcn_log_clamp: { 
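// Before VI the intrinsic is left for normal selection; on VI and newer the
// clamped log instruction no longer exists, so diagnose it as unsupported
// and fold the result to undef.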
4559 if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) 4560 return SDValue(); 4561 4562 DiagnosticInfoUnsupported BadIntrin( 4563 MF.getFunction(), "intrinsic not supported on subtarget", 4564 DL.getDebugLoc()); 4565 DAG.getContext()->diagnose(BadIntrin); 4566 return DAG.getUNDEF(VT); 4567 } 4568 case Intrinsic::amdgcn_ldexp: 4569 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, 4570 Op.getOperand(1), Op.getOperand(2)); 4571 4572 case Intrinsic::amdgcn_fract: 4573 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); 4574 4575 case Intrinsic::amdgcn_class: 4576 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, 4577 Op.getOperand(1), Op.getOperand(2)); 4578 case Intrinsic::amdgcn_div_fmas: 4579 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, 4580 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), 4581 Op.getOperand(4)); 4582 4583 case Intrinsic::amdgcn_div_fixup: 4584 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, 4585 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 4586 4587 case Intrinsic::amdgcn_trig_preop: 4588 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, 4589 Op.getOperand(1), Op.getOperand(2)); 4590 case Intrinsic::amdgcn_div_scale: { 4591 // 3rd parameter required to be a constant. 4592 const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 4593 if (!Param) 4594 return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL); 4595 4596 // Translate to the operands expected by the machine instruction. The 4597 // first parameter must be the same as the first instruction. 4598 SDValue Numerator = Op.getOperand(1); 4599 SDValue Denominator = Op.getOperand(2); 4600 4601 // Note this order is opposite of the machine instruction's operations, 4602 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The 4603 // intrinsic has the numerator as the first operand to match a normal 4604 // division operation. 4605 4606 SDValue Src0 = Param->isAllOnesValue() ? 
Numerator : Denominator; 4607 4608 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, 4609 Denominator, Numerator); 4610 } 4611 case Intrinsic::amdgcn_icmp: { 4612 const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 4613 if (!CD) 4614 return DAG.getUNDEF(VT); 4615 4616 int CondCode = CD->getSExtValue(); 4617 if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE || 4618 CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE) 4619 return DAG.getUNDEF(VT); 4620 4621 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); 4622 ISD::CondCode CCOpcode = getICmpCondCode(IcInput); 4623 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1), 4624 Op.getOperand(2), DAG.getCondCode(CCOpcode)); 4625 } 4626 case Intrinsic::amdgcn_fcmp: { 4627 const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 4628 if (!CD) 4629 return DAG.getUNDEF(VT); 4630 4631 int CondCode = CD->getSExtValue(); 4632 if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE || 4633 CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) 4634 return DAG.getUNDEF(VT); 4635 4636 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode); 4637 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput); 4638 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1), 4639 Op.getOperand(2), DAG.getCondCode(CCOpcode)); 4640 } 4641 case Intrinsic::amdgcn_fmed3: 4642 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, 4643 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 4644 case Intrinsic::amdgcn_fmul_legacy: 4645 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, 4646 Op.getOperand(1), Op.getOperand(2)); 4647 case Intrinsic::amdgcn_sffbh: 4648 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1)); 4649 case Intrinsic::amdgcn_sbfe: 4650 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, 4651 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 4652 case Intrinsic::amdgcn_ubfe: 4653 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, 4654 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 4655 case Intrinsic::amdgcn_cvt_pkrtz: 4656 case Intrinsic::amdgcn_cvt_pknorm_i16: 4657 case Intrinsic::amdgcn_cvt_pknorm_u16: 4658 case Intrinsic::amdgcn_cvt_pk_i16: 4659 case Intrinsic::amdgcn_cvt_pk_u16: { 4660 // FIXME: Stop adding cast if v2f16/v2i16 are legal. 4661 EVT VT = Op.getValueType(); 4662 unsigned Opcode; 4663 4664 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz) 4665 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32; 4666 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16) 4667 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; 4668 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16) 4669 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; 4670 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16) 4671 Opcode = AMDGPUISD::CVT_PK_I16_I32; 4672 else 4673 Opcode = AMDGPUISD::CVT_PK_U16_U32; 4674 4675 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32, 4676 Op.getOperand(1), Op.getOperand(2)); 4677 return DAG.getNode(ISD::BITCAST, DL, VT, Node); 4678 } 4679 case Intrinsic::amdgcn_wqm: { 4680 SDValue Src = Op.getOperand(1); 4681 return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src), 4682 0); 4683 } 4684 case Intrinsic::amdgcn_wwm: { 4685 SDValue Src = Op.getOperand(1); 4686 return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src), 4687 0); 4688 } 4689 case Intrinsic::amdgcn_image_getlod: 4690 case Intrinsic::amdgcn_image_getresinfo: { 4691 unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 
3 : 4; 4692 4693 // Replace dmask with everything disabled with undef. 4694 const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx)); 4695 if (!DMask || DMask->isNullValue()) 4696 return DAG.getUNDEF(Op.getValueType()); 4697 return SDValue(); 4698 } 4699 default: 4700 return Op; 4701 } 4702 } 4703 4704 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, 4705 SelectionDAG &DAG) const { 4706 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4707 SDLoc DL(Op); 4708 4709 switch (IntrID) { 4710 case Intrinsic::amdgcn_atomic_inc: 4711 case Intrinsic::amdgcn_atomic_dec: 4712 case Intrinsic::amdgcn_ds_fadd: 4713 case Intrinsic::amdgcn_ds_fmin: 4714 case Intrinsic::amdgcn_ds_fmax: { 4715 MemSDNode *M = cast<MemSDNode>(Op); 4716 unsigned Opc; 4717 switch (IntrID) { 4718 case Intrinsic::amdgcn_atomic_inc: 4719 Opc = AMDGPUISD::ATOMIC_INC; 4720 break; 4721 case Intrinsic::amdgcn_atomic_dec: 4722 Opc = AMDGPUISD::ATOMIC_DEC; 4723 break; 4724 case Intrinsic::amdgcn_ds_fadd: 4725 Opc = AMDGPUISD::ATOMIC_LOAD_FADD; 4726 break; 4727 case Intrinsic::amdgcn_ds_fmin: 4728 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN; 4729 break; 4730 case Intrinsic::amdgcn_ds_fmax: 4731 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX; 4732 break; 4733 default: 4734 llvm_unreachable("Unknown intrinsic!"); 4735 } 4736 SDValue Ops[] = { 4737 M->getOperand(0), // Chain 4738 M->getOperand(2), // Ptr 4739 M->getOperand(3) // Value 4740 }; 4741 4742 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops, 4743 M->getMemoryVT(), M->getMemOperand()); 4744 } 4745 case Intrinsic::amdgcn_buffer_load: 4746 case Intrinsic::amdgcn_buffer_load_format: { 4747 SDValue Ops[] = { 4748 Op.getOperand(0), // Chain 4749 Op.getOperand(2), // rsrc 4750 Op.getOperand(3), // vindex 4751 Op.getOperand(4), // offset 4752 Op.getOperand(5), // glc 4753 Op.getOperand(6) // slc 4754 }; 4755 4756 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? 
4757 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; 4758 EVT VT = Op.getValueType(); 4759 EVT IntVT = VT.changeTypeToInteger(); 4760 4761 auto *M = cast<MemSDNode>(Op); 4762 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, 4763 M->getMemOperand()); 4764 } 4765 case Intrinsic::amdgcn_tbuffer_load: { 4766 MemSDNode *M = cast<MemSDNode>(Op); 4767 SDValue Ops[] = { 4768 Op.getOperand(0), // Chain 4769 Op.getOperand(2), // rsrc 4770 Op.getOperand(3), // vindex 4771 Op.getOperand(4), // voffset 4772 Op.getOperand(5), // soffset 4773 Op.getOperand(6), // offset 4774 Op.getOperand(7), // dfmt 4775 Op.getOperand(8), // nfmt 4776 Op.getOperand(9), // glc 4777 Op.getOperand(10) // slc 4778 }; 4779 4780 EVT VT = Op.getValueType(); 4781 4782 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, 4783 Op->getVTList(), Ops, VT, M->getMemOperand()); 4784 } 4785 case Intrinsic::amdgcn_buffer_atomic_swap: 4786 case Intrinsic::amdgcn_buffer_atomic_add: 4787 case Intrinsic::amdgcn_buffer_atomic_sub: 4788 case Intrinsic::amdgcn_buffer_atomic_smin: 4789 case Intrinsic::amdgcn_buffer_atomic_umin: 4790 case Intrinsic::amdgcn_buffer_atomic_smax: 4791 case Intrinsic::amdgcn_buffer_atomic_umax: 4792 case Intrinsic::amdgcn_buffer_atomic_and: 4793 case Intrinsic::amdgcn_buffer_atomic_or: 4794 case Intrinsic::amdgcn_buffer_atomic_xor: { 4795 SDValue Ops[] = { 4796 Op.getOperand(0), // Chain 4797 Op.getOperand(2), // vdata 4798 Op.getOperand(3), // rsrc 4799 Op.getOperand(4), // vindex 4800 Op.getOperand(5), // offset 4801 Op.getOperand(6) // slc 4802 }; 4803 EVT VT = Op.getValueType(); 4804 4805 auto *M = cast<MemSDNode>(Op); 4806 unsigned Opcode = 0; 4807 4808 switch (IntrID) { 4809 case Intrinsic::amdgcn_buffer_atomic_swap: 4810 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP; 4811 break; 4812 case Intrinsic::amdgcn_buffer_atomic_add: 4813 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD; 4814 break; 4815 case Intrinsic::amdgcn_buffer_atomic_sub: 4816 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB; 4817 break; 4818 case Intrinsic::amdgcn_buffer_atomic_smin: 4819 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN; 4820 break; 4821 case Intrinsic::amdgcn_buffer_atomic_umin: 4822 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN; 4823 break; 4824 case Intrinsic::amdgcn_buffer_atomic_smax: 4825 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX; 4826 break; 4827 case Intrinsic::amdgcn_buffer_atomic_umax: 4828 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX; 4829 break; 4830 case Intrinsic::amdgcn_buffer_atomic_and: 4831 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND; 4832 break; 4833 case Intrinsic::amdgcn_buffer_atomic_or: 4834 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR; 4835 break; 4836 case Intrinsic::amdgcn_buffer_atomic_xor: 4837 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; 4838 break; 4839 default: 4840 llvm_unreachable("unhandled atomic opcode"); 4841 } 4842 4843 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, 4844 M->getMemOperand()); 4845 } 4846 4847 case Intrinsic::amdgcn_buffer_atomic_cmpswap: { 4848 SDValue Ops[] = { 4849 Op.getOperand(0), // Chain 4850 Op.getOperand(2), // src 4851 Op.getOperand(3), // cmp 4852 Op.getOperand(4), // rsrc 4853 Op.getOperand(5), // vindex 4854 Op.getOperand(6), // offset 4855 Op.getOperand(7) // slc 4856 }; 4857 EVT VT = Op.getValueType(); 4858 auto *M = cast<MemSDNode>(Op); 4859 4860 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, 4861 Op->getVTList(), Ops, VT, M->getMemOperand()); 4862 } 4863 4864 // Basic sample. 
4865 case Intrinsic::amdgcn_image_sample: 4866 case Intrinsic::amdgcn_image_sample_cl: 4867 case Intrinsic::amdgcn_image_sample_d: 4868 case Intrinsic::amdgcn_image_sample_d_cl: 4869 case Intrinsic::amdgcn_image_sample_l: 4870 case Intrinsic::amdgcn_image_sample_b: 4871 case Intrinsic::amdgcn_image_sample_b_cl: 4872 case Intrinsic::amdgcn_image_sample_lz: 4873 case Intrinsic::amdgcn_image_sample_cd: 4874 case Intrinsic::amdgcn_image_sample_cd_cl: 4875 4876 // Sample with comparison. 4877 case Intrinsic::amdgcn_image_sample_c: 4878 case Intrinsic::amdgcn_image_sample_c_cl: 4879 case Intrinsic::amdgcn_image_sample_c_d: 4880 case Intrinsic::amdgcn_image_sample_c_d_cl: 4881 case Intrinsic::amdgcn_image_sample_c_l: 4882 case Intrinsic::amdgcn_image_sample_c_b: 4883 case Intrinsic::amdgcn_image_sample_c_b_cl: 4884 case Intrinsic::amdgcn_image_sample_c_lz: 4885 case Intrinsic::amdgcn_image_sample_c_cd: 4886 case Intrinsic::amdgcn_image_sample_c_cd_cl: 4887 4888 // Sample with offsets. 4889 case Intrinsic::amdgcn_image_sample_o: 4890 case Intrinsic::amdgcn_image_sample_cl_o: 4891 case Intrinsic::amdgcn_image_sample_d_o: 4892 case Intrinsic::amdgcn_image_sample_d_cl_o: 4893 case Intrinsic::amdgcn_image_sample_l_o: 4894 case Intrinsic::amdgcn_image_sample_b_o: 4895 case Intrinsic::amdgcn_image_sample_b_cl_o: 4896 case Intrinsic::amdgcn_image_sample_lz_o: 4897 case Intrinsic::amdgcn_image_sample_cd_o: 4898 case Intrinsic::amdgcn_image_sample_cd_cl_o: 4899 4900 // Sample with comparison and offsets. 4901 case Intrinsic::amdgcn_image_sample_c_o: 4902 case Intrinsic::amdgcn_image_sample_c_cl_o: 4903 case Intrinsic::amdgcn_image_sample_c_d_o: 4904 case Intrinsic::amdgcn_image_sample_c_d_cl_o: 4905 case Intrinsic::amdgcn_image_sample_c_l_o: 4906 case Intrinsic::amdgcn_image_sample_c_b_o: 4907 case Intrinsic::amdgcn_image_sample_c_b_cl_o: 4908 case Intrinsic::amdgcn_image_sample_c_lz_o: 4909 case Intrinsic::amdgcn_image_sample_c_cd_o: 4910 case Intrinsic::amdgcn_image_sample_c_cd_cl_o: { 4911 // Replace dmask with everything disabled with undef. 4912 const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5)); 4913 if (!DMask || DMask->isNullValue()) { 4914 SDValue Undef = DAG.getUNDEF(Op.getValueType()); 4915 return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op)); 4916 } 4917 4918 return SDValue(); 4919 } 4920 default: 4921 return SDValue(); 4922 } 4923 } 4924 4925 SDValue SITargetLowering::handleD16VData(SDValue VData, 4926 SelectionDAG &DAG) const { 4927 EVT StoreVT = VData.getValueType(); 4928 SDLoc DL(VData); 4929 4930 if (StoreVT.isVector()) { 4931 assert ((StoreVT.getVectorNumElements() != 3) && "Handle v3f16"); 4932 if (!Subtarget->hasUnpackedD16VMem()) { 4933 if (!isTypeLegal(StoreVT)) { 4934 // If Target supports packed vmem, we just need to workaround 4935 // the illegal type by casting to an equivalent one. 4936 EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT); 4937 return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData); 4938 } 4939 } else { // We need to unpack the packed data to store. 4940 EVT IntStoreVT = StoreVT.changeTypeToInteger(); 4941 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); 4942 EVT EquivStoreVT = (StoreVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32; 4943 return DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData); 4944 } 4945 } 4946 // No change for f16 and legal vector D16 types. 
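// A sketch of the cases handled above: on a subtarget with unpacked D16 VMEM,
// a v2f16 value is bitcast to v2i16 and zero-extended element-wise to v2i32
// (v4f16 to v4i32) before being stored; on a packed-D16 subtarget where the
// vector type is not legal, the value is simply bitcast to an equivalent
// type of the same size as chosen by getEquivalentMemType.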
4947 return VData; 4948 } 4949 4950 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, 4951 SelectionDAG &DAG) const { 4952 SDLoc DL(Op); 4953 SDValue Chain = Op.getOperand(0); 4954 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4955 MachineFunction &MF = DAG.getMachineFunction(); 4956 4957 switch (IntrinsicID) { 4958 case Intrinsic::amdgcn_exp: { 4959 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2)); 4960 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3)); 4961 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8)); 4962 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9)); 4963 4964 const SDValue Ops[] = { 4965 Chain, 4966 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt 4967 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en 4968 Op.getOperand(4), // src0 4969 Op.getOperand(5), // src1 4970 Op.getOperand(6), // src2 4971 Op.getOperand(7), // src3 4972 DAG.getTargetConstant(0, DL, MVT::i1), // compr 4973 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1) 4974 }; 4975 4976 unsigned Opc = Done->isNullValue() ? 4977 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; 4978 return DAG.getNode(Opc, DL, Op->getVTList(), Ops); 4979 } 4980 case Intrinsic::amdgcn_exp_compr: { 4981 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2)); 4982 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3)); 4983 SDValue Src0 = Op.getOperand(4); 4984 SDValue Src1 = Op.getOperand(5); 4985 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6)); 4986 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7)); 4987 4988 SDValue Undef = DAG.getUNDEF(MVT::f32); 4989 const SDValue Ops[] = { 4990 Chain, 4991 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt 4992 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en 4993 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), 4994 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), 4995 Undef, // src2 4996 Undef, // src3 4997 DAG.getTargetConstant(1, DL, MVT::i1), // compr 4998 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1) 4999 }; 5000 5001 unsigned Opc = Done->isNullValue() ? 5002 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; 5003 return DAG.getNode(Opc, DL, Op->getVTList(), Ops); 5004 } 5005 case Intrinsic::amdgcn_s_sendmsg: 5006 case Intrinsic::amdgcn_s_sendmsghalt: { 5007 unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ? 
5008 AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT; 5009 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); 5010 SDValue Glue = Chain.getValue(1); 5011 return DAG.getNode(NodeOp, DL, MVT::Other, Chain, 5012 Op.getOperand(2), Glue); 5013 } 5014 case Intrinsic::amdgcn_init_exec: { 5015 return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain, 5016 Op.getOperand(2)); 5017 } 5018 case Intrinsic::amdgcn_init_exec_from_input: { 5019 return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, 5020 Op.getOperand(2), Op.getOperand(3)); 5021 } 5022 case AMDGPUIntrinsic::AMDGPU_kill: { 5023 SDValue Src = Op.getOperand(2); 5024 if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) { 5025 if (!K->isNegative()) 5026 return Chain; 5027 5028 SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32); 5029 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne); 5030 } 5031 5032 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src); 5033 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast); 5034 } 5035 case Intrinsic::amdgcn_s_barrier: { 5036 if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { 5037 const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); 5038 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second; 5039 if (WGSize <= ST.getWavefrontSize()) 5040 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other, 5041 Op.getOperand(0)), 0); 5042 } 5043 return SDValue(); 5044 }; 5045 case AMDGPUIntrinsic::SI_tbuffer_store: { 5046 5047 // Extract vindex and voffset from vaddr as appropriate 5048 const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10)); 5049 const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11)); 5050 SDValue VAddr = Op.getOperand(5); 5051 5052 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); 5053 5054 assert(!(OffEn->isOne() && IdxEn->isOne()) && 5055 "Legacy intrinsic doesn't support both offset and index - use new version"); 5056 5057 SDValue VIndex = IdxEn->isOne() ? VAddr : Zero; 5058 SDValue VOffset = OffEn->isOne() ? VAddr : Zero; 5059 5060 // Deal with the vec-3 case 5061 const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4)); 5062 auto Opcode = NumChannels->getZExtValue() == 3 ? 
5063 AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT; 5064 5065 SDValue Ops[] = { 5066 Chain, 5067 Op.getOperand(3), // vdata 5068 Op.getOperand(2), // rsrc 5069 VIndex, 5070 VOffset, 5071 Op.getOperand(6), // soffset 5072 Op.getOperand(7), // inst_offset 5073 Op.getOperand(8), // dfmt 5074 Op.getOperand(9), // nfmt 5075 Op.getOperand(12), // glc 5076 Op.getOperand(13), // slc 5077 }; 5078 5079 assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 && 5080 "Value of tfe other than zero is unsupported"); 5081 5082 EVT VT = Op.getOperand(3).getValueType(); 5083 MachineMemOperand *MMO = MF.getMachineMemOperand( 5084 MachinePointerInfo(), 5085 MachineMemOperand::MOStore, 5086 VT.getStoreSize(), 4); 5087 return DAG.getMemIntrinsicNode(Opcode, DL, 5088 Op->getVTList(), Ops, VT, MMO); 5089 } 5090 5091 case Intrinsic::amdgcn_tbuffer_store: { 5092 SDValue VData = Op.getOperand(2); 5093 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); 5094 if (IsD16) 5095 VData = handleD16VData(VData, DAG); 5096 SDValue Ops[] = { 5097 Chain, 5098 VData, // vdata 5099 Op.getOperand(3), // rsrc 5100 Op.getOperand(4), // vindex 5101 Op.getOperand(5), // voffset 5102 Op.getOperand(6), // soffset 5103 Op.getOperand(7), // offset 5104 Op.getOperand(8), // dfmt 5105 Op.getOperand(9), // nfmt 5106 Op.getOperand(10), // glc 5107 Op.getOperand(11) // slc 5108 }; 5109 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : 5110 AMDGPUISD::TBUFFER_STORE_FORMAT; 5111 MemSDNode *M = cast<MemSDNode>(Op); 5112 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, 5113 M->getMemoryVT(), M->getMemOperand()); 5114 } 5115 5116 case Intrinsic::amdgcn_buffer_store: 5117 case Intrinsic::amdgcn_buffer_store_format: { 5118 SDValue VData = Op.getOperand(2); 5119 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); 5120 if (IsD16) 5121 VData = handleD16VData(VData, DAG); 5122 SDValue Ops[] = { 5123 Chain, 5124 VData, // vdata 5125 Op.getOperand(3), // rsrc 5126 Op.getOperand(4), // vindex 5127 Op.getOperand(5), // offset 5128 Op.getOperand(6), // glc 5129 Op.getOperand(7) // slc 5130 }; 5131 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? 5132 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; 5133 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; 5134 MemSDNode *M = cast<MemSDNode>(Op); 5135 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, 5136 M->getMemoryVT(), M->getMemOperand()); 5137 } 5138 5139 case Intrinsic::amdgcn_image_store: 5140 case Intrinsic::amdgcn_image_store_mip: { 5141 SDValue VData = Op.getOperand(2); 5142 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); 5143 if (IsD16) 5144 VData = handleD16VData(VData, DAG); 5145 SDValue Ops[] = { 5146 Chain, // Chain 5147 VData, // vdata 5148 Op.getOperand(3), // vaddr 5149 Op.getOperand(4), // rsrc 5150 Op.getOperand(5), // dmask 5151 Op.getOperand(6), // glc 5152 Op.getOperand(7), // slc 5153 Op.getOperand(8), // lwe 5154 Op.getOperand(9) // da 5155 }; 5156 unsigned Opc = (IntrinsicID==Intrinsic::amdgcn_image_store) ? 
AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP; 5158 MemSDNode *M = cast<MemSDNode>(Op); 5159 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, 5160 M->getMemoryVT(), M->getMemOperand()); 5161 } 5162 5163 default: { 5164 const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr = 5165 AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrinsicID); 5166 if (D16ImageDimIntr) { 5167 SDValue VData = Op.getOperand(2); 5168 EVT StoreVT = VData.getValueType(); 5169 if ((StoreVT == MVT::v2f16 && !isTypeLegal(StoreVT)) || 5170 StoreVT == MVT::v4f16) { 5171 VData = handleD16VData(VData, DAG); 5172 5173 SmallVector<SDValue, 12> Ops; 5174 for (auto Value : Op.getNode()->op_values()) 5175 Ops.push_back(Value); 5176 Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32); 5177 Ops[2] = VData; 5178 5179 MemSDNode *M = cast<MemSDNode>(Op); 5180 return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Op->getVTList(), 5181 Ops, M->getMemoryVT(), 5182 M->getMemOperand()); 5183 } 5184 } 5185 5186 return Op; 5187 } 5188 } 5189 } 5190 5191 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 5192 SDLoc DL(Op); 5193 LoadSDNode *Load = cast<LoadSDNode>(Op); 5194 ISD::LoadExtType ExtType = Load->getExtensionType(); 5195 EVT MemVT = Load->getMemoryVT(); 5196 5197 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { 5198 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16)) 5199 return SDValue(); 5200 5201 // FIXME: Copied from PPC 5202 // First, load into 32 bits, then truncate to 1 bit. 5203 5204 SDValue Chain = Load->getChain(); 5205 SDValue BasePtr = Load->getBasePtr(); 5206 MachineMemOperand *MMO = Load->getMemOperand(); 5207 5208 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16; 5209 5210 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, 5211 BasePtr, RealMemVT, MMO); 5212 5213 SDValue Ops[] = { 5214 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), 5215 NewLD.getValue(1) 5216 }; 5217 5218 return DAG.getMergeValues(Ops, DL); 5219 } 5220 5221 if (!MemVT.isVector()) 5222 return SDValue(); 5223 5224 assert(Op.getValueType().getVectorElementType() == MVT::i32 && 5225 "Custom lowering for non-i32 vectors hasn't been implemented."); 5226 5227 unsigned Alignment = Load->getAlignment(); 5228 unsigned AS = Load->getAddressSpace(); 5229 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, 5230 AS, Alignment)) { 5231 SDValue Ops[2]; 5232 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); 5233 return DAG.getMergeValues(Ops, DL); 5234 } 5235 5236 MachineFunction &MF = DAG.getMachineFunction(); 5237 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 5238 // If there is a possibility that flat instructions access scratch memory 5239 // then we need to use the same legalization rules we use for private. 5240 if (AS == AMDGPUASI.FLAT_ADDRESS) 5241 AS = MFI->hasFlatScratchInit() ? 5242 AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS; 5243 5244 unsigned NumElements = MemVT.getVectorNumElements(); 5245 5246 if (AS == AMDGPUASI.CONSTANT_ADDRESS || 5247 AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) { 5248 if (!Op->isDivergent() && Alignment >= 4) 5249 return SDValue(); 5250 // Non-uniform loads will be selected to MUBUF instructions, so they 5251 // have the same legalization requirements as global and private 5252 // loads.
5253 // 5254 } 5255 5256 if (AS == AMDGPUASI.CONSTANT_ADDRESS || 5257 AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT || 5258 AS == AMDGPUASI.GLOBAL_ADDRESS) { 5259 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && 5260 !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) && 5261 Alignment >= 4) 5262 return SDValue(); 5263 // Non-uniform loads will be selected to MUBUF instructions, so they 5264 // have the same legalization requirements as global and private 5265 // loads. 5266 // 5267 } 5268 if (AS == AMDGPUASI.CONSTANT_ADDRESS || 5269 AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT || 5270 AS == AMDGPUASI.GLOBAL_ADDRESS || 5271 AS == AMDGPUASI.FLAT_ADDRESS) { 5272 if (NumElements > 4) 5273 return SplitVectorLoad(Op, DAG); 5274 // v4 loads are supported for private and global memory. 5275 return SDValue(); 5276 } 5277 if (AS == AMDGPUASI.PRIVATE_ADDRESS) { 5278 // Depending on the setting of the private_element_size field in the 5279 // resource descriptor, we can only make private accesses up to a certain 5280 // size. 5281 switch (Subtarget->getMaxPrivateElementSize()) { 5282 case 4: 5283 return scalarizeVectorLoad(Load, DAG); 5284 case 8: 5285 if (NumElements > 2) 5286 return SplitVectorLoad(Op, DAG); 5287 return SDValue(); 5288 case 16: 5289 // Same as global/flat 5290 if (NumElements > 4) 5291 return SplitVectorLoad(Op, DAG); 5292 return SDValue(); 5293 default: 5294 llvm_unreachable("unsupported private_element_size"); 5295 } 5296 } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { 5297 // Use ds_read_b128 if possible. 5298 if (Subtarget->useDS128() && Load->getAlignment() >= 16 && 5299 MemVT.getStoreSize() == 16) 5300 return SDValue(); 5301 5302 if (NumElements > 2) 5303 return SplitVectorLoad(Op, DAG); 5304 } 5305 return SDValue(); 5306 } 5307 5308 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 5309 if (Op.getValueType() != MVT::i64) 5310 return SDValue(); 5311 5312 SDLoc DL(Op); 5313 SDValue Cond = Op.getOperand(0); 5314 5315 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 5316 SDValue One = DAG.getConstant(1, DL, MVT::i32); 5317 5318 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); 5319 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); 5320 5321 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); 5322 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); 5323 5324 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); 5325 5326 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); 5327 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); 5328 5329 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); 5330 5331 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); 5332 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); 5333 } 5334 5335 // Catch division cases where we can use shortcuts with rcp and rsq 5336 // instructions. 
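// As a sketch, the rewrites performed below (only under the fast-math
// conditions checked in the function) are:
//   1.0 / sqrt(x) -> rsq(x)
//   1.0 / x       -> rcp(x)
//  -1.0 / x       -> rcp(fneg(x))
//   x / y         -> x * rcp(y)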
5337 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, 5338 SelectionDAG &DAG) const { 5339 SDLoc SL(Op); 5340 SDValue LHS = Op.getOperand(0); 5341 SDValue RHS = Op.getOperand(1); 5342 EVT VT = Op.getValueType(); 5343 const SDNodeFlags Flags = Op->getFlags(); 5344 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || 5345 Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal(); 5346 5347 if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals()) 5348 return SDValue(); 5349 5350 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { 5351 if (Unsafe || VT == MVT::f32 || VT == MVT::f16) { 5352 if (CLHS->isExactlyValue(1.0)) { 5353 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 5354 // the CI documentation has a worst case error of 1 ulp. 5355 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 5356 // use it as long as we aren't trying to use denormals. 5357 // 5358 // v_rcp_f16 and v_rsq_f16 DO support denormals. 5359 5360 // 1.0 / sqrt(x) -> rsq(x) 5361 5362 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP 5363 // error seems really high at 2^29 ULP. 5364 if (RHS.getOpcode() == ISD::FSQRT) 5365 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); 5366 5367 // 1.0 / x -> rcp(x) 5368 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 5369 } 5370 5371 // Same as for 1.0, but expand the sign out of the constant. 5372 if (CLHS->isExactlyValue(-1.0)) { 5373 // -1.0 / x -> rcp (fneg x) 5374 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 5375 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS); 5376 } 5377 } 5378 } 5379 5380 if (Unsafe) { 5381 // Turn into multiply by the reciprocal. 5382 // x / y -> x * (1.0 / y) 5383 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 5384 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags); 5385 } 5386 5387 return SDValue(); 5388 } 5389 5390 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, 5391 EVT VT, SDValue A, SDValue B, SDValue GlueChain) { 5392 if (GlueChain->getNumValues() <= 1) { 5393 return DAG.getNode(Opcode, SL, VT, A, B); 5394 } 5395 5396 assert(GlueChain->getNumValues() == 3); 5397 5398 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); 5399 switch (Opcode) { 5400 default: llvm_unreachable("no chain equivalent for opcode"); 5401 case ISD::FMUL: 5402 Opcode = AMDGPUISD::FMUL_W_CHAIN; 5403 break; 5404 } 5405 5406 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, 5407 GlueChain.getValue(2)); 5408 } 5409 5410 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, 5411 EVT VT, SDValue A, SDValue B, SDValue C, 5412 SDValue GlueChain) { 5413 if (GlueChain->getNumValues() <= 1) { 5414 return DAG.getNode(Opcode, SL, VT, A, B, C); 5415 } 5416 5417 assert(GlueChain->getNumValues() == 3); 5418 5419 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); 5420 switch (Opcode) { 5421 default: llvm_unreachable("no chain equivalent for opcode"); 5422 case ISD::FMA: 5423 Opcode = AMDGPUISD::FMA_W_CHAIN; 5424 break; 5425 } 5426 5427 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C, 5428 GlueChain.getValue(2)); 5429 } 5430 5431 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { 5432 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) 5433 return FastLowered; 5434 5435 SDLoc SL(Op); 5436 SDValue Src0 = Op.getOperand(0); 5437 SDValue Src1 = Op.getOperand(1); 5438 5439 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, 
MVT::f32, Src0); 5440 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1); 5441 5442 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1); 5443 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1); 5444 5445 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32); 5446 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag); 5447 5448 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0); 5449 } 5450 5451 // Faster 2.5 ULP division that does not support denormals. 5452 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { 5453 SDLoc SL(Op); 5454 SDValue LHS = Op.getOperand(1); 5455 SDValue RHS = Op.getOperand(2); 5456 5457 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); 5458 5459 const APFloat K0Val(BitsToFloat(0x6f800000)); 5460 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); 5461 5462 const APFloat K1Val(BitsToFloat(0x2f800000)); 5463 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); 5464 5465 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 5466 5467 EVT SetCCVT = 5468 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); 5469 5470 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); 5471 5472 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); 5473 5474 // TODO: Should this propagate fast-math-flags? 5475 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); 5476 5477 // rcp does not support denormals. 5478 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); 5479 5480 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); 5481 5482 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); 5483 } 5484 5485 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { 5486 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) 5487 return FastLowered; 5488 5489 SDLoc SL(Op); 5490 SDValue LHS = Op.getOperand(0); 5491 SDValue RHS = Op.getOperand(1); 5492 5493 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 5494 5495 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); 5496 5497 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, 5498 RHS, RHS, LHS); 5499 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, 5500 LHS, RHS, LHS); 5501 5502 // Denominator is scaled to not be denormal, so using rcp is ok. 
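// What follows is the usual div_scale / div_fmas / div_fixup expansion. As a
// sketch, with r = rcp(d) for the scaled denominator d and scaled numerator n:
//   e  = fma(-d, r, 1.0)   // Fma0
//   r1 = fma(e, r, r)      // Fma1: refined reciprocal
//   q  = n * r1            // Mul
//   e2 = fma(-d, q, n)     // Fma2: residual
//   q1 = fma(e2, r1, q)    // Fma3: refined quotient
//   e3 = fma(-d, q1, n)    // Fma4: residual fed to div_fmas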
5503 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, 5504 DenominatorScaled); 5505 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, 5506 DenominatorScaled); 5507 5508 const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE | 5509 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 5510 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 5511 5512 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16); 5513 5514 if (!Subtarget->hasFP32Denormals()) { 5515 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 5516 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, 5517 SL, MVT::i32); 5518 SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, 5519 DAG.getEntryNode(), 5520 EnableDenormValue, BitField); 5521 SDValue Ops[3] = { 5522 NegDivScale0, 5523 EnableDenorm.getValue(0), 5524 EnableDenorm.getValue(1) 5525 }; 5526 5527 NegDivScale0 = DAG.getMergeValues(Ops, SL); 5528 } 5529 5530 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, 5531 ApproxRcp, One, NegDivScale0); 5532 5533 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, 5534 ApproxRcp, Fma0); 5535 5536 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, 5537 Fma1, Fma1); 5538 5539 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, 5540 NumeratorScaled, Mul); 5541 5542 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2); 5543 5544 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, 5545 NumeratorScaled, Fma3); 5546 5547 if (!Subtarget->hasFP32Denormals()) { 5548 const SDValue DisableDenormValue = 5549 DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); 5550 SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, 5551 Fma4.getValue(1), 5552 DisableDenormValue, 5553 BitField, 5554 Fma4.getValue(2)); 5555 5556 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, 5557 DisableDenorm, DAG.getRoot()); 5558 DAG.setRoot(OutputChain); 5559 } 5560 5561 SDValue Scale = NumeratorScaled.getValue(1); 5562 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, 5563 Fma4, Fma1, Fma3, Scale); 5564 5565 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); 5566 } 5567 5568 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { 5569 if (DAG.getTarget().Options.UnsafeFPMath) 5570 return lowerFastUnsafeFDIV(Op, DAG); 5571 5572 SDLoc SL(Op); 5573 SDValue X = Op.getOperand(0); 5574 SDValue Y = Op.getOperand(1); 5575 5576 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); 5577 5578 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); 5579 5580 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); 5581 5582 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); 5583 5584 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); 5585 5586 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); 5587 5588 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); 5589 5590 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); 5591 5592 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); 5593 5594 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); 5595 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); 5596 5597 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, 5598 NegDivScale0, Mul, DivScale1); 5599 5600 SDValue Scale; 
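// Roughly speaking, Scale is the i1 flag produced by the second div_scale;
// div_fmas consumes it (via VCC) to compensate for the scaling applied to the
// operands before the final div_fixup.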
5601 5602 if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { 5603 // Work around a hardware bug on SI where the condition output from div_scale 5604 // is not usable. 5605 5606 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32); 5607 5608 // Figure out which scale to use for div_fmas. 5609 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); 5610 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); 5611 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); 5612 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); 5613 5614 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); 5615 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); 5616 5617 SDValue Scale0Hi 5618 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); 5619 SDValue Scale1Hi 5620 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); 5621 5622 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); 5623 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); 5624 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); 5625 } else { 5626 Scale = DivScale1.getValue(1); 5627 } 5628 5629 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, 5630 Fma4, Fma3, Mul, Scale); 5631 5632 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); 5633 } 5634 5635 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { 5636 EVT VT = Op.getValueType(); 5637 5638 if (VT == MVT::f32) 5639 return LowerFDIV32(Op, DAG); 5640 5641 if (VT == MVT::f64) 5642 return LowerFDIV64(Op, DAG); 5643 5644 if (VT == MVT::f16) 5645 return LowerFDIV16(Op, DAG); 5646 5647 llvm_unreachable("Unexpected type for fdiv"); 5648 } 5649 5650 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 5651 SDLoc DL(Op); 5652 StoreSDNode *Store = cast<StoreSDNode>(Op); 5653 EVT VT = Store->getMemoryVT(); 5654 5655 if (VT == MVT::i1) { 5656 return DAG.getTruncStore(Store->getChain(), DL, 5657 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), 5658 Store->getBasePtr(), MVT::i1, Store->getMemOperand()); 5659 } 5660 5661 assert(VT.isVector() && 5662 Store->getValue().getValueType().getScalarType() == MVT::i32); 5663 5664 unsigned AS = Store->getAddressSpace(); 5665 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, 5666 AS, Store->getAlignment())) { 5667 return expandUnalignedStore(Store, DAG); 5668 } 5669 5670 MachineFunction &MF = DAG.getMachineFunction(); 5671 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 5672 // If there is a possibility that flat instructions access scratch memory 5673 // then we need to use the same legalization rules we use for private. 5674 if (AS == AMDGPUASI.FLAT_ADDRESS) 5675 AS = MFI->hasFlatScratchInit() ?
5676 AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS; 5677 5678 unsigned NumElements = VT.getVectorNumElements(); 5679 if (AS == AMDGPUASI.GLOBAL_ADDRESS || 5680 AS == AMDGPUASI.FLAT_ADDRESS) { 5681 if (NumElements > 4) 5682 return SplitVectorStore(Op, DAG); 5683 return SDValue(); 5684 } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { 5685 switch (Subtarget->getMaxPrivateElementSize()) { 5686 case 4: 5687 return scalarizeVectorStore(Store, DAG); 5688 case 8: 5689 if (NumElements > 2) 5690 return SplitVectorStore(Op, DAG); 5691 return SDValue(); 5692 case 16: 5693 if (NumElements > 4) 5694 return SplitVectorStore(Op, DAG); 5695 return SDValue(); 5696 default: 5697 llvm_unreachable("unsupported private_element_size"); 5698 } 5699 } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { 5700 // Use ds_write_b128 if possible. 5701 if (Subtarget->useDS128() && Store->getAlignment() >= 16 && 5702 VT.getStoreSize() == 16) 5703 return SDValue(); 5704 5705 if (NumElements > 2) 5706 return SplitVectorStore(Op, DAG); 5707 return SDValue(); 5708 } else { 5709 llvm_unreachable("unhandled address space"); 5710 } 5711 } 5712 5713 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 5714 SDLoc DL(Op); 5715 EVT VT = Op.getValueType(); 5716 SDValue Arg = Op.getOperand(0); 5717 // TODO: Should this propagate fast-math-flags? 5718 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, 5719 DAG.getNode(ISD::FMUL, DL, VT, Arg, 5720 DAG.getConstantFP(0.5/M_PI, DL, 5721 VT))); 5722 5723 switch (Op.getOpcode()) { 5724 case ISD::FCOS: 5725 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); 5726 case ISD::FSIN: 5727 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); 5728 default: 5729 llvm_unreachable("Wrong trig opcode"); 5730 } 5731 } 5732 5733 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 5734 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op); 5735 assert(AtomicNode->isCompareAndSwap()); 5736 unsigned AS = AtomicNode->getAddressSpace(); 5737 5738 // No custom lowering required for local address space 5739 if (!isFlatGlobalAddrSpace(AS, AMDGPUASI)) 5740 return Op; 5741 5742 // Non-local address space requires custom lowering for atomic compare 5743 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2 5744 SDLoc DL(Op); 5745 SDValue ChainIn = Op.getOperand(0); 5746 SDValue Addr = Op.getOperand(1); 5747 SDValue Old = Op.getOperand(2); 5748 SDValue New = Op.getOperand(3); 5749 EVT VT = Op.getValueType(); 5750 MVT SimpleVT = VT.getSimpleVT(); 5751 MVT VecType = MVT::getVectorVT(SimpleVT, 2); 5752 5753 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old}); 5754 SDValue Ops[] = { ChainIn, Addr, NewOld }; 5755 5756 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(), 5757 Ops, VT, AtomicNode->getMemOperand()); 5758 } 5759 5760 //===----------------------------------------------------------------------===// 5761 // Custom DAG optimizations 5762 //===----------------------------------------------------------------------===// 5763 5764 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, 5765 DAGCombinerInfo &DCI) const { 5766 EVT VT = N->getValueType(0); 5767 EVT ScalarVT = VT.getScalarType(); 5768 if (ScalarVT != MVT::f32) 5769 return SDValue(); 5770 5771 SelectionDAG &DAG = DCI.DAG; 5772 SDLoc DL(N); 5773 5774 SDValue Src = N->getOperand(0); 5775 EVT SrcVT = Src.getValueType(); 5776 5777 // TODO: We could try to match extracting the higher bytes, which would be 5778 // easier if 
i8 vectors weren't promoted to i32 vectors, particularly after 5779 // types are legalized. v4i8 -> v4f32 is probably the only case to worry 5780 // about in practice. 5781 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) { 5782 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { 5783 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); 5784 DCI.AddToWorklist(Cvt.getNode()); 5785 return Cvt; 5786 } 5787 } 5788 5789 return SDValue(); 5790 } 5791 5792 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) 5793 5794 // This is a variant of 5795 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), 5796 // 5797 // The normal DAG combiner will do this, but only if the add has one use since 5798 // that would increase the number of instructions. 5799 // 5800 // This prevents us from seeing a constant offset that can be folded into a 5801 // memory instruction's addressing mode. If we know the resulting add offset of 5802 // a pointer can be folded into an addressing offset, we can replace the pointer 5803 // operand with the add of new constant offset. This eliminates one of the uses, 5804 // and may allow the remaining use to also be simplified. 5805 // 5806 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, 5807 unsigned AddrSpace, 5808 EVT MemVT, 5809 DAGCombinerInfo &DCI) const { 5810 SDValue N0 = N->getOperand(0); 5811 SDValue N1 = N->getOperand(1); 5812 5813 // We only do this to handle cases where it's profitable when there are 5814 // multiple uses of the add, so defer to the standard combine. 5815 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) || 5816 N0->hasOneUse()) 5817 return SDValue(); 5818 5819 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1); 5820 if (!CN1) 5821 return SDValue(); 5822 5823 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 5824 if (!CAdd) 5825 return SDValue(); 5826 5827 // If the resulting offset is too large, we can't fold it into the addressing 5828 // mode offset. 5829 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); 5830 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext()); 5831 5832 AddrMode AM; 5833 AM.HasBaseReg = true; 5834 AM.BaseOffs = Offset.getSExtValue(); 5835 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace)) 5836 return SDValue(); 5837 5838 SelectionDAG &DAG = DCI.DAG; 5839 SDLoc SL(N); 5840 EVT VT = N->getValueType(0); 5841 5842 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); 5843 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); 5844 5845 SDNodeFlags Flags; 5846 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() && 5847 (N0.getOpcode() == ISD::OR || 5848 N0->getFlags().hasNoUnsignedWrap())); 5849 5850 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags); 5851 } 5852 5853 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N, 5854 DAGCombinerInfo &DCI) const { 5855 SDValue Ptr = N->getBasePtr(); 5856 SelectionDAG &DAG = DCI.DAG; 5857 SDLoc SL(N); 5858 5859 // TODO: We could also do this for multiplies. 5860 if (Ptr.getOpcode() == ISD::SHL) { 5861 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(), 5862 N->getMemoryVT(), DCI); 5863 if (NewPtr) { 5864 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end()); 5865 5866 NewOps[N->getOpcode() == ISD::STORE ? 
2 : 1] = NewPtr; 5867 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); 5868 } 5869 } 5870 5871 return SDValue(); 5872 } 5873 5874 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) { 5875 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) || 5876 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) || 5877 (Opc == ISD::XOR && Val == 0); 5878 } 5879 5880 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This 5881 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit 5882 // integer combine opportunities since most 64-bit operations are decomposed 5883 // this way. TODO: We won't want this for SALU especially if it is an inline 5884 // immediate. 5885 SDValue SITargetLowering::splitBinaryBitConstantOp( 5886 DAGCombinerInfo &DCI, 5887 const SDLoc &SL, 5888 unsigned Opc, SDValue LHS, 5889 const ConstantSDNode *CRHS) const { 5890 uint64_t Val = CRHS->getZExtValue(); 5891 uint32_t ValLo = Lo_32(Val); 5892 uint32_t ValHi = Hi_32(Val); 5893 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 5894 5895 if ((bitOpWithConstantIsReducible(Opc, ValLo) || 5896 bitOpWithConstantIsReducible(Opc, ValHi)) || 5897 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) { 5898 // If we need to materialize a 64-bit immediate, it will be split up later 5899 // anyway. Avoid creating the harder to understand 64-bit immediate 5900 // materialization. 5901 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi); 5902 } 5903 5904 return SDValue(); 5905 } 5906 5907 // Returns true if the argument is a boolean value which is not serialized into 5908 // memory or an argument and does not require v_cndmask_b32 to be deserialized. 5909 static bool isBoolSGPR(SDValue V) { 5910 if (V.getValueType() != MVT::i1) 5911 return false; 5912 switch (V.getOpcode()) { 5913 default: break; 5914 case ISD::SETCC: 5915 case ISD::AND: 5916 case ISD::OR: 5917 case ISD::XOR: 5918 case AMDGPUISD::FP_CLASS: 5919 return true; 5920 } 5921 return false; 5922 } 5923 5924 SDValue SITargetLowering::performAndCombine(SDNode *N, 5925 DAGCombinerInfo &DCI) const { 5926 if (DCI.isBeforeLegalize()) 5927 return SDValue(); 5928 5929 SelectionDAG &DAG = DCI.DAG; 5930 EVT VT = N->getValueType(0); 5931 SDValue LHS = N->getOperand(0); 5932 SDValue RHS = N->getOperand(1); 5933 5934 5935 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); 5936 if (VT == MVT::i64 && CRHS) { 5937 if (SDValue Split 5938 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) 5939 return Split; 5940 } 5941 5942 if (CRHS && VT == MVT::i32) { 5943 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb 5944 // nb = number of trailing zeroes in mask 5945 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass, 5946 // given that we are selecting 8 or 16 bit fields starting at a byte boundary. 5947 uint64_t Mask = CRHS->getZExtValue(); 5948 unsigned Bits = countPopulation(Mask); 5949 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL && 5950 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) { 5951 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { 5952 unsigned Shift = CShift->getZExtValue(); 5953 unsigned NB = CRHS->getAPIntValue().countTrailingZeros(); 5954 unsigned Offset = NB + Shift; 5955 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
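// For example, (and (srl x, 8), 0xff00) has Bits = 8, NB = 8 and Offset = 16,
// and becomes (shl (AssertZext (bfe_u32 x, 16, 8), i8), 8).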
5956 SDLoc SL(N); 5957 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, 5958 LHS->getOperand(0), 5959 DAG.getConstant(Offset, SL, MVT::i32), 5960 DAG.getConstant(Bits, SL, MVT::i32)); 5961 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits); 5962 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE, 5963 DAG.getValueType(NarrowVT)); 5964 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext, 5965 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32)); 5966 return Shl; 5967 } 5968 } 5969 } 5970 } 5971 5972 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> 5973 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) 5974 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) { 5975 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); 5976 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); 5977 5978 SDValue X = LHS.getOperand(0); 5979 SDValue Y = RHS.getOperand(0); 5980 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) 5981 return SDValue(); 5982 5983 if (LCC == ISD::SETO) { 5984 if (X != LHS.getOperand(1)) 5985 return SDValue(); 5986 5987 if (RCC == ISD::SETUNE) { 5988 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); 5989 if (!C1 || !C1->isInfinity() || C1->isNegative()) 5990 return SDValue(); 5991 5992 const uint32_t Mask = SIInstrFlags::N_NORMAL | 5993 SIInstrFlags::N_SUBNORMAL | 5994 SIInstrFlags::N_ZERO | 5995 SIInstrFlags::P_ZERO | 5996 SIInstrFlags::P_SUBNORMAL | 5997 SIInstrFlags::P_NORMAL; 5998 5999 static_assert(((~(SIInstrFlags::S_NAN | 6000 SIInstrFlags::Q_NAN | 6001 SIInstrFlags::N_INFINITY | 6002 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, 6003 "mask not equal"); 6004 6005 SDLoc DL(N); 6006 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 6007 X, DAG.getConstant(Mask, DL, MVT::i32)); 6008 } 6009 } 6010 } 6011 6012 if (VT == MVT::i32 && 6013 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) { 6014 // and x, (sext cc from i1) => select cc, x, 0 6015 if (RHS.getOpcode() != ISD::SIGN_EXTEND) 6016 std::swap(LHS, RHS); 6017 if (isBoolSGPR(RHS.getOperand(0))) 6018 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), 6019 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32)); 6020 } 6021 6022 return SDValue(); 6023 } 6024 6025 SDValue SITargetLowering::performOrCombine(SDNode *N, 6026 DAGCombinerInfo &DCI) const { 6027 SelectionDAG &DAG = DCI.DAG; 6028 SDValue LHS = N->getOperand(0); 6029 SDValue RHS = N->getOperand(1); 6030 6031 EVT VT = N->getValueType(0); 6032 if (VT == MVT::i1) { 6033 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) 6034 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && 6035 RHS.getOpcode() == AMDGPUISD::FP_CLASS) { 6036 SDValue Src = LHS.getOperand(0); 6037 if (Src != RHS.getOperand(0)) 6038 return SDValue(); 6039 6040 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); 6041 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); 6042 if (!CLHS || !CRHS) 6043 return SDValue(); 6044 6045 // Only 10 bits are used. 6046 static const uint32_t MaxMask = 0x3ff; 6047 6048 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; 6049 SDLoc DL(N); 6050 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, 6051 Src, DAG.getConstant(NewMask, DL, MVT::i32)); 6052 } 6053 6054 return SDValue(); 6055 } 6056 6057 if (VT != MVT::i64) 6058 return SDValue(); 6059 6060 // TODO: This could be a generic combine with a predicate for extracting the 6061 // high half of an integer being free. 
6062 6063 // (or i64:x, (zero_extend i32:y)) -> 6064 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x))) 6065 if (LHS.getOpcode() == ISD::ZERO_EXTEND && 6066 RHS.getOpcode() != ISD::ZERO_EXTEND) 6067 std::swap(LHS, RHS); 6068 6069 if (RHS.getOpcode() == ISD::ZERO_EXTEND) { 6070 SDValue ExtSrc = RHS.getOperand(0); 6071 EVT SrcVT = ExtSrc.getValueType(); 6072 if (SrcVT == MVT::i32) { 6073 SDLoc SL(N); 6074 SDValue LowLHS, HiBits; 6075 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG); 6076 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc); 6077 6078 DCI.AddToWorklist(LowOr.getNode()); 6079 DCI.AddToWorklist(HiBits.getNode()); 6080 6081 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 6082 LowOr, HiBits); 6083 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 6084 } 6085 } 6086 6087 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 6088 if (CRHS) { 6089 if (SDValue Split 6090 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS)) 6091 return Split; 6092 } 6093 6094 return SDValue(); 6095 } 6096 6097 SDValue SITargetLowering::performXorCombine(SDNode *N, 6098 DAGCombinerInfo &DCI) const { 6099 EVT VT = N->getValueType(0); 6100 if (VT != MVT::i64) 6101 return SDValue(); 6102 6103 SDValue LHS = N->getOperand(0); 6104 SDValue RHS = N->getOperand(1); 6105 6106 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); 6107 if (CRHS) { 6108 if (SDValue Split 6109 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS)) 6110 return Split; 6111 } 6112 6113 return SDValue(); 6114 } 6115 6116 // Instructions that will be lowered with a final instruction that zeros the 6117 // high result bits. 6118 // XXX - probably only need to list legal operations. 6119 static bool fp16SrcZerosHighBits(unsigned Opc) { 6120 switch (Opc) { 6121 case ISD::FADD: 6122 case ISD::FSUB: 6123 case ISD::FMUL: 6124 case ISD::FDIV: 6125 case ISD::FREM: 6126 case ISD::FMA: 6127 case ISD::FMAD: 6128 case ISD::FCANONICALIZE: 6129 case ISD::FP_ROUND: 6130 case ISD::UINT_TO_FP: 6131 case ISD::SINT_TO_FP: 6132 case ISD::FABS: 6133 // Fabs is lowered to a bit operation, but it's an and which will clear the 6134 // high bits anyway. 6135 case ISD::FSQRT: 6136 case ISD::FSIN: 6137 case ISD::FCOS: 6138 case ISD::FPOWI: 6139 case ISD::FPOW: 6140 case ISD::FLOG: 6141 case ISD::FLOG2: 6142 case ISD::FLOG10: 6143 case ISD::FEXP: 6144 case ISD::FEXP2: 6145 case ISD::FCEIL: 6146 case ISD::FTRUNC: 6147 case ISD::FRINT: 6148 case ISD::FNEARBYINT: 6149 case ISD::FROUND: 6150 case ISD::FFLOOR: 6151 case ISD::FMINNUM: 6152 case ISD::FMAXNUM: 6153 case AMDGPUISD::FRACT: 6154 case AMDGPUISD::CLAMP: 6155 case AMDGPUISD::COS_HW: 6156 case AMDGPUISD::SIN_HW: 6157 case AMDGPUISD::FMIN3: 6158 case AMDGPUISD::FMAX3: 6159 case AMDGPUISD::FMED3: 6160 case AMDGPUISD::FMAD_FTZ: 6161 case AMDGPUISD::RCP: 6162 case AMDGPUISD::RSQ: 6163 case AMDGPUISD::LDEXP: 6164 return true; 6165 default: 6166 // fcopysign, select and others may be lowered to 32-bit bit operations 6167 // which don't zero the high bits. 
6168 return false; 6169 } 6170 } 6171 6172 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, 6173 DAGCombinerInfo &DCI) const { 6174 if (!Subtarget->has16BitInsts() || 6175 DCI.getDAGCombineLevel() < AfterLegalizeDAG) 6176 return SDValue(); 6177 6178 EVT VT = N->getValueType(0); 6179 if (VT != MVT::i32) 6180 return SDValue(); 6181 6182 SDValue Src = N->getOperand(0); 6183 if (Src.getValueType() != MVT::i16) 6184 return SDValue(); 6185 6186 // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src 6187 // FIXME: It is not universally true that the high bits are zeroed on gfx9. 6188 if (Src.getOpcode() == ISD::BITCAST) { 6189 SDValue BCSrc = Src.getOperand(0); 6190 if (BCSrc.getValueType() == MVT::f16 && 6191 fp16SrcZerosHighBits(BCSrc.getOpcode())) 6192 return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc); 6193 } 6194 6195 return SDValue(); 6196 } 6197 6198 SDValue SITargetLowering::performClassCombine(SDNode *N, 6199 DAGCombinerInfo &DCI) const { 6200 SelectionDAG &DAG = DCI.DAG; 6201 SDValue Mask = N->getOperand(1); 6202 6203 // fp_class x, 0 -> false 6204 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { 6205 if (CMask->isNullValue()) 6206 return DAG.getConstant(0, SDLoc(N), MVT::i1); 6207 } 6208 6209 if (N->getOperand(0).isUndef()) 6210 return DAG.getUNDEF(MVT::i1); 6211 6212 return SDValue(); 6213 } 6214 6215 static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { 6216 if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) 6217 return true; 6218 6219 return DAG.isKnownNeverNaN(Op); 6220 } 6221 6222 static bool isCanonicalized(SelectionDAG &DAG, SDValue Op, 6223 const SISubtarget *ST, unsigned MaxDepth=5) { 6224 // If source is a result of another standard FP operation it is already in 6225 // canonical form. 6226 6227 switch (Op.getOpcode()) { 6228 default: 6229 break; 6230 6231 // These will flush denorms if required. 6232 case ISD::FADD: 6233 case ISD::FSUB: 6234 case ISD::FMUL: 6235 case ISD::FSQRT: 6236 case ISD::FCEIL: 6237 case ISD::FFLOOR: 6238 case ISD::FMA: 6239 case ISD::FMAD: 6240 6241 case ISD::FCANONICALIZE: 6242 return true; 6243 6244 case ISD::FP_ROUND: 6245 return Op.getValueType().getScalarType() != MVT::f16 || 6246 ST->hasFP16Denormals(); 6247 6248 case ISD::FP_EXTEND: 6249 return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 || 6250 ST->hasFP16Denormals(); 6251 6252 case ISD::FP16_TO_FP: 6253 case ISD::FP_TO_FP16: 6254 return ST->hasFP16Denormals(); 6255 6256 // It can/will be lowered or combined as a bit operation. 6257 // Need to check their input recursively to handle. 6258 case ISD::FNEG: 6259 case ISD::FABS: 6260 return (MaxDepth > 0) && 6261 isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1); 6262 6263 case ISD::FSIN: 6264 case ISD::FCOS: 6265 case ISD::FSINCOS: 6266 return Op.getValueType().getScalarType() != MVT::f16; 6267 6268 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. 6269 // For such targets need to check their input recursively. 
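// Concretely, the min/max result may pass an unflushed denormal operand
// through, so an fcanonicalize of such a node can only be dropped when both
// inputs are themselves canonical, or when the subtarget supports the min/max
// denorm modes and neither input can be a NaN, as checked below.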
6270 case ISD::FMINNUM: 6271 case ISD::FMAXNUM: 6272 case ISD::FMINNAN: 6273 case ISD::FMAXNAN: 6274 6275 if (ST->supportsMinMaxDenormModes() && 6276 DAG.isKnownNeverNaN(Op.getOperand(0)) && 6277 DAG.isKnownNeverNaN(Op.getOperand(1))) 6278 return true; 6279 6280 return (MaxDepth > 0) && 6281 isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) && 6282 isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1); 6283 6284 case ISD::ConstantFP: { 6285 auto F = cast<ConstantFPSDNode>(Op)->getValueAPF(); 6286 return !F.isDenormal() && !(F.isNaN() && F.isSignaling()); 6287 } 6288 } 6289 return false; 6290 } 6291 6292 // Constant fold canonicalize. 6293 SDValue SITargetLowering::performFCanonicalizeCombine( 6294 SDNode *N, 6295 DAGCombinerInfo &DCI) const { 6296 SelectionDAG &DAG = DCI.DAG; 6297 ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0)); 6298 6299 if (!CFP) { 6300 SDValue N0 = N->getOperand(0); 6301 EVT VT = N0.getValueType().getScalarType(); 6302 auto ST = getSubtarget(); 6303 6304 if (((VT == MVT::f32 && ST->hasFP32Denormals()) || 6305 (VT == MVT::f64 && ST->hasFP64Denormals()) || 6306 (VT == MVT::f16 && ST->hasFP16Denormals())) && 6307 DAG.isKnownNeverNaN(N0)) 6308 return N0; 6309 6310 bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); 6311 6312 if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) && 6313 isCanonicalized(DAG, N0, ST)) 6314 return N0; 6315 6316 return SDValue(); 6317 } 6318 6319 const APFloat &C = CFP->getValueAPF(); 6320 6321 // Flush denormals to 0 if not enabled. 6322 if (C.isDenormal()) { 6323 EVT VT = N->getValueType(0); 6324 EVT SVT = VT.getScalarType(); 6325 if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals()) 6326 return DAG.getConstantFP(0.0, SDLoc(N), VT); 6327 6328 if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals()) 6329 return DAG.getConstantFP(0.0, SDLoc(N), VT); 6330 6331 if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals()) 6332 return DAG.getConstantFP(0.0, SDLoc(N), VT); 6333 } 6334 6335 if (C.isNaN()) { 6336 EVT VT = N->getValueType(0); 6337 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics()); 6338 if (C.isSignaling()) { 6339 // Quiet a signaling NaN. 6340 return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); 6341 } 6342 6343 // Make sure it is the canonical NaN bitpattern. 6344 // 6345 // TODO: Can we use -1 as the canonical NaN value since it's an inline 6346 // immediate? 
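// For example, an f32 quiet NaN constant with a nonstandard payload such as
// 0x7fc00001 is rewritten here to the canonical 0x7fc00000 bit pattern.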
6347 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt()) 6348 return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); 6349 } 6350 6351 return N->getOperand(0); 6352 } 6353 6354 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { 6355 switch (Opc) { 6356 case ISD::FMAXNUM: 6357 return AMDGPUISD::FMAX3; 6358 case ISD::SMAX: 6359 return AMDGPUISD::SMAX3; 6360 case ISD::UMAX: 6361 return AMDGPUISD::UMAX3; 6362 case ISD::FMINNUM: 6363 return AMDGPUISD::FMIN3; 6364 case ISD::SMIN: 6365 return AMDGPUISD::SMIN3; 6366 case ISD::UMIN: 6367 return AMDGPUISD::UMIN3; 6368 default: 6369 llvm_unreachable("Not a min/max opcode"); 6370 } 6371 } 6372 6373 SDValue SITargetLowering::performIntMed3ImmCombine( 6374 SelectionDAG &DAG, const SDLoc &SL, 6375 SDValue Op0, SDValue Op1, bool Signed) const { 6376 ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1); 6377 if (!K1) 6378 return SDValue(); 6379 6380 ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1)); 6381 if (!K0) 6382 return SDValue(); 6383 6384 if (Signed) { 6385 if (K0->getAPIntValue().sge(K1->getAPIntValue())) 6386 return SDValue(); 6387 } else { 6388 if (K0->getAPIntValue().uge(K1->getAPIntValue())) 6389 return SDValue(); 6390 } 6391 6392 EVT VT = K0->getValueType(0); 6393 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3; 6394 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) { 6395 return DAG.getNode(Med3Opc, SL, VT, 6396 Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); 6397 } 6398 6399 // If there isn't a 16-bit med3 operation, convert to 32-bit. 6400 MVT NVT = MVT::i32; 6401 unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 6402 6403 SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); 6404 SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); 6405 SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); 6406 6407 SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3); 6408 return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3); 6409 } 6410 6411 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) { 6412 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) 6413 return C; 6414 6415 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) { 6416 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode()) 6417 return C; 6418 } 6419 6420 return nullptr; 6421 } 6422 6423 SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, 6424 const SDLoc &SL, 6425 SDValue Op0, 6426 SDValue Op1) const { 6427 ConstantFPSDNode *K1 = getSplatConstantFP(Op1); 6428 if (!K1) 6429 return SDValue(); 6430 6431 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1)); 6432 if (!K0) 6433 return SDValue(); 6434 6435 // Ordered >= (although NaN inputs should have folded away by now). 6436 APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF()); 6437 if (Cmp == APFloat::cmpGreaterThan) 6438 return SDValue(); 6439 6440 // TODO: Check IEEE bit enabled? 6441 EVT VT = Op0.getValueType(); 6442 if (Subtarget->enableDX10Clamp()) { 6443 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the 6444 // hardware fmed3 behavior converting to a min. 6445 // FIXME: Should this be allowing -0.0? 6446 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0)) 6447 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0)); 6448 } 6449 6450 // med3 for f16 is only available on gfx9+, and not available for v2f16. 
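// For example, (fminnum (fmaxnum x, 2.0), 4.0) with an f32 x that is known
// not to be a signaling NaN is folded below to (fmed3 x, 2.0, 4.0); the
// K0 <= K1 ordering was already verified above.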
6451 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
6452 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
6453 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
6454 // then give the other result, which is different from med3 with a NaN
6455 // input.
6456 SDValue Var = Op0.getOperand(0);
6457 if (!isKnownNeverSNan(DAG, Var))
6458 return SDValue();
6459
6460 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
6461 Var, SDValue(K0, 0), SDValue(K1, 0));
6462 }
6463
6464 return SDValue();
6465 }
6466
6467 SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
6468 DAGCombinerInfo &DCI) const {
6469 SelectionDAG &DAG = DCI.DAG;
6470
6471 EVT VT = N->getValueType(0);
6472 unsigned Opc = N->getOpcode();
6473 SDValue Op0 = N->getOperand(0);
6474 SDValue Op1 = N->getOperand(1);
6475
6476 // Only do this if the inner op has one use since this will just increase
6477 // register pressure for no benefit.
6478
6479
6480 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
6481 !VT.isVector() && VT != MVT::f64 &&
6482 ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
6483 // max(max(a, b), c) -> max3(a, b, c)
6484 // min(min(a, b), c) -> min3(a, b, c)
6485 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
6486 SDLoc DL(N);
6487 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
6488 DL,
6489 N->getValueType(0),
6490 Op0.getOperand(0),
6491 Op0.getOperand(1),
6492 Op1);
6493 }
6494
6495 // Try commuted.
6496 // max(a, max(b, c)) -> max3(a, b, c)
6497 // min(a, min(b, c)) -> min3(a, b, c)
6498 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
6499 SDLoc DL(N);
6500 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
6501 DL,
6502 N->getValueType(0),
6503 Op0,
6504 Op1.getOperand(0),
6505 Op1.getOperand(1));
6506 }
6507 }
6508
6509 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
6510 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
6511 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
6512 return Med3;
6513 }
6514
6515 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
6516 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
6517 return Med3;
6518 }
6519
6520 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
6521 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
6522 (Opc == AMDGPUISD::FMIN_LEGACY &&
6523 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
6524 (VT == MVT::f32 || VT == MVT::f64 ||
6525 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
6526 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
6527 Op0.hasOneUse()) {
6528 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
6529 return Res;
6530 }
6531
6532 return SDValue();
6533 }
6534
6535 static bool isClampZeroToOne(SDValue A, SDValue B) {
6536 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
6537 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
6538 // FIXME: Should this be allowing -0.0?
6539 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
6540 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
6541 }
6542 }
6543
6544 return false;
6545 }
6546
6547 // FIXME: Should only worry about snans for version with chain.
6548 SDValue SITargetLowering::performFMed3Combine(SDNode *N,
6549 DAGCombinerInfo &DCI) const {
6550 EVT VT = N->getValueType(0);
6551 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
6552 // NaNs.
With a NaN input, the order of the operands may change the result. 6553 6554 SelectionDAG &DAG = DCI.DAG; 6555 SDLoc SL(N); 6556 6557 SDValue Src0 = N->getOperand(0); 6558 SDValue Src1 = N->getOperand(1); 6559 SDValue Src2 = N->getOperand(2); 6560 6561 if (isClampZeroToOne(Src0, Src1)) { 6562 // const_a, const_b, x -> clamp is safe in all cases including signaling 6563 // nans. 6564 // FIXME: Should this be allowing -0.0? 6565 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2); 6566 } 6567 6568 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother 6569 // handling no dx10-clamp? 6570 if (Subtarget->enableDX10Clamp()) { 6571 // If NaNs is clamped to 0, we are free to reorder the inputs. 6572 6573 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) 6574 std::swap(Src0, Src1); 6575 6576 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2)) 6577 std::swap(Src1, Src2); 6578 6579 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) 6580 std::swap(Src0, Src1); 6581 6582 if (isClampZeroToOne(Src1, Src2)) 6583 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0); 6584 } 6585 6586 return SDValue(); 6587 } 6588 6589 SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, 6590 DAGCombinerInfo &DCI) const { 6591 SDValue Src0 = N->getOperand(0); 6592 SDValue Src1 = N->getOperand(1); 6593 if (Src0.isUndef() && Src1.isUndef()) 6594 return DCI.DAG.getUNDEF(N->getValueType(0)); 6595 return SDValue(); 6596 } 6597 6598 SDValue SITargetLowering::performExtractVectorEltCombine( 6599 SDNode *N, DAGCombinerInfo &DCI) const { 6600 SDValue Vec = N->getOperand(0); 6601 6602 SelectionDAG &DAG = DCI.DAG; 6603 if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) { 6604 SDLoc SL(N); 6605 EVT EltVT = N->getValueType(0); 6606 SDValue Idx = N->getOperand(1); 6607 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, 6608 Vec.getOperand(0), Idx); 6609 return DAG.getNode(ISD::FNEG, SL, EltVT, Elt); 6610 } 6611 6612 return SDValue(); 6613 } 6614 6615 static bool convertBuildVectorCastElt(SelectionDAG &DAG, 6616 SDValue &Lo, SDValue &Hi) { 6617 if (Hi.getOpcode() == ISD::BITCAST && 6618 Hi.getOperand(0).getValueType() == MVT::f16 && 6619 (isa<ConstantSDNode>(Lo) || Lo.isUndef())) { 6620 Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo); 6621 Hi = Hi.getOperand(0); 6622 return true; 6623 } 6624 6625 return false; 6626 } 6627 6628 SDValue SITargetLowering::performBuildVectorCombine( 6629 SDNode *N, DAGCombinerInfo &DCI) const { 6630 SDLoc SL(N); 6631 6632 if (!isTypeLegal(MVT::v2i16)) 6633 return SDValue(); 6634 SelectionDAG &DAG = DCI.DAG; 6635 EVT VT = N->getValueType(0); 6636 6637 if (VT == MVT::v2i16) { 6638 SDValue Lo = N->getOperand(0); 6639 SDValue Hi = N->getOperand(1); 6640 6641 // v2i16 build_vector (const|undef), (bitcast f16:$x) 6642 // -> bitcast (v2f16 build_vector const|undef, $x 6643 if (convertBuildVectorCastElt(DAG, Lo, Hi)) { 6644 SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi }); 6645 return DAG.getNode(ISD::BITCAST, SL, VT, NewVec); 6646 } 6647 6648 if (convertBuildVectorCastElt(DAG, Hi, Lo)) { 6649 SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo }); 6650 return DAG.getNode(ISD::BITCAST, SL, VT, NewVec); 6651 } 6652 } 6653 6654 return SDValue(); 6655 } 6656 6657 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, 6658 const SDNode *N0, 6659 const SDNode *N1) const { 6660 EVT VT = N0->getValueType(0); 6661 6662 // Only do this if we are not trying to support denormals. 
v_mad_f32 does not 6663 // support denormals ever. 6664 if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || 6665 (VT == MVT::f16 && !Subtarget->hasFP16Denormals())) 6666 return ISD::FMAD; 6667 6668 const TargetOptions &Options = DAG.getTarget().Options; 6669 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || 6670 (N0->getFlags().hasUnsafeAlgebra() && 6671 N1->getFlags().hasUnsafeAlgebra())) && 6672 isFMAFasterThanFMulAndFAdd(VT)) { 6673 return ISD::FMA; 6674 } 6675 6676 return 0; 6677 } 6678 6679 static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, 6680 EVT VT, 6681 SDValue N0, SDValue N1, SDValue N2, 6682 bool Signed) { 6683 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32; 6684 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1); 6685 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2); 6686 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad); 6687 } 6688 6689 SDValue SITargetLowering::performAddCombine(SDNode *N, 6690 DAGCombinerInfo &DCI) const { 6691 SelectionDAG &DAG = DCI.DAG; 6692 EVT VT = N->getValueType(0); 6693 SDLoc SL(N); 6694 SDValue LHS = N->getOperand(0); 6695 SDValue RHS = N->getOperand(1); 6696 6697 if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) 6698 && Subtarget->hasMad64_32() && 6699 !VT.isVector() && VT.getScalarSizeInBits() > 32 && 6700 VT.getScalarSizeInBits() <= 64) { 6701 if (LHS.getOpcode() != ISD::MUL) 6702 std::swap(LHS, RHS); 6703 6704 SDValue MulLHS = LHS.getOperand(0); 6705 SDValue MulRHS = LHS.getOperand(1); 6706 SDValue AddRHS = RHS; 6707 6708 // TODO: Maybe restrict if SGPR inputs. 6709 if (numBitsUnsigned(MulLHS, DAG) <= 32 && 6710 numBitsUnsigned(MulRHS, DAG) <= 32) { 6711 MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32); 6712 MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32); 6713 AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64); 6714 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false); 6715 } 6716 6717 if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) { 6718 MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32); 6719 MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32); 6720 AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64); 6721 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true); 6722 } 6723 6724 return SDValue(); 6725 } 6726 6727 if (VT != MVT::i32) 6728 return SDValue(); 6729 6730 // add x, zext (setcc) => addcarry x, 0, setcc 6731 // add x, sext (setcc) => subcarry x, 0, setcc 6732 unsigned Opc = LHS.getOpcode(); 6733 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND || 6734 Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY) 6735 std::swap(RHS, LHS); 6736 6737 Opc = RHS.getOpcode(); 6738 switch (Opc) { 6739 default: break; 6740 case ISD::ZERO_EXTEND: 6741 case ISD::SIGN_EXTEND: 6742 case ISD::ANY_EXTEND: { 6743 auto Cond = RHS.getOperand(0); 6744 if (!isBoolSGPR(Cond)) 6745 break; 6746 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); 6747 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; 6748 Opc = (Opc == ISD::SIGN_EXTEND) ? 
ISD::SUBCARRY : ISD::ADDCARRY;
6749 return DAG.getNode(Opc, SL, VTList, Args);
6750 }
6751 case ISD::ADDCARRY: {
6752 // add x, (addcarry y, 0, cc) => addcarry x, y, cc
6753 auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
6754 if (!C || C->getZExtValue() != 0) break;
6755 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
6756 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
6757 }
6758 }
6759 return SDValue();
6760 }
6761
6762 SDValue SITargetLowering::performSubCombine(SDNode *N,
6763 DAGCombinerInfo &DCI) const {
6764 SelectionDAG &DAG = DCI.DAG;
6765 EVT VT = N->getValueType(0);
6766
6767 if (VT != MVT::i32)
6768 return SDValue();
6769
6770 SDLoc SL(N);
6771 SDValue LHS = N->getOperand(0);
6772 SDValue RHS = N->getOperand(1);
6773
6774 unsigned Opc = LHS.getOpcode();
6775 if (Opc != ISD::SUBCARRY)
6776 std::swap(RHS, LHS);
6777
6778 if (LHS.getOpcode() == ISD::SUBCARRY) {
6779 // sub (subcarry x, 0, cc), y => subcarry x, y, cc
6780 auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
6781 if (!C || C->getZExtValue() != 0)
6782 return SDValue();
6783 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
6784 return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
6785 }
6786 return SDValue();
6787 }
6788
6789 SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
6790 DAGCombinerInfo &DCI) const {
6791
6792 if (N->getValueType(0) != MVT::i32)
6793 return SDValue();
6794
6795 auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
6796 if (!C || C->getZExtValue() != 0)
6797 return SDValue();
6798
6799 SelectionDAG &DAG = DCI.DAG;
6800 SDValue LHS = N->getOperand(0);
6801
6802 // addcarry (add x, y), 0, cc => addcarry x, y, cc
6803 // subcarry (sub x, y), 0, cc => subcarry x, y, cc
6804 unsigned LHSOpc = LHS.getOpcode();
6805 unsigned Opc = N->getOpcode();
6806 if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
6807 (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
6808 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
6809 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
6810 }
6811 return SDValue();
6812 }
6813
6814 SDValue SITargetLowering::performFAddCombine(SDNode *N,
6815 DAGCombinerInfo &DCI) const {
6816 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
6817 return SDValue();
6818
6819 SelectionDAG &DAG = DCI.DAG;
6820 EVT VT = N->getValueType(0);
6821
6822 SDLoc SL(N);
6823 SDValue LHS = N->getOperand(0);
6824 SDValue RHS = N->getOperand(1);
6825
6826 // These should really be instruction patterns, but writing patterns with
6827 // source modifiers is a pain.
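// For example, for f32 with denormals disabled getFusedOpcode returns
// ISD::FMAD, so (fadd (fadd x, x), y) below becomes (fmad x, 2.0, y), a
// single v_mad_f32; when denormals are enabled, ISD::FMA may be used
// instead if the fast-math checks in getFusedOpcode allow it.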
6828 6829 // fadd (fadd (a, a), b) -> mad 2.0, a, b 6830 if (LHS.getOpcode() == ISD::FADD) { 6831 SDValue A = LHS.getOperand(0); 6832 if (A == LHS.getOperand(1)) { 6833 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode()); 6834 if (FusedOp != 0) { 6835 const SDValue Two = DAG.getConstantFP(2.0, SL, VT); 6836 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS); 6837 } 6838 } 6839 } 6840 6841 // fadd (b, fadd (a, a)) -> mad 2.0, a, b 6842 if (RHS.getOpcode() == ISD::FADD) { 6843 SDValue A = RHS.getOperand(0); 6844 if (A == RHS.getOperand(1)) { 6845 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode()); 6846 if (FusedOp != 0) { 6847 const SDValue Two = DAG.getConstantFP(2.0, SL, VT); 6848 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS); 6849 } 6850 } 6851 } 6852 6853 return SDValue(); 6854 } 6855 6856 SDValue SITargetLowering::performFSubCombine(SDNode *N, 6857 DAGCombinerInfo &DCI) const { 6858 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 6859 return SDValue(); 6860 6861 SelectionDAG &DAG = DCI.DAG; 6862 SDLoc SL(N); 6863 EVT VT = N->getValueType(0); 6864 assert(!VT.isVector()); 6865 6866 // Try to get the fneg to fold into the source modifier. This undoes generic 6867 // DAG combines and folds them into the mad. 6868 // 6869 // Only do this if we are not trying to support denormals. v_mad_f32 does 6870 // not support denormals ever. 6871 SDValue LHS = N->getOperand(0); 6872 SDValue RHS = N->getOperand(1); 6873 if (LHS.getOpcode() == ISD::FADD) { 6874 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) 6875 SDValue A = LHS.getOperand(0); 6876 if (A == LHS.getOperand(1)) { 6877 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode()); 6878 if (FusedOp != 0){ 6879 const SDValue Two = DAG.getConstantFP(2.0, SL, VT); 6880 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 6881 6882 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS); 6883 } 6884 } 6885 } 6886 6887 if (RHS.getOpcode() == ISD::FADD) { 6888 // (fsub c, (fadd a, a)) -> mad -2.0, a, c 6889 6890 SDValue A = RHS.getOperand(0); 6891 if (A == RHS.getOperand(1)) { 6892 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode()); 6893 if (FusedOp != 0){ 6894 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT); 6895 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS); 6896 } 6897 } 6898 } 6899 6900 return SDValue(); 6901 } 6902 6903 SDValue SITargetLowering::performSetCCCombine(SDNode *N, 6904 DAGCombinerInfo &DCI) const { 6905 SelectionDAG &DAG = DCI.DAG; 6906 SDLoc SL(N); 6907 6908 SDValue LHS = N->getOperand(0); 6909 SDValue RHS = N->getOperand(1); 6910 EVT VT = LHS.getValueType(); 6911 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 6912 6913 auto CRHS = dyn_cast<ConstantSDNode>(RHS); 6914 if (!CRHS) { 6915 CRHS = dyn_cast<ConstantSDNode>(LHS); 6916 if (CRHS) { 6917 std::swap(LHS, RHS); 6918 CC = getSetCCSwappedOperands(CC); 6919 } 6920 } 6921 6922 if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND && 6923 isBoolSGPR(LHS.getOperand(0))) { 6924 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 6925 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc 6926 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 6927 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc 6928 if ((CRHS->isAllOnesValue() && 6929 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) || 6930 (CRHS->isNullValue() && 6931 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) 6932 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), 6933 DAG.getConstant(-1, SL, MVT::i1)); 6934 
if ((CRHS->isAllOnesValue() && 6935 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || 6936 (CRHS->isNullValue() && 6937 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT))) 6938 return LHS.getOperand(0); 6939 } 6940 6941 if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() && 6942 VT != MVT::f16)) 6943 return SDValue(); 6944 6945 // Match isinf pattern 6946 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) 6947 if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { 6948 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 6949 if (!CRHS) 6950 return SDValue(); 6951 6952 const APFloat &APF = CRHS->getValueAPF(); 6953 if (APF.isInfinity() && !APF.isNegative()) { 6954 unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; 6955 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), 6956 DAG.getConstant(Mask, SL, MVT::i32)); 6957 } 6958 } 6959 6960 return SDValue(); 6961 } 6962 6963 SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, 6964 DAGCombinerInfo &DCI) const { 6965 SelectionDAG &DAG = DCI.DAG; 6966 SDLoc SL(N); 6967 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; 6968 6969 SDValue Src = N->getOperand(0); 6970 SDValue Srl = N->getOperand(0); 6971 if (Srl.getOpcode() == ISD::ZERO_EXTEND) 6972 Srl = Srl.getOperand(0); 6973 6974 // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero. 6975 if (Srl.getOpcode() == ISD::SRL) { 6976 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x 6977 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x 6978 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x 6979 6980 if (const ConstantSDNode *C = 6981 dyn_cast<ConstantSDNode>(Srl.getOperand(1))) { 6982 Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)), 6983 EVT(MVT::i32)); 6984 6985 unsigned SrcOffset = C->getZExtValue() + 8 * Offset; 6986 if (SrcOffset < 32 && SrcOffset % 8 == 0) { 6987 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL, 6988 MVT::f32, Srl); 6989 } 6990 } 6991 } 6992 6993 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); 6994 6995 KnownBits Known; 6996 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 6997 !DCI.isBeforeLegalizeOps()); 6998 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 6999 if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) || 7000 TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) { 7001 DCI.CommitTargetLoweringOpt(TLO); 7002 } 7003 7004 return SDValue(); 7005 } 7006 7007 SDValue SITargetLowering::PerformDAGCombine(SDNode *N, 7008 DAGCombinerInfo &DCI) const { 7009 switch (N->getOpcode()) { 7010 default: 7011 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 7012 case ISD::ADD: 7013 return performAddCombine(N, DCI); 7014 case ISD::SUB: 7015 return performSubCombine(N, DCI); 7016 case ISD::ADDCARRY: 7017 case ISD::SUBCARRY: 7018 return performAddCarrySubCarryCombine(N, DCI); 7019 case ISD::FADD: 7020 return performFAddCombine(N, DCI); 7021 case ISD::FSUB: 7022 return performFSubCombine(N, DCI); 7023 case ISD::SETCC: 7024 return performSetCCCombine(N, DCI); 7025 case ISD::FMAXNUM: 7026 case ISD::FMINNUM: 7027 case ISD::SMAX: 7028 case ISD::SMIN: 7029 case ISD::UMAX: 7030 case ISD::UMIN: 7031 case AMDGPUISD::FMIN_LEGACY: 7032 case AMDGPUISD::FMAX_LEGACY: { 7033 if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && 7034 getTargetMachine().getOptLevel() > CodeGenOpt::None) 7035 return performMinMaxCombine(N, DCI); 7036 break; 7037 } 7038 case ISD::LOAD: 7039 
case ISD::STORE: 7040 case ISD::ATOMIC_LOAD: 7041 case ISD::ATOMIC_STORE: 7042 case ISD::ATOMIC_CMP_SWAP: 7043 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: 7044 case ISD::ATOMIC_SWAP: 7045 case ISD::ATOMIC_LOAD_ADD: 7046 case ISD::ATOMIC_LOAD_SUB: 7047 case ISD::ATOMIC_LOAD_AND: 7048 case ISD::ATOMIC_LOAD_OR: 7049 case ISD::ATOMIC_LOAD_XOR: 7050 case ISD::ATOMIC_LOAD_NAND: 7051 case ISD::ATOMIC_LOAD_MIN: 7052 case ISD::ATOMIC_LOAD_MAX: 7053 case ISD::ATOMIC_LOAD_UMIN: 7054 case ISD::ATOMIC_LOAD_UMAX: 7055 case AMDGPUISD::ATOMIC_INC: 7056 case AMDGPUISD::ATOMIC_DEC: 7057 case AMDGPUISD::ATOMIC_LOAD_FADD: 7058 case AMDGPUISD::ATOMIC_LOAD_FMIN: 7059 case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics. 7060 if (DCI.isBeforeLegalize()) 7061 break; 7062 return performMemSDNodeCombine(cast<MemSDNode>(N), DCI); 7063 case ISD::AND: 7064 return performAndCombine(N, DCI); 7065 case ISD::OR: 7066 return performOrCombine(N, DCI); 7067 case ISD::XOR: 7068 return performXorCombine(N, DCI); 7069 case ISD::ZERO_EXTEND: 7070 return performZeroExtendCombine(N, DCI); 7071 case AMDGPUISD::FP_CLASS: 7072 return performClassCombine(N, DCI); 7073 case ISD::FCANONICALIZE: 7074 return performFCanonicalizeCombine(N, DCI); 7075 case AMDGPUISD::FRACT: 7076 case AMDGPUISD::RCP: 7077 case AMDGPUISD::RSQ: 7078 case AMDGPUISD::RCP_LEGACY: 7079 case AMDGPUISD::RSQ_LEGACY: 7080 case AMDGPUISD::RSQ_CLAMP: 7081 case AMDGPUISD::LDEXP: { 7082 SDValue Src = N->getOperand(0); 7083 if (Src.isUndef()) 7084 return Src; 7085 break; 7086 } 7087 case ISD::SINT_TO_FP: 7088 case ISD::UINT_TO_FP: 7089 return performUCharToFloatCombine(N, DCI); 7090 case AMDGPUISD::CVT_F32_UBYTE0: 7091 case AMDGPUISD::CVT_F32_UBYTE1: 7092 case AMDGPUISD::CVT_F32_UBYTE2: 7093 case AMDGPUISD::CVT_F32_UBYTE3: 7094 return performCvtF32UByteNCombine(N, DCI); 7095 case AMDGPUISD::FMED3: 7096 return performFMed3Combine(N, DCI); 7097 case AMDGPUISD::CVT_PKRTZ_F16_F32: 7098 return performCvtPkRTZCombine(N, DCI); 7099 case ISD::SCALAR_TO_VECTOR: { 7100 SelectionDAG &DAG = DCI.DAG; 7101 EVT VT = N->getValueType(0); 7102 7103 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x)) 7104 if (VT == MVT::v2i16 || VT == MVT::v2f16) { 7105 SDLoc SL(N); 7106 SDValue Src = N->getOperand(0); 7107 EVT EltVT = Src.getValueType(); 7108 if (EltVT == MVT::f16) 7109 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src); 7110 7111 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src); 7112 return DAG.getNode(ISD::BITCAST, SL, VT, Ext); 7113 } 7114 7115 break; 7116 } 7117 case ISD::EXTRACT_VECTOR_ELT: 7118 return performExtractVectorEltCombine(N, DCI); 7119 case ISD::BUILD_VECTOR: 7120 return performBuildVectorCombine(N, DCI); 7121 } 7122 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 7123 } 7124 7125 /// \brief Helper function for adjustWritemask 7126 static unsigned SubIdx2Lane(unsigned Idx) { 7127 switch (Idx) { 7128 default: return 0; 7129 case AMDGPU::sub0: return 0; 7130 case AMDGPU::sub1: return 1; 7131 case AMDGPU::sub2: return 2; 7132 case AMDGPU::sub3: return 3; 7133 } 7134 } 7135 7136 /// \brief Adjust the writemask of MIMG instructions 7137 SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, 7138 SelectionDAG &DAG) const { 7139 SDNode *Users[4] = { nullptr }; 7140 unsigned Lane = 0; 7141 unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 
2 : 3; 7142 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); 7143 unsigned NewDmask = 0; 7144 bool HasChain = Node->getNumValues() > 1; 7145 7146 if (OldDmask == 0) { 7147 // These are folded out, but on the chance it happens don't assert. 7148 return Node; 7149 } 7150 7151 // Try to figure out the used register components 7152 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); 7153 I != E; ++I) { 7154 7155 // Don't look at users of the chain. 7156 if (I.getUse().getResNo() != 0) 7157 continue; 7158 7159 // Abort if we can't understand the usage 7160 if (!I->isMachineOpcode() || 7161 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) 7162 return Node; 7163 7164 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used. 7165 // Note that subregs are packed, i.e. Lane==0 is the first bit set 7166 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit 7167 // set, etc. 7168 Lane = SubIdx2Lane(I->getConstantOperandVal(1)); 7169 7170 // Set which texture component corresponds to the lane. 7171 unsigned Comp; 7172 for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { 7173 Comp = countTrailingZeros(Dmask); 7174 Dmask &= ~(1 << Comp); 7175 } 7176 7177 // Abort if we have more than one user per component 7178 if (Users[Lane]) 7179 return Node; 7180 7181 Users[Lane] = *I; 7182 NewDmask |= 1 << Comp; 7183 } 7184 7185 // Abort if there's no change 7186 if (NewDmask == OldDmask) 7187 return Node; 7188 7189 unsigned BitsSet = countPopulation(NewDmask); 7190 7191 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 7192 int NewOpcode = AMDGPU::getMaskedMIMGOp(*TII, 7193 Node->getMachineOpcode(), BitsSet); 7194 assert(NewOpcode != -1 && 7195 NewOpcode != static_cast<int>(Node->getMachineOpcode()) && 7196 "failed to find equivalent MIMG op"); 7197 7198 // Adjust the writemask in the node 7199 SmallVector<SDValue, 12> Ops; 7200 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx); 7201 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); 7202 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end()); 7203 7204 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT(); 7205 7206 MVT ResultVT = BitsSet == 1 ? 7207 SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet); 7208 SDVTList NewVTList = HasChain ? 7209 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT); 7210 7211 7212 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node), 7213 NewVTList, Ops); 7214 7215 if (HasChain) { 7216 // Update chain. 
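// Carry the memory operands over to the new MIMG node and redirect all
// users of the old node's chain value to the new node's chain result.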
7217 NewNode->setMemRefs(Node->memoperands_begin(), Node->memoperands_end());
7218 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
7219 }
7220
7221 if (BitsSet == 1) {
7222 assert(Node->hasNUsesOfValue(1, 0));
7223 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
7224 SDLoc(Node), Users[Lane]->getValueType(0),
7225 SDValue(NewNode, 0));
7226 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
7227 return nullptr;
7228 }
7229
7230 // Update the users of the node with the new indices
7231 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
7232 SDNode *User = Users[i];
7233 if (!User)
7234 continue;
7235
7236 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
7237 DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
7238
7239 switch (Idx) {
7240 default: break;
7241 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
7242 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
7243 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
7244 }
7245 }
7246
7247 DAG.RemoveDeadNode(Node);
7248 return nullptr;
7249 }
7250
7251 static bool isFrameIndexOp(SDValue Op) {
7252 if (Op.getOpcode() == ISD::AssertZext)
7253 Op = Op.getOperand(0);
7254
7255 return isa<FrameIndexSDNode>(Op);
7256 }
7257
7258 /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
7259 /// with frame index operands.
7260 /// LLVM assumes that inputs to these instructions are registers.
7261 SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
7262 SelectionDAG &DAG) const {
7263 if (Node->getOpcode() == ISD::CopyToReg) {
7264 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
7265 SDValue SrcVal = Node->getOperand(2);
7266
7267 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
7268 // to try understanding copies to physical registers.
7269 if (SrcVal.getValueType() == MVT::i1 &&
7270 TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
7271 SDLoc SL(Node);
7272 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
7273 SDValue VReg = DAG.getRegister(
7274 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
7275
7276 SDNode *Glued = Node->getGluedNode();
7277 SDValue ToVReg
7278 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
7279 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
7280 SDValue ToResultReg
7281 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
7282 VReg, ToVReg.getValue(1));
7283 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
7284 DAG.RemoveDeadNode(Node);
7285 return ToResultReg.getNode();
7286 }
7287 }
7288
7289 SmallVector<SDValue, 8> Ops;
7290 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
7291 if (!isFrameIndexOp(Node->getOperand(i))) {
7292 Ops.push_back(Node->getOperand(i));
7293 continue;
7294 }
7295
7296 SDLoc DL(Node);
7297 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
7298 Node->getOperand(i).getValueType(),
7299 Node->getOperand(i)), 0));
7300 }
7301
7302 return DAG.UpdateNodeOperands(Node, Ops);
7303 }
7304
7305 /// \brief Fold the instructions after selecting them.
7306 /// Returns null if users were already updated.
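/// For MIMG loads this may shrink the dmask via adjustWritemask; for
/// V_DIV_SCALE_F32/F64 it ties undefined sources to a single register so
/// the src0 == src1-or-src2 operand constraint is satisfied.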
7307 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, 7308 SelectionDAG &DAG) const { 7309 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 7310 unsigned Opcode = Node->getMachineOpcode(); 7311 7312 if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && 7313 !TII->isGather4(Opcode) && !TII->isD16(Opcode)) { 7314 return adjustWritemask(Node, DAG); 7315 } 7316 7317 if (Opcode == AMDGPU::INSERT_SUBREG || 7318 Opcode == AMDGPU::REG_SEQUENCE) { 7319 legalizeTargetIndependentNode(Node, DAG); 7320 return Node; 7321 } 7322 7323 switch (Opcode) { 7324 case AMDGPU::V_DIV_SCALE_F32: 7325 case AMDGPU::V_DIV_SCALE_F64: { 7326 // Satisfy the operand register constraint when one of the inputs is 7327 // undefined. Ordinarily each undef value will have its own implicit_def of 7328 // a vreg, so force these to use a single register. 7329 SDValue Src0 = Node->getOperand(0); 7330 SDValue Src1 = Node->getOperand(1); 7331 SDValue Src2 = Node->getOperand(2); 7332 7333 if ((Src0.isMachineOpcode() && 7334 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) && 7335 (Src0 == Src1 || Src0 == Src2)) 7336 break; 7337 7338 MVT VT = Src0.getValueType().getSimpleVT(); 7339 const TargetRegisterClass *RC = getRegClassFor(VT); 7340 7341 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 7342 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT); 7343 7344 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), 7345 UndefReg, Src0, SDValue()); 7346 7347 // src0 must be the same register as src1 or src2, even if the value is 7348 // undefined, so make sure we don't violate this constraint. 7349 if (Src0.isMachineOpcode() && 7350 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) { 7351 if (Src1.isMachineOpcode() && 7352 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) 7353 Src0 = Src1; 7354 else if (Src2.isMachineOpcode() && 7355 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) 7356 Src0 = Src2; 7357 else { 7358 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF); 7359 Src0 = UndefReg; 7360 Src1 = UndefReg; 7361 } 7362 } else 7363 break; 7364 7365 SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 }; 7366 for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I) 7367 Ops.push_back(Node->getOperand(I)); 7368 7369 Ops.push_back(ImpDef.getValue(1)); 7370 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); 7371 } 7372 default: 7373 break; 7374 } 7375 7376 return Node; 7377 } 7378 7379 /// \brief Assign the register class depending on the number of 7380 /// bits set in the writemask 7381 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 7382 SDNode *Node) const { 7383 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 7384 7385 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 7386 7387 if (TII->isVOP3(MI.getOpcode())) { 7388 // Make sure constant bus requirements are respected. 7389 TII->legalizeOperandsVOP3(MRI, MI); 7390 return; 7391 } 7392 7393 // Replace unused atomics with the no return version. 7394 int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode()); 7395 if (NoRetAtomicOp != -1) { 7396 if (!Node->hasAnyUseOfValue(0)) { 7397 MI.setDesc(TII->get(NoRetAtomicOp)); 7398 MI.RemoveOperand(0); 7399 return; 7400 } 7401 7402 // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg 7403 // instruction, because the return type of these instructions is a vec2 of 7404 // the memory type, so it can be tied to the input operand. 
7405 // This means these instructions always have a use, so we need to add a 7406 // special case to check if the atomic has only one extract_subreg use, 7407 // which itself has no uses. 7408 if ((Node->hasNUsesOfValue(1, 0) && 7409 Node->use_begin()->isMachineOpcode() && 7410 Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG && 7411 !Node->use_begin()->hasAnyUseOfValue(0))) { 7412 unsigned Def = MI.getOperand(0).getReg(); 7413 7414 // Change this into a noret atomic. 7415 MI.setDesc(TII->get(NoRetAtomicOp)); 7416 MI.RemoveOperand(0); 7417 7418 // If we only remove the def operand from the atomic instruction, the 7419 // extract_subreg will be left with a use of a vreg without a def. 7420 // So we need to insert an implicit_def to avoid machine verifier 7421 // errors. 7422 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), 7423 TII->get(AMDGPU::IMPLICIT_DEF), Def); 7424 } 7425 return; 7426 } 7427 } 7428 7429 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, 7430 uint64_t Val) { 7431 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); 7432 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); 7433 } 7434 7435 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, 7436 const SDLoc &DL, 7437 SDValue Ptr) const { 7438 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 7439 7440 // Build the half of the subregister with the constants before building the 7441 // full 128-bit register. If we are building multiple resource descriptors, 7442 // this will allow CSEing of the 2-component register. 7443 const SDValue Ops0[] = { 7444 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), 7445 buildSMovImm32(DAG, DL, 0), 7446 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 7447 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), 7448 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) 7449 }; 7450 7451 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, 7452 MVT::v2i32, Ops0), 0); 7453 7454 // Combine the constants and the pointer. 7455 const SDValue Ops1[] = { 7456 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), 7457 Ptr, 7458 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), 7459 SubRegHi, 7460 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) 7461 }; 7462 7463 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); 7464 } 7465 7466 /// \brief Return a resource descriptor with the 'Add TID' bit enabled 7467 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] 7468 /// of the resource descriptor) to create an offset, which is added to 7469 /// the resource pointer. 
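/// Roughly, the descriptor assembled below is laid out as:
///   dword0 = pointer bits [31:0]
///   dword1 = pointer bits [63:32] | RsrcDword1 (which can carry the
///            stride bits [61:48] mentioned above)
///   dword2 = RsrcDword2And3 & 0xffffffff
///   dword3 = RsrcDword2And3 >> 32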
7470 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, 7471 SDValue Ptr, uint32_t RsrcDword1, 7472 uint64_t RsrcDword2And3) const { 7473 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); 7474 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); 7475 if (RsrcDword1) { 7476 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, 7477 DAG.getConstant(RsrcDword1, DL, MVT::i32)), 7478 0); 7479 } 7480 7481 SDValue DataLo = buildSMovImm32(DAG, DL, 7482 RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); 7483 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); 7484 7485 const SDValue Ops[] = { 7486 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), 7487 PtrLo, 7488 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 7489 PtrHi, 7490 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), 7491 DataLo, 7492 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), 7493 DataHi, 7494 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) 7495 }; 7496 7497 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); 7498 } 7499 7500 //===----------------------------------------------------------------------===// 7501 // SI Inline Assembly Support 7502 //===----------------------------------------------------------------------===// 7503 7504 std::pair<unsigned, const TargetRegisterClass *> 7505 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 7506 StringRef Constraint, 7507 MVT VT) const { 7508 if (!isTypeLegal(VT)) 7509 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 7510 7511 if (Constraint.size() == 1) { 7512 switch (Constraint[0]) { 7513 case 's': 7514 case 'r': 7515 switch (VT.getSizeInBits()) { 7516 default: 7517 return std::make_pair(0U, nullptr); 7518 case 32: 7519 case 16: 7520 return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass); 7521 case 64: 7522 return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); 7523 case 128: 7524 return std::make_pair(0U, &AMDGPU::SReg_128RegClass); 7525 case 256: 7526 return std::make_pair(0U, &AMDGPU::SReg_256RegClass); 7527 case 512: 7528 return std::make_pair(0U, &AMDGPU::SReg_512RegClass); 7529 } 7530 7531 case 'v': 7532 switch (VT.getSizeInBits()) { 7533 default: 7534 return std::make_pair(0U, nullptr); 7535 case 32: 7536 case 16: 7537 return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); 7538 case 64: 7539 return std::make_pair(0U, &AMDGPU::VReg_64RegClass); 7540 case 96: 7541 return std::make_pair(0U, &AMDGPU::VReg_96RegClass); 7542 case 128: 7543 return std::make_pair(0U, &AMDGPU::VReg_128RegClass); 7544 case 256: 7545 return std::make_pair(0U, &AMDGPU::VReg_256RegClass); 7546 case 512: 7547 return std::make_pair(0U, &AMDGPU::VReg_512RegClass); 7548 } 7549 } 7550 } 7551 7552 if (Constraint.size() > 1) { 7553 const TargetRegisterClass *RC = nullptr; 7554 if (Constraint[1] == 'v') { 7555 RC = &AMDGPU::VGPR_32RegClass; 7556 } else if (Constraint[1] == 's') { 7557 RC = &AMDGPU::SGPR_32RegClass; 7558 } 7559 7560 if (RC) { 7561 uint32_t Idx; 7562 bool Failed = Constraint.substr(2).getAsInteger(10, Idx); 7563 if (!Failed && Idx < RC->getNumRegs()) 7564 return std::make_pair(RC->getRegister(Idx), RC); 7565 } 7566 } 7567 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 7568 } 7569 7570 SITargetLowering::ConstraintType 7571 SITargetLowering::getConstraintType(StringRef Constraint) const { 7572 if (Constraint.size() == 1) { 7573 switch (Constraint[0]) { 7574 default: break; 7575 case 's': 
7576 case 'v': 7577 return C_RegisterClass; 7578 } 7579 } 7580 return TargetLowering::getConstraintType(Constraint); 7581 } 7582 7583 // Figure out which registers should be reserved for stack access. Only after 7584 // the function is legalized do we know all of the non-spill stack objects or if 7585 // calls are present. 7586 void SITargetLowering::finalizeLowering(MachineFunction &MF) const { 7587 MachineRegisterInfo &MRI = MF.getRegInfo(); 7588 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 7589 const MachineFrameInfo &MFI = MF.getFrameInfo(); 7590 const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); 7591 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 7592 7593 if (Info->isEntryFunction()) { 7594 // Callable functions have fixed registers used for stack access. 7595 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); 7596 } 7597 7598 // We have to assume the SP is needed in case there are calls in the function 7599 // during lowering. Calls are only detected after the function is 7600 // lowered. We're about to reserve registers, so don't bother using it if we 7601 // aren't really going to use it. 7602 bool NeedSP = !Info->isEntryFunction() || 7603 MFI.hasVarSizedObjects() || 7604 MFI.hasCalls(); 7605 7606 if (NeedSP) { 7607 unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF); 7608 Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg); 7609 7610 assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg()); 7611 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), 7612 Info->getStackPtrOffsetReg())); 7613 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); 7614 } 7615 7616 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg()); 7617 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); 7618 MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, 7619 Info->getScratchWaveOffsetReg()); 7620 7621 TargetLoweringBase::finalizeLowering(MF); 7622 } 7623 7624 void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, 7625 KnownBits &Known, 7626 const APInt &DemandedElts, 7627 const SelectionDAG &DAG, 7628 unsigned Depth) const { 7629 TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts, 7630 DAG, Depth); 7631 7632 if (getSubtarget()->enableHugePrivateBuffer()) 7633 return; 7634 7635 // Technically it may be possible to have a dispatch with a single workitem 7636 // that uses the full private memory size, but that's not really useful. We 7637 // can't use vaddr in MUBUF instructions if we don't know the address 7638 // calculation won't overflow, so assume the sign bit is never set. 7639 Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits); 7640 } 7641