//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#define AMDGPU_LOG2E_F 1.44269504088896340735992468100189214f
#define AMDGPU_LN2_F 0.693147180559945309417232121458176568f
#define AMDGPU_LN10_F 2.30258509299404568401799145468436421f

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "R600MachineFunctionInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
                           CCValAssign::LocInfo LocInfo,
                           ISD::ArgFlagsTy ArgFlags, CCState &State,
                           const TargetRegisterClass *RC,
                           unsigned NumRegs) {
  ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
  unsigned RegResult = State.AllocateReg(RegList);
  if (RegResult == AMDGPU::NoRegister)
    return false;

  State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
  return true;
}

static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
                              CCValAssign::LocInfo LocInfo,
                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
  switch (LocVT.SimpleTy) {
  case MVT::i64:
  case MVT::f64:
  case MVT::v2i32:
  case MVT::v2f32:
  case MVT::v4i16:
  case MVT::v4f16: {
    // Up to SGPR0-SGPR39
    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                          &AMDGPU::SGPR_64RegClass, 20);
  }
  default:
    return false;
  }
}

// Allocate up to VGPR31.
//
// TODO: Since there are no VGPR alignment requirements would it be better to
// split into individual scalar registers?
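//
// The register counts passed to allocateCCRegs below (31, 29, 25 and 17) keep
// the highest register touched by any allocated tuple at VGPR31, matching the
// limit stated above.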
static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
                              CCValAssign::LocInfo LocInfo,
                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
  switch (LocVT.SimpleTy) {
  case MVT::i64:
  case MVT::f64:
  case MVT::v2i32:
  case MVT::v2f32:
  case MVT::v4i16:
  case MVT::v4f16: {
    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                          &AMDGPU::VReg_64RegClass, 31);
  }
  case MVT::v4i32:
  case MVT::v4f32:
  case MVT::v2i64:
  case MVT::v2f64: {
    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                          &AMDGPU::VReg_128RegClass, 29);
  }
  case MVT::v8i32:
  case MVT::v8f32: {
    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                          &AMDGPU::VReg_256RegClass, 25);
  }
  case MVT::v16i32:
  case MVT::v16f32: {
    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                          &AMDGPU::VReg_512RegClass, 17);
  }
  default:
    return false;
  }
}

#include "AMDGPUGenCallingConv.inc"

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  KnownBits Known = DAG.computeKnownBits(Op);
  return VT.getSizeInBits() - Known.countMinLeadingZeros();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
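  // Note: setLoadExtAction takes the extension kind, the result type, the
  // memory type, and the action, in that order.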
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
  }

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);

    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);

    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
  }

  for (MVT VT : MVT::integer_vector_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
  }

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);

  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v3f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setOperationAction(ISD::Constant, MVT::i32, Legal);
  setOperationAction(ISD::Constant, MVT::i64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  // This is totally unsupported, just custom lower to produce an error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction(ISD::FCEIL, MVT::f32, Legal);
  setOperationAction(ISD::FEXP2, MVT::f32, Legal);
  setOperationAction(ISD::FPOW, MVT::f32, Legal);
  setOperationAction(ISD::FLOG2, MVT::f32, Legal);
  setOperationAction(ISD::FABS, MVT::f32, Legal);
  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
  setOperationAction(ISD::FRINT, MVT::f32, Legal);
  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
  setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);

  setOperationAction(ISD::FROUND, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, MVT::f64, Custom);

  setOperationAction(ISD::FLOG, MVT::f32, Custom);
  setOperationAction(ISD::FLOG10, MVT::f32, Custom);
  setOperationAction(ISD::FEXP, MVT::f32, Custom);

  setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
  setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Custom);
  setOperationAction(ISD::FREM, MVT::f64, Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
  setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand.
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);

    // The GPU has no divrem instruction for signed or unsigned division.
    setOperationAction(ISD::SDIVREM, VT, Custom);
    setOperationAction(ISD::UDIVREM, VT, Custom);

    // The GPU has no single instruction for [S|U]MUL_LOHI.
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE.
    setOperationAction(ISD::ADDC, VT, Legal);
    setOperationAction(ISD::SUBC, VT, Legal);
    setOperationAction(ISD::ADDE, VT, Legal);
    setOperationAction(ISD::SUBE, VT, Legal);
  }

  // The hardware supports 32-bit ROTR, but not ROTL.
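  // Consequently only 32-bit ROTR is left at its default action; ROTL and the
  // 64-bit rotates are expanded below.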
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction(ISD::SMIN, MVT::i32, Legal);
  setOperationAction(ISD::UMIN, MVT::i32, Legal);
  setOperationAction(ISD::SMAX, MVT::i32, Legal);
  setOperationAction(ISD::UMAX, MVT::i32, Legal);

  setOperationAction(ISD::CTTZ, MVT::i64, Custom);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
  setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);

  static const MVT::SimpleValueType VectorIntTypes[] = {
    MVT::v2i32, MVT::v3i32, MVT::v4i32
  };

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction(ISD::ADD, VT, Expand);
    setOperationAction(ISD::AND, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::MUL, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::OR, VT, Expand);
    setOperationAction(ISD::SHL, VT, Expand);
    setOperationAction(ISD::SRA, VT, Expand);
    setOperationAction(ISD::SRL, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::SUB, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Custom);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::XOR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
    MVT::v2f32, MVT::v3f32, MVT::v4f32
  };

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FMINNUM, VT, Expand);
    setOperationAction(ISD::FMAXNUM, VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FCANONICALIZE, VT, Expand);
  }

  // This causes an unrolled select operation to be used rather than expansion
  // with bit operations. This is in general better, but the alternative using
  // BFI instructions may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  // There are no libcalls of any kind.
  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
    setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For
  // now, we don't have a way of knowing during instruction selection if a
  // condition will be uniform, so we always use vector compares. Assume we are
  // using vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
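  // Setting the thresholds to the maximum value effectively removes the
  // store-count limits for these expansions.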
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  setTargetDAGCombine(ISD::BITCAST);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::MULHU);
  setTargetDAGCombine(ISD::MULHS);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FNEG);
  setTargetDAGCombine(ISD::FABS);
  setTargetDAGCombine(ISD::AssertZext);
  setTargetDAGCombine(ISD::AssertSext);
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOp(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    return true;
  default:
    return false;
  }
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return N->getNumOperands() > 2 || VT == MVT::f64;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::SELECT:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::INTERP_P1:
  case AMDGPUISD::INTERP_P2:
  case AMDGPUISD::DIV_SCALE:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding anyway, so
  // for them a source modifier is truly free in all cases. If there are
  // multiple users and each one would require switching to VOP3, there will be
  // a code size increase. Try to avoid increasing code size unless we know it
  // will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  // XXX - Should this limit the number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32- and 64-bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load, this is always better.
  if (NewSize == 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // The scalar engine cannot do sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) &&
        AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub-32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old load already had to be an extload, there's no harm in
  // continuing to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
                                                   EVT CastTy) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  return (LScalarSize < CastScalarSize) ||
         (CastScalarSize >= 32);
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably
// also profitable with the expansion for 64-bit values, since it's generally
// good to speculate things.
// FIXME: These should really have the size as a parameter.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  default:
    return false;
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    switch (IntrID) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
      return true;
    }
  }
    break;
  case ISD::LOAD: {
    const LoadSDNode *L = dyn_cast<LoadSDNode>(N);
    if (L->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  }
    break;
  }
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && VT == MVT::f16);
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && VT == MVT::f16) ||
         (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any
  // vector operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into
  // a super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For
  // all practical purposes, the extra mov 0 to load a 64-bit value is free. As
  // used, this will enable reducing 64-bit operations to 32-bit, which is
  // always good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  return isZExtFree(Val.getValueType(), VT2);
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, only pairs of 32-bit ones, and only
  // a limited number of native 64-bit operations. Shrinking an operation to
  // fit in a single 32-bit register should always be helpful. As currently
  // used, this is much less general than the name suggests, and is only used
  // in places trying to reduce the sizes of loads. Shrinking loads to
  // < 32-bits is not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original type sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments().

/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
  CCState &State,
  const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
  CallingConv::ID CC = Fn.getCallingConv();

  unsigned MaxAlign = 1;
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getParent()->getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    Type *BaseArgTy = Arg.getType();
    unsigned Align = DL.getABITypeAlignment(BaseArgTy);
    MaxAlign = std::max(Align, MaxAlign);
    unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;

    // We're basically throwing away everything passed into us and starting
    // over to get accurate in-memory offsets. The "PartOffset" is completely
    // useless to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different element size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 arguments.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        assert(MemVT.getVectorNumElements() == 3 ||
               MemVT.getVectorNumElements() == 5);
        MemVT = MemVT.getPow2VectorType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}

SDValue AMDGPUTargetLowering::LowerReturn(
  SDValue Chain, CallingConv::ID CallConv,
  bool isVarArg,
  const SmallVectorImpl<ISD::OutputArg> &Outs,
  const SmallVectorImpl<SDValue> &OutVals,
  const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  //       "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument whose frame object overlaps the
  // clobbered one.
  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
                            UE = DAG.getEntryNode().getNode()->use_end();
       U != UE; ++U) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
    Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
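  // FLOG and FLOG10 are lowered in terms of the native log2. The scale factors
  // below are ln(2), written as 1 / log2(e), and log10(2), written as
  // ln(2) / ln(10).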
  case ISD::FLOG:
    return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F);
  case ISD::FLOG10:
    return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);
  case ISD::FEXP:
    return lowerFEXP(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  default:
    return;
  }
}

static bool hasDefinedInitializer(const GlobalValue *GV) {
  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer())
    return false;

  return !isa<UndefValue>(GVar->getInitializer());
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function",
        SDLoc(Op).getDebugLoc());
      DAG.getContext()->diagnose(BadLDSDecl);
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    if (!hasDefinedInitializer(GV)) {
      unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
      return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
    }
  }

  const Function &Fn = DAG.getMachineFunction().getFunction();
  DiagnosticInfoUnsupported BadInit(
    Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(BadInit);
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;

  EVT VT = Op.getValueType();
  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
    SDLoc SL(Op);
    SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
    SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));

    SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
    return DAG.getNode(ISD::BITCAST, SL, VT, BV);
  }

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {

  SmallVector<SDValue, 8> Args;
  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  EVT VT = Op.getValueType();
  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}

/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
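    //
    // e.g. select (setolt x, y), x, y  -->  fmin_legacy(x, y)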
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  return std::make_pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
}

SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}

// Split a vector type into two parts. The first part is a power-of-two vector.
// The second part is whatever is left over, and is a scalar if it would
// otherwise be a 1-vector.
std::pair<EVT, EVT>
AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
  EVT LoVT, HiVT;
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
  HiVT = NumElts - LoNumElts == 1
             ? EltVT
             : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
  return std::make_pair(LoVT, HiVT);
}

// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                  const EVT &LoVT, const EVT &HiVT,
                                  SelectionDAG &DAG) const {
  assert(LoVT.getVectorNumElements() +
             (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
             N.getValueType().getVectorNumElements() &&
         "More vector elements requested than available!");
  auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
                           DAG.getConstant(0, DL, IdxTy));
  SDValue Hi = DAG.getNode(
      HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
      HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy));
  return std::make_pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorLoad(Load, DAG);

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);

  unsigned Size = LoMemVT.getStoreSize();
  unsigned BaseAlign = Load->getAlignment();
  unsigned HiAlign = MinAlign(BaseAlign, Size);

  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case where the vector length is a power of two, so it was
    // evenly split.
    Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
                       DAG.getConstant(0, SL, IdxTy));
    Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR
                                       : ISD::INSERT_VECTOR_ELT,
                       SL, VT, Join, HiLoad,
                       DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy));
  }

  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                     LoLoad.getValue(1), HiLoad.getValue(1))};

  return DAG.getMergeValues(Ops, SL);
}

// Widen a vector load from vec3 to vec4.
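//
// The wide load reads four elements; the original three are then extracted
// with EXTRACT_SUBVECTOR and the wide load's chain value is returned
// unchanged.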
1482 SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op, 1483 SelectionDAG &DAG) const { 1484 LoadSDNode *Load = cast<LoadSDNode>(Op); 1485 EVT VT = Op.getValueType(); 1486 assert(VT.getVectorNumElements() == 3); 1487 SDValue BasePtr = Load->getBasePtr(); 1488 EVT MemVT = Load->getMemoryVT(); 1489 SDLoc SL(Op); 1490 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); 1491 unsigned BaseAlign = Load->getAlignment(); 1492 1493 EVT WideVT = 1494 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4); 1495 EVT WideMemVT = 1496 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4); 1497 SDValue WideLoad = DAG.getExtLoad( 1498 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue, 1499 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags()); 1500 return DAG.getMergeValues( 1501 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad, 1502 DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))), 1503 WideLoad.getValue(1)}, 1504 SL); 1505 } 1506 1507 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, 1508 SelectionDAG &DAG) const { 1509 StoreSDNode *Store = cast<StoreSDNode>(Op); 1510 SDValue Val = Store->getValue(); 1511 EVT VT = Val.getValueType(); 1512 1513 // If this is a 2 element vector, we really want to scalarize and not create 1514 // weird 1 element vectors. 1515 if (VT.getVectorNumElements() == 2) 1516 return scalarizeVectorStore(Store, DAG); 1517 1518 EVT MemVT = Store->getMemoryVT(); 1519 SDValue Chain = Store->getChain(); 1520 SDValue BasePtr = Store->getBasePtr(); 1521 SDLoc SL(Op); 1522 1523 EVT LoVT, HiVT; 1524 EVT LoMemVT, HiMemVT; 1525 SDValue Lo, Hi; 1526 1527 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG); 1528 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG); 1529 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG); 1530 1531 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize()); 1532 1533 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); 1534 unsigned BaseAlign = Store->getAlignment(); 1535 unsigned Size = LoMemVT.getStoreSize(); 1536 unsigned HiAlign = MinAlign(BaseAlign, Size); 1537 1538 SDValue LoStore = 1539 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign, 1540 Store->getMemOperand()->getFlags()); 1541 SDValue HiStore = 1542 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), 1543 HiMemVT, HiAlign, Store->getMemOperand()->getFlags()); 1544 1545 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); 1546 } 1547 1548 // This is a shortcut for integer division because we have fast i32<->f32 1549 // conversions, and fast f32 reciprocal instructions. The fractional part of a 1550 // float is enough to accurately represent up to a 24-bit signed integer. 1551 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, 1552 bool Sign) const { 1553 SDLoc DL(Op); 1554 EVT VT = Op.getValueType(); 1555 SDValue LHS = Op.getOperand(0); 1556 SDValue RHS = Op.getOperand(1); 1557 MVT IntVT = MVT::i32; 1558 MVT FltVT = MVT::f32; 1559 1560 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS); 1561 if (LHSSignBits < 9) 1562 return SDValue(); 1563 1564 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS); 1565 if (RHSSignBits < 9) 1566 return SDValue(); 1567 1568 unsigned BitSize = VT.getSizeInBits(); 1569 unsigned SignBits = std::min(LHSSignBits, RHSSignBits); 1570 unsigned DivBits = BitSize - SignBits; 1571 if (Sign) 1572 ++DivBits; 1573 1574 ISD::NodeType ToFp = Sign ? 
ISD::SINT_TO_FP : ISD::UINT_TO_FP; 1575 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; 1576 1577 SDValue jq = DAG.getConstant(1, DL, IntVT); 1578 1579 if (Sign) { 1580 // char|short jq = ia ^ ib; 1581 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); 1582 1583 // jq = jq >> (bitsize - 2) 1584 jq = DAG.getNode(ISD::SRA, DL, VT, jq, 1585 DAG.getConstant(BitSize - 2, DL, VT)); 1586 1587 // jq = jq | 0x1 1588 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT)); 1589 } 1590 1591 // int ia = (int)LHS; 1592 SDValue ia = LHS; 1593 1594 // int ib, (int)RHS; 1595 SDValue ib = RHS; 1596 1597 // float fa = (float)ia; 1598 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); 1599 1600 // float fb = (float)ib; 1601 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); 1602 1603 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, 1604 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); 1605 1606 // fq = trunc(fq); 1607 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq); 1608 1609 // float fqneg = -fq; 1610 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); 1611 1612 // float fr = mad(fqneg, fb, fa); 1613 unsigned OpCode = Subtarget->hasFP32Denormals() ? 1614 (unsigned)AMDGPUISD::FMAD_FTZ : 1615 (unsigned)ISD::FMAD; 1616 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa); 1617 1618 // int iq = (int)fq; 1619 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); 1620 1621 // fr = fabs(fr); 1622 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr); 1623 1624 // fb = fabs(fb); 1625 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb); 1626 1627 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 1628 1629 // int cv = fr >= fb; 1630 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); 1631 1632 // jq = (cv ? jq : 0); 1633 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT)); 1634 1635 // dst = iq + jq; 1636 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); 1637 1638 // Rem needs compensation, it's easier to recompute it 1639 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); 1640 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); 1641 1642 // Truncate to number of bits this divide really is. 
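// For the unsigned case this is a mask of the low DivBits bits (e.g. a 24-bit
// divide keeps bits [23:0] of Div and Rem); the signed case sign-extends back
// from DivBits bits instead.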
1643 if (Sign) { 1644 SDValue InRegSize 1645 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits)); 1646 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize); 1647 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize); 1648 } else { 1649 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT); 1650 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask); 1651 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask); 1652 } 1653 1654 return DAG.getMergeValues({ Div, Rem }, DL); 1655 } 1656 1657 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, 1658 SelectionDAG &DAG, 1659 SmallVectorImpl<SDValue> &Results) const { 1660 SDLoc DL(Op); 1661 EVT VT = Op.getValueType(); 1662 1663 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64"); 1664 1665 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); 1666 1667 SDValue One = DAG.getConstant(1, DL, HalfVT); 1668 SDValue Zero = DAG.getConstant(0, DL, HalfVT); 1669 1670 //HiLo split 1671 SDValue LHS = Op.getOperand(0); 1672 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); 1673 SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One); 1674 1675 SDValue RHS = Op.getOperand(1); 1676 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); 1677 SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One); 1678 1679 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && 1680 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { 1681 1682 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), 1683 LHS_Lo, RHS_Lo); 1684 1685 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero}); 1686 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero}); 1687 1688 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV)); 1689 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM)); 1690 return; 1691 } 1692 1693 if (isTypeLegal(MVT::i64)) { 1694 // Compute denominator reciprocal. 1695 unsigned FMAD = Subtarget->hasFP32Denormals() ? 
1696 (unsigned)AMDGPUISD::FMAD_FTZ : 1697 (unsigned)ISD::FMAD; 1698 1699 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo); 1700 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi); 1701 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi, 1702 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32), 1703 Cvt_Lo); 1704 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1); 1705 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp, 1706 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32)); 1707 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1, 1708 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32)); 1709 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2); 1710 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc, 1711 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32), 1712 Mul1); 1713 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2); 1714 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc); 1715 SDValue Rcp64 = DAG.getBitcast(VT, 1716 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi})); 1717 1718 SDValue Zero64 = DAG.getConstant(0, DL, VT); 1719 SDValue One64 = DAG.getConstant(1, DL, VT); 1720 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1); 1721 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1); 1722 1723 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS); 1724 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64); 1725 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1); 1726 SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, 1727 Zero); 1728 SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, 1729 One); 1730 1731 SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo, 1732 Mulhi1_Lo, Zero1); 1733 SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi, 1734 Mulhi1_Hi, Add1_Lo.getValue(1)); 1735 SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi); 1736 SDValue Add1 = DAG.getBitcast(VT, 1737 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi})); 1738 1739 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1); 1740 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2); 1741 SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, 1742 Zero); 1743 SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, 1744 One); 1745 1746 SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo, 1747 Mulhi2_Lo, Zero1); 1748 SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc, 1749 Mulhi2_Hi, Add1_Lo.getValue(1)); 1750 SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC, 1751 Zero, Add2_Lo.getValue(1)); 1752 SDValue Add2 = DAG.getBitcast(VT, 1753 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi})); 1754 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2); 1755 1756 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3); 1757 1758 SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero); 1759 SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One); 1760 SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo, 1761 Mul3_Lo, Zero1); 1762 SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi, 1763 Mul3_Hi, Sub1_Lo.getValue(1)); 1764 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi); 1765 SDValue Sub1 = DAG.getBitcast(VT, 
1766 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi})); 1767 1768 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT); 1769 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero, 1770 ISD::SETUGE); 1771 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero, 1772 ISD::SETUGE); 1773 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ); 1774 1775 // TODO: Here and below portions of the code can be enclosed into if/endif. 1776 // Currently control flow is unconditional and we have 4 selects after 1777 // potential endif to substitute PHIs. 1778 1779 // if C3 != 0 ... 1780 SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo, 1781 RHS_Lo, Zero1); 1782 SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi, 1783 RHS_Hi, Sub1_Lo.getValue(1)); 1784 SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi, 1785 Zero, Sub2_Lo.getValue(1)); 1786 SDValue Sub2 = DAG.getBitcast(VT, 1787 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi})); 1788 1789 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64); 1790 1791 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero, 1792 ISD::SETUGE); 1793 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero, 1794 ISD::SETUGE); 1795 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ); 1796 1797 // if (C6 != 0) 1798 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64); 1799 1800 SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo, 1801 RHS_Lo, Zero1); 1802 SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi, 1803 RHS_Hi, Sub2_Lo.getValue(1)); 1804 SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi, 1805 Zero, Sub3_Lo.getValue(1)); 1806 SDValue Sub3 = DAG.getBitcast(VT, 1807 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi})); 1808 1809 // endif C6 1810 // endif C3 1811 1812 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE); 1813 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE); 1814 1815 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE); 1816 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE); 1817 1818 Results.push_back(Div); 1819 Results.push_back(Rem); 1820 1821 return; 1822 } 1823 1824 // r600 expandion. 
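// The high-half quotient and remainder are computed speculatively (they are
// only used when RHS fits in 32 bits), then the low half is produced with a
// bit-at-a-time restoring long division over the remaining 32 bits.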
1825 // Get Speculative values 1826 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); 1827 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); 1828 1829 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ); 1830 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero}); 1831 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM); 1832 1833 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ); 1834 SDValue DIV_Lo = Zero; 1835 1836 const unsigned halfBitWidth = HalfVT.getSizeInBits(); 1837 1838 for (unsigned i = 0; i < halfBitWidth; ++i) { 1839 const unsigned bitPos = halfBitWidth - i - 1; 1840 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT); 1841 // Get value of high bit 1842 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); 1843 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One); 1844 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); 1845 1846 // Shift 1847 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT)); 1848 // Add LHS high bit 1849 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); 1850 1851 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT); 1852 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE); 1853 1854 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); 1855 1856 // Update REM 1857 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); 1858 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); 1859 } 1860 1861 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi}); 1862 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV); 1863 Results.push_back(DIV); 1864 Results.push_back(REM); 1865 } 1866 1867 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, 1868 SelectionDAG &DAG) const { 1869 SDLoc DL(Op); 1870 EVT VT = Op.getValueType(); 1871 1872 if (VT == MVT::i64) { 1873 SmallVector<SDValue, 2> Results; 1874 LowerUDIVREM64(Op, DAG, Results); 1875 return DAG.getMergeValues(Results, DL); 1876 } 1877 1878 if (VT == MVT::i32) { 1879 if (SDValue Res = LowerDIVREM24(Op, DAG, false)) 1880 return Res; 1881 } 1882 1883 SDValue Num = Op.getOperand(0); 1884 SDValue Den = Op.getOperand(1); 1885 1886 // RCP = URECIP(Den) = 2^32 / Den + e 1887 // e is rounding error. 1888 SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); 1889 1890 // RCP_LO = mul(RCP, Den) */ 1891 SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den); 1892 1893 // RCP_HI = mulhu (RCP, Den) */ 1894 SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); 1895 1896 // NEG_RCP_LO = -RCP_LO 1897 SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), 1898 RCP_LO); 1899 1900 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 1901 SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), 1902 NEG_RCP_LO, RCP_LO, 1903 ISD::SETEQ); 1904 // Calculate the rounding error from the URECIP instruction 1905 // E = mulhu(ABS_RCP_LO, RCP) 1906 SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); 1907 1908 // RCP_A_E = RCP + E 1909 SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); 1910 1911 // RCP_S_E = RCP - E 1912 SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); 1913 1914 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_SUB_E) 1915 SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), 1916 RCP_A_E, RCP_S_E, 1917 ISD::SETEQ); 1918 // Quotient = mulhu(Tmp0, Num) 1919 SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); 1920 1921 // Num_S_Remainder = Quotient * Den 1922 SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den); 1923 1924 // Remainder = Num - Num_S_Remainder 1925 SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); 1926 1927 // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) 1928 SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, 1929 DAG.getConstant(-1, DL, VT), 1930 DAG.getConstant(0, DL, VT), 1931 ISD::SETUGE); 1932 // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) 1933 SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, 1934 Num_S_Remainder, 1935 DAG.getConstant(-1, DL, VT), 1936 DAG.getConstant(0, DL, VT), 1937 ISD::SETUGE); 1938 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 1939 SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, 1940 Remainder_GE_Zero); 1941 1942 // Calculate Division result: 1943 1944 // Quotient_A_One = Quotient + 1 1945 SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, 1946 DAG.getConstant(1, DL, VT)); 1947 1948 // Quotient_S_One = Quotient - 1 1949 SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, 1950 DAG.getConstant(1, DL, VT)); 1951 1952 // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) 1953 SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), 1954 Quotient, Quotient_A_One, ISD::SETEQ); 1955 1956 // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) 1957 Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), 1958 Quotient_S_One, Div, ISD::SETEQ); 1959 1960 // Calculate Rem result: 1961 1962 // Remainder_S_Den = Remainder - Den 1963 SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); 1964 1965 // Remainder_A_Den = Remainder + Den 1966 SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); 1967 1968 // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) 1969 SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), 1970 Remainder, Remainder_S_Den, ISD::SETEQ); 1971 1972 // Rem = (Remainder_GE_Zero == 0 ? 
Remainder_A_Den : Rem) 1973 Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), 1974 Remainder_A_Den, Rem, ISD::SETEQ); 1975 SDValue Ops[2] = { 1976 Div, 1977 Rem 1978 }; 1979 return DAG.getMergeValues(Ops, DL); 1980 } 1981 1982 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, 1983 SelectionDAG &DAG) const { 1984 SDLoc DL(Op); 1985 EVT VT = Op.getValueType(); 1986 1987 SDValue LHS = Op.getOperand(0); 1988 SDValue RHS = Op.getOperand(1); 1989 1990 SDValue Zero = DAG.getConstant(0, DL, VT); 1991 SDValue NegOne = DAG.getConstant(-1, DL, VT); 1992 1993 if (VT == MVT::i32) { 1994 if (SDValue Res = LowerDIVREM24(Op, DAG, true)) 1995 return Res; 1996 } 1997 1998 if (VT == MVT::i64 && 1999 DAG.ComputeNumSignBits(LHS) > 32 && 2000 DAG.ComputeNumSignBits(RHS) > 32) { 2001 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); 2002 2003 //HiLo split 2004 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); 2005 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); 2006 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), 2007 LHS_Lo, RHS_Lo); 2008 SDValue Res[2] = { 2009 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), 2010 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) 2011 }; 2012 return DAG.getMergeValues(Res, DL); 2013 } 2014 2015 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); 2016 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); 2017 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); 2018 SDValue RSign = LHSign; // Remainder sign is the same as LHS 2019 2020 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); 2021 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); 2022 2023 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); 2024 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); 2025 2026 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); 2027 SDValue Rem = Div.getValue(1); 2028 2029 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); 2030 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); 2031 2032 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); 2033 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); 2034 2035 SDValue Res[2] = { 2036 Div, 2037 Rem 2038 }; 2039 return DAG.getMergeValues(Res, DL); 2040 } 2041 2042 // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y)) 2043 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { 2044 SDLoc SL(Op); 2045 EVT VT = Op.getValueType(); 2046 SDValue X = Op.getOperand(0); 2047 SDValue Y = Op.getOperand(1); 2048 2049 // TODO: Should this propagate fast-math-flags? 
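// For example frem(5.5, -2.0): fdiv gives -2.75, ftrunc gives -2.0,
// fmul gives 4.0, and fsub gives 5.5 - 4.0 = 1.5, keeping the sign of x.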
2050 2051 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); 2052 SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); 2053 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); 2054 2055 return DAG.getNode(ISD::FSUB, SL, VT, X, Mul); 2056 } 2057 2058 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { 2059 SDLoc SL(Op); 2060 SDValue Src = Op.getOperand(0); 2061 2062 // result = trunc(src) 2063 // if (src > 0.0 && src != result) 2064 // result += 1.0 2065 2066 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 2067 2068 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); 2069 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); 2070 2071 EVT SetCCVT = 2072 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); 2073 2074 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT); 2075 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); 2076 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); 2077 2078 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); 2079 // TODO: Should this propagate fast-math-flags? 2080 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); 2081 } 2082 2083 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, 2084 SelectionDAG &DAG) { 2085 const unsigned FractBits = 52; 2086 const unsigned ExpBits = 11; 2087 2088 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, 2089 Hi, 2090 DAG.getConstant(FractBits - 32, SL, MVT::i32), 2091 DAG.getConstant(ExpBits, SL, MVT::i32)); 2092 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, 2093 DAG.getConstant(1023, SL, MVT::i32)); 2094 2095 return Exp; 2096 } 2097 2098 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { 2099 SDLoc SL(Op); 2100 SDValue Src = Op.getOperand(0); 2101 2102 assert(Op.getValueType() == MVT::f64); 2103 2104 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 2105 const SDValue One = DAG.getConstant(1, SL, MVT::i32); 2106 2107 SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); 2108 2109 // Extract the upper half, since this is where we will find the sign and 2110 // exponent. 2111 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One); 2112 2113 SDValue Exp = extractF64Exponent(Hi, SL, DAG); 2114 2115 const unsigned FractBits = 52; 2116 2117 // Extract the sign bit. 2118 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32); 2119 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); 2120 2121 // Extend back to 64-bits. 
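// The sign word becomes the high element of a v2i32 that is bitcast to i64,
// so it can be selected as the result when the exponent is negative (|x| < 1).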
2122 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit}); 2123 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); 2124 2125 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); 2126 const SDValue FractMask 2127 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64); 2128 2129 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); 2130 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); 2131 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); 2132 2133 EVT SetCCVT = 2134 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); 2135 2136 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32); 2137 2138 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); 2139 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); 2140 2141 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0); 2142 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1); 2143 2144 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); 2145 } 2146 2147 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { 2148 SDLoc SL(Op); 2149 SDValue Src = Op.getOperand(0); 2150 2151 assert(Op.getValueType() == MVT::f64); 2152 2153 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2154 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); 2155 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); 2156 2157 // TODO: Should this propagate fast-math-flags? 2158 2159 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); 2160 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); 2161 2162 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); 2163 2164 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2165 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); 2166 2167 EVT SetCCVT = 2168 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); 2169 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); 2170 2171 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); 2172 } 2173 2174 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const { 2175 // FNEARBYINT and FRINT are the same, except in their handling of FP 2176 // exceptions. Those aren't really meaningful for us, and OpenCL only has 2177 // rint, so just treat them as equivalent. 2178 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); 2179 } 2180 2181 // XXX - May require not supporting f32 denormals? 2182 2183 // Don't handle v2f16. The extra instructions to scalarize and repack around the 2184 // compare and vselect end up producing worse code than scalarizing the whole 2185 // operation. 2186 SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const { 2187 SDLoc SL(Op); 2188 SDValue X = Op.getOperand(0); 2189 EVT VT = Op.getValueType(); 2190 2191 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X); 2192 2193 // TODO: Should this propagate fast-math-flags? 
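// round(x) is expanded as trunc(x) + (|x - trunc(x)| >= 0.5 ? copysign(1.0, x)
// : 0.0), i.e. halfway cases round away from zero, matching FROUND semantics.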
2194 2195 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T); 2196 2197 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff); 2198 2199 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT); 2200 const SDValue One = DAG.getConstantFP(1.0, SL, VT); 2201 const SDValue Half = DAG.getConstantFP(0.5, SL, VT); 2202 2203 SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X); 2204 2205 EVT SetCCVT = 2206 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2207 2208 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); 2209 2210 SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero); 2211 2212 return DAG.getNode(ISD::FADD, SL, VT, T, Sel); 2213 } 2214 2215 SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { 2216 SDLoc SL(Op); 2217 SDValue X = Op.getOperand(0); 2218 2219 SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X); 2220 2221 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 2222 const SDValue One = DAG.getConstant(1, SL, MVT::i32); 2223 const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32); 2224 const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32); 2225 EVT SetCCVT = 2226 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); 2227 2228 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); 2229 2230 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One); 2231 2232 SDValue Exp = extractF64Exponent(Hi, SL, DAG); 2233 2234 const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL, 2235 MVT::i64); 2236 2237 SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp); 2238 SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64, 2239 DAG.getConstant(INT64_C(0x0008000000000000), SL, 2240 MVT::i64), 2241 Exp); 2242 2243 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M); 2244 SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT, 2245 DAG.getConstant(0, SL, MVT::i64), Tmp0, 2246 ISD::SETNE); 2247 2248 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1, 2249 D, DAG.getConstant(0, SL, MVT::i64)); 2250 SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2); 2251 2252 K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64)); 2253 K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K); 2254 2255 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); 2256 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); 2257 SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ); 2258 2259 SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64, 2260 ExpEqNegOne, 2261 DAG.getConstantFP(1.0, SL, MVT::f64), 2262 DAG.getConstantFP(0.0, SL, MVT::f64)); 2263 2264 SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X); 2265 2266 K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K); 2267 K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K); 2268 2269 return K; 2270 } 2271 2272 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { 2273 EVT VT = Op.getValueType(); 2274 2275 if (VT == MVT::f32 || VT == MVT::f16) 2276 return LowerFROUND32_16(Op, DAG); 2277 2278 if (VT == MVT::f64) 2279 return LowerFROUND64(Op, DAG); 2280 2281 llvm_unreachable("unhandled type"); 2282 } 2283 2284 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { 2285 SDLoc SL(Op); 2286 SDValue Src = Op.getOperand(0); 2287 2288 // result = trunc(src); 2289 // if (src < 0.0 && src != result) 2290 // result += -1.0. 
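// For example floor(-2.5): trunc gives -2.0, and since -2.5 < 0.0 and
// -2.5 != -2.0, the -1.0 adjustment applies, giving -3.0.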
2291 2292 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 2293 2294 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); 2295 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64); 2296 2297 EVT SetCCVT = 2298 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); 2299 2300 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); 2301 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); 2302 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); 2303 2304 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); 2305 // TODO: Should this propagate fast-math-flags? 2306 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); 2307 } 2308 2309 SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG, 2310 double Log2BaseInverted) const { 2311 EVT VT = Op.getValueType(); 2312 2313 SDLoc SL(Op); 2314 SDValue Operand = Op.getOperand(0); 2315 SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand); 2316 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT); 2317 2318 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand); 2319 } 2320 2321 // Return M_LOG2E of appropriate type 2322 static SDValue getLog2EVal(SelectionDAG &DAG, const SDLoc &SL, EVT VT) { 2323 switch (VT.getScalarType().getSimpleVT().SimpleTy) { 2324 case MVT::f32: 2325 return DAG.getConstantFP(1.44269504088896340735992468100189214f, SL, VT); 2326 case MVT::f16: 2327 return DAG.getConstantFP( 2328 APFloat(APFloat::IEEEhalf(), "1.44269504088896340735992468100189214"), 2329 SL, VT); 2330 case MVT::f64: 2331 return DAG.getConstantFP( 2332 APFloat(APFloat::IEEEdouble(), "0x1.71547652b82fep+0"), SL, VT); 2333 default: 2334 llvm_unreachable("unsupported fp type"); 2335 } 2336 } 2337 2338 // exp2(M_LOG2E_F * f); 2339 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { 2340 EVT VT = Op.getValueType(); 2341 SDLoc SL(Op); 2342 SDValue Src = Op.getOperand(0); 2343 2344 const SDValue K = getLog2EVal(DAG, SL, VT); 2345 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags()); 2346 return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags()); 2347 } 2348 2349 static bool isCtlzOpc(unsigned Opc) { 2350 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; 2351 } 2352 2353 static bool isCttzOpc(unsigned Opc) { 2354 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; 2355 } 2356 2357 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const { 2358 SDLoc SL(Op); 2359 SDValue Src = Op.getOperand(0); 2360 bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF || 2361 Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; 2362 2363 unsigned ISDOpc, NewOpc; 2364 if (isCtlzOpc(Op.getOpcode())) { 2365 ISDOpc = ISD::CTLZ_ZERO_UNDEF; 2366 NewOpc = AMDGPUISD::FFBH_U32; 2367 } else if (isCttzOpc(Op.getOpcode())) { 2368 ISDOpc = ISD::CTTZ_ZERO_UNDEF; 2369 NewOpc = AMDGPUISD::FFBL_B32; 2370 } else 2371 llvm_unreachable("Unexpected OPCode!!!"); 2372 2373 2374 if (ZeroUndef && Src.getValueType() == MVT::i32) 2375 return DAG.getNode(NewOpc, SL, MVT::i32, Src); 2376 2377 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); 2378 2379 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 2380 const SDValue One = DAG.getConstant(1, SL, MVT::i32); 2381 2382 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); 2383 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); 2384 2385 EVT SetCCVT = 
getSetCCResultType(DAG.getDataLayout(), 2386 *DAG.getContext(), MVT::i32); 2387 2388 SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo; 2389 SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ); 2390 2391 SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo); 2392 SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi); 2393 2394 const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32); 2395 SDValue Add, NewOpr; 2396 if (isCtlzOpc(Op.getOpcode())) { 2397 Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32); 2398 // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) 2399 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi); 2400 } else { 2401 Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32); 2402 // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x)) 2403 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo); 2404 } 2405 2406 if (!ZeroUndef) { 2407 // Test if the full 64-bit input is zero. 2408 2409 // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32, 2410 // which we probably don't want. 2411 SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi; 2412 SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ); 2413 SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0); 2414 2415 // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction 2416 // with the same cycles, otherwise it is slower. 2417 // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src, 2418 // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ); 2419 2420 const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32); 2421 2422 // The instruction returns -1 for 0 input, but the defined intrinsic 2423 // behavior is to return the number of bits. 2424 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, 2425 SrcIsZero, Bits32, NewOpr); 2426 } 2427 2428 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr); 2429 } 2430 2431 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, 2432 bool Signed) const { 2433 // Unsigned 2434 // cul2f(ulong u) 2435 //{ 2436 // uint lz = clz(u); 2437 // uint e = (u != 0) ? 127U + 63U - lz : 0; 2438 // u = (u << lz) & 0x7fffffffffffffffUL; 2439 // ulong t = u & 0xffffffffffUL; 2440 // uint v = (e << 23) | (uint)(u >> 40); 2441 // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U); 2442 // return as_float(v + r); 2443 //} 2444 // Signed 2445 // cl2f(long l) 2446 //{ 2447 // long s = l >> 63; 2448 // float r = cul2f((l + s) ^ s); 2449 // return s ? 
-r : r; 2450 //} 2451 2452 SDLoc SL(Op); 2453 SDValue Src = Op.getOperand(0); 2454 SDValue L = Src; 2455 2456 SDValue S; 2457 if (Signed) { 2458 const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64); 2459 S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit); 2460 2461 SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S); 2462 L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S); 2463 } 2464 2465 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), 2466 *DAG.getContext(), MVT::f32); 2467 2468 2469 SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32); 2470 SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64); 2471 SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L); 2472 LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ); 2473 2474 SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32); 2475 SDValue E = DAG.getSelect(SL, MVT::i32, 2476 DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE), 2477 DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ), 2478 ZeroI32); 2479 2480 SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64, 2481 DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ), 2482 DAG.getConstant((-1ULL) >> 1, SL, MVT::i64)); 2483 2484 SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U, 2485 DAG.getConstant(0xffffffffffULL, SL, MVT::i64)); 2486 2487 SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64, 2488 U, DAG.getConstant(40, SL, MVT::i64)); 2489 2490 SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32, 2491 DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)), 2492 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl)); 2493 2494 SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64); 2495 SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT); 2496 SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ); 2497 2498 SDValue One = DAG.getConstant(1, SL, MVT::i32); 2499 2500 SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One); 2501 2502 SDValue R = DAG.getSelect(SL, MVT::i32, 2503 RCmp, 2504 One, 2505 DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32)); 2506 R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R); 2507 R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R); 2508 2509 if (!Signed) 2510 return R; 2511 2512 SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R); 2513 return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R); 2514 } 2515 2516 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, 2517 bool Signed) const { 2518 SDLoc SL(Op); 2519 SDValue Src = Op.getOperand(0); 2520 2521 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); 2522 2523 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, 2524 DAG.getConstant(0, SL, MVT::i32)); 2525 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, 2526 DAG.getConstant(1, SL, MVT::i32)); 2527 2528 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, 2529 SL, MVT::f64, Hi); 2530 2531 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); 2532 2533 SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, 2534 DAG.getConstant(32, SL, MVT::i32)); 2535 // TODO: Should this propagate fast-math-flags? 2536 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); 2537 } 2538 2539 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, 2540 SelectionDAG &DAG) const { 2541 assert(Op.getOperand(0).getValueType() == MVT::i64 && 2542 "operation should be legal"); 2543 2544 // TODO: Factor out code common with LowerSINT_TO_FP. 
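// i64 -> f16 is handled as i64 -> f32 followed by an FP_ROUND; i64 -> f32 and
// i64 -> f64 use LowerINT_TO_FP32 and LowerINT_TO_FP64 respectively.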
2545 2546 EVT DestVT = Op.getValueType(); 2547 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { 2548 SDLoc DL(Op); 2549 SDValue Src = Op.getOperand(0); 2550 2551 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); 2552 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op)); 2553 SDValue FPRound = 2554 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); 2555 2556 return FPRound; 2557 } 2558 2559 if (DestVT == MVT::f32) 2560 return LowerINT_TO_FP32(Op, DAG, false); 2561 2562 assert(DestVT == MVT::f64); 2563 return LowerINT_TO_FP64(Op, DAG, false); 2564 } 2565 2566 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, 2567 SelectionDAG &DAG) const { 2568 assert(Op.getOperand(0).getValueType() == MVT::i64 && 2569 "operation should be legal"); 2570 2571 // TODO: Factor out code common with LowerUINT_TO_FP. 2572 2573 EVT DestVT = Op.getValueType(); 2574 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { 2575 SDLoc DL(Op); 2576 SDValue Src = Op.getOperand(0); 2577 2578 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); 2579 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op)); 2580 SDValue FPRound = 2581 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); 2582 2583 return FPRound; 2584 } 2585 2586 if (DestVT == MVT::f32) 2587 return LowerINT_TO_FP32(Op, DAG, true); 2588 2589 assert(DestVT == MVT::f64); 2590 return LowerINT_TO_FP64(Op, DAG, true); 2591 } 2592 2593 SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, 2594 bool Signed) const { 2595 SDLoc SL(Op); 2596 2597 SDValue Src = Op.getOperand(0); 2598 2599 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 2600 2601 SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL, 2602 MVT::f64); 2603 SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, 2604 MVT::f64); 2605 // TODO: Should this propagate fast-math-flags? 2606 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); 2607 2608 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); 2609 2610 2611 SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc); 2612 2613 SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL, 2614 MVT::i32, FloorMul); 2615 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); 2616 2617 SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}); 2618 2619 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); 2620 } 2621 2622 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const { 2623 SDLoc DL(Op); 2624 SDValue N0 = Op.getOperand(0); 2625 2626 // Convert to target node to get known bits 2627 if (N0.getValueType() == MVT::f32) 2628 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0); 2629 2630 if (getTargetMachine().Options.UnsafeFPMath) { 2631 // There is a generic expand for FP_TO_FP16 with unsafe fast math. 2632 return SDValue(); 2633 } 2634 2635 assert(N0.getSimpleValueType() == MVT::f64); 2636 2637 // f64 -> f16 conversion using round-to-nearest-even rounding mode. 
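// The conversion works on the integer bit pattern: the exponent is rebiased
// from 1023 to 15, the mantissa is shifted down with the discarded bits
// collected as a sticky bit, denormal and overflow/NaN cases are selected
// explicitly, the result is rounded to nearest even, and the sign bit is
// OR'd back in at the end.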
2638 const unsigned ExpMask = 0x7ff; 2639 const unsigned ExpBiasf64 = 1023; 2640 const unsigned ExpBiasf16 = 15; 2641 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 2642 SDValue One = DAG.getConstant(1, DL, MVT::i32); 2643 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0); 2644 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U, 2645 DAG.getConstant(32, DL, MVT::i64)); 2646 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32); 2647 U = DAG.getZExtOrTrunc(U, DL, MVT::i32); 2648 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 2649 DAG.getConstant(20, DL, MVT::i64)); 2650 E = DAG.getNode(ISD::AND, DL, MVT::i32, E, 2651 DAG.getConstant(ExpMask, DL, MVT::i32)); 2652 // Subtract the fp64 exponent bias (1023) to get the real exponent and 2653 // add the f16 bias (15) to get the biased exponent for the f16 format. 2654 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E, 2655 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32)); 2656 2657 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 2658 DAG.getConstant(8, DL, MVT::i32)); 2659 M = DAG.getNode(ISD::AND, DL, MVT::i32, M, 2660 DAG.getConstant(0xffe, DL, MVT::i32)); 2661 2662 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH, 2663 DAG.getConstant(0x1ff, DL, MVT::i32)); 2664 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U); 2665 2666 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ); 2667 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set); 2668 2669 // (M != 0 ? 0x0200 : 0) | 0x7c00; 2670 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32, 2671 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32), 2672 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32)); 2673 2674 // N = M | (E << 12); 2675 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M, 2676 DAG.getNode(ISD::SHL, DL, MVT::i32, E, 2677 DAG.getConstant(12, DL, MVT::i32))); 2678 2679 // B = clamp(1-E, 0, 13); 2680 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32, 2681 One, E); 2682 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero); 2683 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B, 2684 DAG.getConstant(13, DL, MVT::i32)); 2685 2686 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M, 2687 DAG.getConstant(0x1000, DL, MVT::i32)); 2688 2689 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B); 2690 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B); 2691 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE); 2692 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1); 2693 2694 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT); 2695 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V, 2696 DAG.getConstant(0x7, DL, MVT::i32)); 2697 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V, 2698 DAG.getConstant(2, DL, MVT::i32)); 2699 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32), 2700 One, Zero, ISD::SETEQ); 2701 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32), 2702 One, Zero, ISD::SETGT); 2703 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1); 2704 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1); 2705 2706 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32), 2707 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT); 2708 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32), 2709 I, V, ISD::SETEQ); 2710 2711 // Extract the sign bit. 
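// (bit 31 of the high dword UH, moved down to bit 15 of the f16 result)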
2712 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 2713 DAG.getConstant(16, DL, MVT::i32)); 2714 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign, 2715 DAG.getConstant(0x8000, DL, MVT::i32)); 2716 2717 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V); 2718 return DAG.getZExtOrTrunc(V, DL, Op.getValueType()); 2719 } 2720 2721 SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, 2722 SelectionDAG &DAG) const { 2723 SDValue Src = Op.getOperand(0); 2724 2725 // TODO: Factor out code common with LowerFP_TO_UINT. 2726 2727 EVT SrcVT = Src.getValueType(); 2728 if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) { 2729 SDLoc DL(Op); 2730 2731 SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); 2732 SDValue FpToInt32 = 2733 DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend); 2734 2735 return FpToInt32; 2736 } 2737 2738 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) 2739 return LowerFP64_TO_INT(Op, DAG, true); 2740 2741 return SDValue(); 2742 } 2743 2744 SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, 2745 SelectionDAG &DAG) const { 2746 SDValue Src = Op.getOperand(0); 2747 2748 // TODO: Factor out code common with LowerFP_TO_SINT. 2749 2750 EVT SrcVT = Src.getValueType(); 2751 if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) { 2752 SDLoc DL(Op); 2753 2754 SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); 2755 SDValue FpToInt32 = 2756 DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend); 2757 2758 return FpToInt32; 2759 } 2760 2761 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) 2762 return LowerFP64_TO_INT(Op, DAG, false); 2763 2764 return SDValue(); 2765 } 2766 2767 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 2768 SelectionDAG &DAG) const { 2769 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 2770 MVT VT = Op.getSimpleValueType(); 2771 MVT ScalarVT = VT.getScalarType(); 2772 2773 assert(VT.isVector()); 2774 2775 SDValue Src = Op.getOperand(0); 2776 SDLoc DL(Op); 2777 2778 // TODO: Don't scalarize on Evergreen? 2779 unsigned NElts = VT.getVectorNumElements(); 2780 SmallVector<SDValue, 8> Args; 2781 DAG.ExtractVectorElements(Src, Args, 0, NElts); 2782 2783 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); 2784 for (unsigned I = 0; I < NElts; ++I) 2785 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); 2786 2787 return DAG.getBuildVector(VT, DL, Args); 2788 } 2789 2790 //===----------------------------------------------------------------------===// 2791 // Custom DAG optimizations 2792 //===----------------------------------------------------------------------===// 2793 2794 static bool isU24(SDValue Op, SelectionDAG &DAG) { 2795 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24; 2796 } 2797 2798 static bool isI24(SDValue Op, SelectionDAG &DAG) { 2799 EVT VT = Op.getValueType(); 2800 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated 2801 // as unsigned 24-bit values. 
2802 AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24; 2803 } 2804 2805 static SDValue simplifyI24(SDNode *Node24, 2806 TargetLowering::DAGCombinerInfo &DCI) { 2807 SelectionDAG &DAG = DCI.DAG; 2808 SDValue LHS = Node24->getOperand(0); 2809 SDValue RHS = Node24->getOperand(1); 2810 2811 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24); 2812 2813 // First try to simplify using GetDemandedBits which allows the operands to 2814 // have other uses, but will only perform simplifications that involve 2815 // bypassing some nodes for this user. 2816 SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded); 2817 SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded); 2818 if (DemandedLHS || DemandedRHS) 2819 return DAG.getNode(Node24->getOpcode(), SDLoc(Node24), Node24->getVTList(), 2820 DemandedLHS ? DemandedLHS : LHS, 2821 DemandedRHS ? DemandedRHS : RHS); 2822 2823 // Now try SimplifyDemandedBits which can simplify the nodes used by our 2824 // operands if this node is the only user. 2825 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 2826 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI)) 2827 return SDValue(Node24, 0); 2828 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI)) 2829 return SDValue(Node24, 0); 2830 2831 return SDValue(); 2832 } 2833 2834 template <typename IntTy> 2835 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, 2836 uint32_t Width, const SDLoc &DL) { 2837 if (Width + Offset < 32) { 2838 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); 2839 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); 2840 return DAG.getConstant(Result, DL, MVT::i32); 2841 } 2842 2843 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); 2844 } 2845 2846 static bool hasVolatileUser(SDNode *Val) { 2847 for (SDNode *U : Val->uses()) { 2848 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) { 2849 if (M->isVolatile()) 2850 return true; 2851 } 2852 } 2853 2854 return false; 2855 } 2856 2857 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const { 2858 // i32 vectors are the canonical memory type. 2859 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT)) 2860 return false; 2861 2862 if (!VT.isByteSized()) 2863 return false; 2864 2865 unsigned Size = VT.getStoreSize(); 2866 2867 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector()) 2868 return false; 2869 2870 if (Size == 3 || (Size > 4 && (Size % 4 != 0))) 2871 return false; 2872 2873 return true; 2874 } 2875 2876 // Replace load of an illegal type with a store of a bitcast to a friendlier 2877 // type. 2878 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, 2879 DAGCombinerInfo &DCI) const { 2880 if (!DCI.isBeforeLegalize()) 2881 return SDValue(); 2882 2883 LoadSDNode *LN = cast<LoadSDNode>(N); 2884 if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN)) 2885 return SDValue(); 2886 2887 SDLoc SL(N); 2888 SelectionDAG &DAG = DCI.DAG; 2889 EVT VT = LN->getMemoryVT(); 2890 2891 unsigned Size = VT.getStoreSize(); 2892 unsigned Align = LN->getAlignment(); 2893 if (Align < Size && isTypeLegal(VT)) { 2894 bool IsFast; 2895 unsigned AS = LN->getAddressSpace(); 2896 2897 // Expand unaligned loads earlier than legalization. Due to visitation order 2898 // problems during legalization, the emitted instructions to pack and unpack 2899 // the bytes again are not eliminated in the case of an unaligned copy. 
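// Vector loads are scalarized here; scalar loads go through the generic
// unaligned-load expansion instead.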
2900 if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) { 2901 if (VT.isVector()) 2902 return scalarizeVectorLoad(LN, DAG); 2903 2904 SDValue Ops[2]; 2905 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); 2906 return DAG.getMergeValues(Ops, SDLoc(N)); 2907 } 2908 2909 if (!IsFast) 2910 return SDValue(); 2911 } 2912 2913 if (!shouldCombineMemoryType(VT)) 2914 return SDValue(); 2915 2916 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); 2917 2918 SDValue NewLoad 2919 = DAG.getLoad(NewVT, SL, LN->getChain(), 2920 LN->getBasePtr(), LN->getMemOperand()); 2921 2922 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad); 2923 DCI.CombineTo(N, BC, NewLoad.getValue(1)); 2924 return SDValue(N, 0); 2925 } 2926 2927 // Replace store of an illegal type with a store of a bitcast to a friendlier 2928 // type. 2929 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, 2930 DAGCombinerInfo &DCI) const { 2931 if (!DCI.isBeforeLegalize()) 2932 return SDValue(); 2933 2934 StoreSDNode *SN = cast<StoreSDNode>(N); 2935 if (SN->isVolatile() || !ISD::isNormalStore(SN)) 2936 return SDValue(); 2937 2938 EVT VT = SN->getMemoryVT(); 2939 unsigned Size = VT.getStoreSize(); 2940 2941 SDLoc SL(N); 2942 SelectionDAG &DAG = DCI.DAG; 2943 unsigned Align = SN->getAlignment(); 2944 if (Align < Size && isTypeLegal(VT)) { 2945 bool IsFast; 2946 unsigned AS = SN->getAddressSpace(); 2947 2948 // Expand unaligned stores earlier than legalization. Due to visitation 2949 // order problems during legalization, the emitted instructions to pack and 2950 // unpack the bytes again are not eliminated in the case of an unaligned 2951 // copy. 2952 if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) { 2953 if (VT.isVector()) 2954 return scalarizeVectorStore(SN, DAG); 2955 2956 return expandUnalignedStore(SN, DAG); 2957 } 2958 2959 if (!IsFast) 2960 return SDValue(); 2961 } 2962 2963 if (!shouldCombineMemoryType(VT)) 2964 return SDValue(); 2965 2966 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); 2967 SDValue Val = SN->getValue(); 2968 2969 //DCI.AddToWorklist(Val.getNode()); 2970 2971 bool OtherUses = !Val.hasOneUse(); 2972 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val); 2973 if (OtherUses) { 2974 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal); 2975 DAG.ReplaceAllUsesOfValueWith(Val, CastBack); 2976 } 2977 2978 return DAG.getStore(SN->getChain(), SL, CastVal, 2979 SN->getBasePtr(), SN->getMemOperand()); 2980 } 2981 2982 // FIXME: This should go in generic DAG combiner with an isTruncateFree check, 2983 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU 2984 // issues. 
2985 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, 2986 DAGCombinerInfo &DCI) const { 2987 SelectionDAG &DAG = DCI.DAG; 2988 SDValue N0 = N->getOperand(0); 2989 2990 // (vt2 (assertzext (truncate vt0:x), vt1)) -> 2991 // (vt2 (truncate (assertzext vt0:x, vt1))) 2992 if (N0.getOpcode() == ISD::TRUNCATE) { 2993 SDValue N1 = N->getOperand(1); 2994 EVT ExtVT = cast<VTSDNode>(N1)->getVT(); 2995 SDLoc SL(N); 2996 2997 SDValue Src = N0.getOperand(0); 2998 EVT SrcVT = Src.getValueType(); 2999 if (SrcVT.bitsGE(ExtVT)) { 3000 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1); 3001 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg); 3002 } 3003 } 3004 3005 return SDValue(); 3006 } 3007 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the 3008 /// binary operation \p Opc to it with the corresponding constant operands. 3009 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( 3010 DAGCombinerInfo &DCI, const SDLoc &SL, 3011 unsigned Opc, SDValue LHS, 3012 uint32_t ValLo, uint32_t ValHi) const { 3013 SelectionDAG &DAG = DCI.DAG; 3014 SDValue Lo, Hi; 3015 std::tie(Lo, Hi) = split64BitValue(LHS, DAG); 3016 3017 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32); 3018 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32); 3019 3020 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS); 3021 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS); 3022 3023 // Re-visit the ands. It's possible we eliminated one of them and it could 3024 // simplify the vector. 3025 DCI.AddToWorklist(Lo.getNode()); 3026 DCI.AddToWorklist(Hi.getNode()); 3027 3028 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd}); 3029 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 3030 } 3031 3032 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, 3033 DAGCombinerInfo &DCI) const { 3034 EVT VT = N->getValueType(0); 3035 3036 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 3037 if (!RHS) 3038 return SDValue(); 3039 3040 SDValue LHS = N->getOperand(0); 3041 unsigned RHSVal = RHS->getZExtValue(); 3042 if (!RHSVal) 3043 return LHS; 3044 3045 SDLoc SL(N); 3046 SelectionDAG &DAG = DCI.DAG; 3047 3048 switch (LHS->getOpcode()) { 3049 default: 3050 break; 3051 case ISD::ZERO_EXTEND: 3052 case ISD::SIGN_EXTEND: 3053 case ISD::ANY_EXTEND: { 3054 SDValue X = LHS->getOperand(0); 3055 3056 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 && 3057 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) { 3058 // Prefer build_vector as the canonical form if packed types are legal. 3059 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x 3060 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL, 3061 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) }); 3062 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); 3063 } 3064 3065 // shl (ext x) => zext (shl x), if shift does not overflow int 3066 if (VT != MVT::i64) 3067 break; 3068 KnownBits Known = DAG.computeKnownBits(X); 3069 unsigned LZ = Known.countMinLeadingZeros(); 3070 if (LZ < RHSVal) 3071 break; 3072 EVT XVT = X.getValueType(); 3073 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0)); 3074 return DAG.getZExtOrTrunc(Shl, SL, VT); 3075 } 3076 } 3077 3078 if (VT != MVT::i64) 3079 return SDValue(); 3080 3081 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32)) 3082 3083 // On some subtargets, 64-bit shift is a quarter rate instruction. 
In the 3084 // common case, splitting this into a move and a 32-bit shift is faster and 3085 // the same code size. 3086 if (RHSVal < 32) 3087 return SDValue(); 3088 3089 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32); 3090 3091 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); 3092 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt); 3093 3094 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 3095 3096 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift}); 3097 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 3098 } 3099 3100 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, 3101 DAGCombinerInfo &DCI) const { 3102 if (N->getValueType(0) != MVT::i64) 3103 return SDValue(); 3104 3105 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 3106 if (!RHS) 3107 return SDValue(); 3108 3109 SelectionDAG &DAG = DCI.DAG; 3110 SDLoc SL(N); 3111 unsigned RHSVal = RHS->getZExtValue(); 3112 3113 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31) 3114 if (RHSVal == 32) { 3115 SDValue Hi = getHiHalf64(N->getOperand(0), DAG); 3116 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi, 3117 DAG.getConstant(31, SL, MVT::i32)); 3118 3119 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift}); 3120 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec); 3121 } 3122 3123 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31) 3124 if (RHSVal == 63) { 3125 SDValue Hi = getHiHalf64(N->getOperand(0), DAG); 3126 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi, 3127 DAG.getConstant(31, SL, MVT::i32)); 3128 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift}); 3129 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec); 3130 } 3131 3132 return SDValue(); 3133 } 3134 3135 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, 3136 DAGCombinerInfo &DCI) const { 3137 if (N->getValueType(0) != MVT::i64) 3138 return SDValue(); 3139 3140 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 3141 if (!RHS) 3142 return SDValue(); 3143 3144 unsigned ShiftAmt = RHS->getZExtValue(); 3145 if (ShiftAmt < 32) 3146 return SDValue(); 3147 3148 // srl i64:x, C for C >= 32 3149 // => 3150 // build_pair (srl hi_32(x), C - 32), 0 3151 3152 SelectionDAG &DAG = DCI.DAG; 3153 SDLoc SL(N); 3154 3155 SDValue One = DAG.getConstant(1, SL, MVT::i32); 3156 SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 3157 3158 SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0)); 3159 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, 3160 VecOp, One); 3161 3162 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32); 3163 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst); 3164 3165 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero}); 3166 3167 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair); 3168 } 3169 3170 SDValue AMDGPUTargetLowering::performTruncateCombine( 3171 SDNode *N, DAGCombinerInfo &DCI) const { 3172 SDLoc SL(N); 3173 SelectionDAG &DAG = DCI.DAG; 3174 EVT VT = N->getValueType(0); 3175 SDValue Src = N->getOperand(0); 3176 3177 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x) 3178 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) { 3179 SDValue Vec = Src.getOperand(0); 3180 if (Vec.getOpcode() == ISD::BUILD_VECTOR) { 3181 SDValue Elt0 = Vec.getOperand(0); 3182 EVT EltVT = Elt0.getValueType(); 3183 if (VT.getSizeInBits() <= 
EltVT.getSizeInBits()) { 3184 if (EltVT.isFloatingPoint()) { 3185 Elt0 = DAG.getNode(ISD::BITCAST, SL, 3186 EltVT.changeTypeToInteger(), Elt0); 3187 } 3188 3189 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0); 3190 } 3191 } 3192 } 3193 3194 // Equivalent of above for accessing the high element of a vector as an 3195 // integer operation. 3196 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y) 3197 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) { 3198 if (auto K = isConstOrConstSplat(Src.getOperand(1))) { 3199 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) { 3200 SDValue BV = stripBitcast(Src.getOperand(0)); 3201 if (BV.getOpcode() == ISD::BUILD_VECTOR && 3202 BV.getValueType().getVectorNumElements() == 2) { 3203 SDValue SrcElt = BV.getOperand(1); 3204 EVT SrcEltVT = SrcElt.getValueType(); 3205 if (SrcEltVT.isFloatingPoint()) { 3206 SrcElt = DAG.getNode(ISD::BITCAST, SL, 3207 SrcEltVT.changeTypeToInteger(), SrcElt); 3208 } 3209 3210 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt); 3211 } 3212 } 3213 } 3214 } 3215 3216 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit. 3217 // 3218 // i16 (trunc (srl i64:x, K)), K <= 16 -> 3219 // i16 (trunc (srl (i32 (trunc x), K))) 3220 if (VT.getScalarSizeInBits() < 32) { 3221 EVT SrcVT = Src.getValueType(); 3222 if (SrcVT.getScalarSizeInBits() > 32 && 3223 (Src.getOpcode() == ISD::SRL || 3224 Src.getOpcode() == ISD::SRA || 3225 Src.getOpcode() == ISD::SHL)) { 3226 SDValue Amt = Src.getOperand(1); 3227 KnownBits Known = DAG.computeKnownBits(Amt); 3228 unsigned Size = VT.getScalarSizeInBits(); 3229 if ((Known.isConstant() && Known.getConstant().ule(Size)) || 3230 (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) { 3231 EVT MidVT = VT.isVector() ? 3232 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 3233 VT.getVectorNumElements()) : MVT::i32; 3234 3235 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout()); 3236 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT, 3237 Src.getOperand(0)); 3238 DCI.AddToWorklist(Trunc.getNode()); 3239 3240 if (Amt.getValueType() != NewShiftVT) { 3241 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT); 3242 DCI.AddToWorklist(Amt.getNode()); 3243 } 3244 3245 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT, 3246 Trunc, Amt); 3247 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift); 3248 } 3249 } 3250 } 3251 3252 return SDValue(); 3253 } 3254 3255 // We need to specifically handle i64 mul here to avoid unnecessary conversion 3256 // instructions. If we only match on the legalized i64 mul expansion, 3257 // SimplifyDemandedBits will be unable to remove them because there will be 3258 // multiple uses due to the separate mul + mulh[su]. 3259 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, 3260 SDValue N0, SDValue N1, unsigned Size, bool Signed) { 3261 if (Size <= 32) { 3262 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; 3263 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1); 3264 } 3265 3266 // Because we want to eliminate extension instructions before the 3267 // operation, we need to create a single user here (i.e. not the separate 3268 // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it. 3269 3270 unsigned MulOpc = Signed ? 
AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24; 3271 3272 SDValue Mul = DAG.getNode(MulOpc, SL, 3273 DAG.getVTList(MVT::i32, MVT::i32), N0, N1); 3274 3275 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, 3276 Mul.getValue(0), Mul.getValue(1)); 3277 } 3278 3279 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, 3280 DAGCombinerInfo &DCI) const { 3281 EVT VT = N->getValueType(0); 3282 3283 unsigned Size = VT.getSizeInBits(); 3284 if (VT.isVector() || Size > 64) 3285 return SDValue(); 3286 3287 // There are i16 integer mul/mad. 3288 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16)) 3289 return SDValue(); 3290 3291 SelectionDAG &DAG = DCI.DAG; 3292 SDLoc DL(N); 3293 3294 SDValue N0 = N->getOperand(0); 3295 SDValue N1 = N->getOperand(1); 3296 3297 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends 3298 // in the source into any_extends if the result of the mul is truncated. Since 3299 // we can assume the high bits are whatever we want, use the underlying value 3300 // to avoid the unknown high bits from interfering. 3301 if (N0.getOpcode() == ISD::ANY_EXTEND) 3302 N0 = N0.getOperand(0); 3303 3304 if (N1.getOpcode() == ISD::ANY_EXTEND) 3305 N1 = N1.getOperand(0); 3306 3307 SDValue Mul; 3308 3309 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { 3310 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 3311 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 3312 Mul = getMul24(DAG, DL, N0, N1, Size, false); 3313 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { 3314 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 3315 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 3316 Mul = getMul24(DAG, DL, N0, N1, Size, true); 3317 } else { 3318 return SDValue(); 3319 } 3320 3321 // We need to use sext even for MUL_U24, because MUL_U24 is used 3322 // for signed multiply of 8 and 16-bit types. 
3323 return DAG.getSExtOrTrunc(Mul, DL, VT); 3324 } 3325 3326 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N, 3327 DAGCombinerInfo &DCI) const { 3328 EVT VT = N->getValueType(0); 3329 3330 if (!Subtarget->hasMulI24() || VT.isVector()) 3331 return SDValue(); 3332 3333 SelectionDAG &DAG = DCI.DAG; 3334 SDLoc DL(N); 3335 3336 SDValue N0 = N->getOperand(0); 3337 SDValue N1 = N->getOperand(1); 3338 3339 if (!isI24(N0, DAG) || !isI24(N1, DAG)) 3340 return SDValue(); 3341 3342 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 3343 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 3344 3345 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1); 3346 DCI.AddToWorklist(Mulhi.getNode()); 3347 return DAG.getSExtOrTrunc(Mulhi, DL, VT); 3348 } 3349 3350 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, 3351 DAGCombinerInfo &DCI) const { 3352 EVT VT = N->getValueType(0); 3353 3354 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32) 3355 return SDValue(); 3356 3357 SelectionDAG &DAG = DCI.DAG; 3358 SDLoc DL(N); 3359 3360 SDValue N0 = N->getOperand(0); 3361 SDValue N1 = N->getOperand(1); 3362 3363 if (!isU24(N0, DAG) || !isU24(N1, DAG)) 3364 return SDValue(); 3365 3366 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 3367 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 3368 3369 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1); 3370 DCI.AddToWorklist(Mulhi.getNode()); 3371 return DAG.getZExtOrTrunc(Mulhi, DL, VT); 3372 } 3373 3374 SDValue AMDGPUTargetLowering::performMulLoHi24Combine( 3375 SDNode *N, DAGCombinerInfo &DCI) const { 3376 SelectionDAG &DAG = DCI.DAG; 3377 3378 // Simplify demanded bits before splitting into multiple users. 3379 if (SDValue V = simplifyI24(N, DCI)) 3380 return V; 3381 3382 SDValue N0 = N->getOperand(0); 3383 SDValue N1 = N->getOperand(1); 3384 3385 bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24); 3386 3387 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; 3388 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24; 3389 3390 SDLoc SL(N); 3391 3392 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1); 3393 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1); 3394 return DAG.getMergeValues({ MulLo, MulHi }, SL); 3395 } 3396 3397 static bool isNegativeOne(SDValue Val) { 3398 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) 3399 return C->isAllOnesValue(); 3400 return false; 3401 } 3402 3403 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG, 3404 SDValue Op, 3405 const SDLoc &DL, 3406 unsigned Opc) const { 3407 EVT VT = Op.getValueType(); 3408 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT); 3409 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() && 3410 LegalVT != MVT::i16)) 3411 return SDValue(); 3412 3413 if (VT != MVT::i32) 3414 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); 3415 3416 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op); 3417 if (VT != MVT::i32) 3418 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX); 3419 3420 return FFBX; 3421 } 3422 3423 // The native instructions return -1 on 0 input. Optimize out a select that 3424 // produces -1 on 0. 3425 // 3426 // TODO: If zero is not undef, we could also do this if the output is compared 3427 // against the bitwidth. 3428 // 3429 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly. 
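// Cond is the setcc-against-zero condition, and LHS/RHS are the true/false
// operands of the select being combined.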
3430 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3431 SDValue LHS, SDValue RHS,
3432 DAGCombinerInfo &DCI) const {
3433 ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3434 if (!CmpRhs || !CmpRhs->isNullValue())
3435 return SDValue();
3436
3437 SelectionDAG &DAG = DCI.DAG;
3438 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3439 SDValue CmpLHS = Cond.getOperand(0);
3440
3441 unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
3442 AMDGPUISD::FFBH_U32;
3443
3444 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3445 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3446 if (CCOpcode == ISD::SETEQ &&
3447 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3448 RHS.getOperand(0) == CmpLHS &&
3449 isNegativeOne(LHS)) {
3450 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3451 }
3452
3453 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3454 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3455 if (CCOpcode == ISD::SETNE &&
3456 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3457 LHS.getOperand(0) == CmpLHS &&
3458 isNegativeOne(RHS)) {
3459 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3460 }
3461
3462 return SDValue();
3463 }
3464
3465 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3466 unsigned Op,
3467 const SDLoc &SL,
3468 SDValue Cond,
3469 SDValue N1,
3470 SDValue N2) {
3471 SelectionDAG &DAG = DCI.DAG;
3472 EVT VT = N1.getValueType();
3473
3474 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3475 N1.getOperand(0), N2.getOperand(0));
3476 DCI.AddToWorklist(NewSelect.getNode());
3477 return DAG.getNode(Op, SL, VT, NewSelect);
3478 }
3479
3480 // Pull a free FP operation out of a select so it may fold into uses.
3481 //
3482 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3483 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3484 //
3485 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3486 // select c, (fabs x), +k -> fabs (select c, x, k)
3487 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3488 SDValue N) {
3489 SelectionDAG &DAG = DCI.DAG;
3490 SDValue Cond = N.getOperand(0);
3491 SDValue LHS = N.getOperand(1);
3492 SDValue RHS = N.getOperand(2);
3493
3494 EVT VT = N.getValueType();
3495 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3496 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3497 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3498 SDLoc(N), Cond, LHS, RHS);
3499 }
3500
3501 bool Inv = false;
3502 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3503 std::swap(LHS, RHS);
3504 Inv = true;
3505 }
3506
3507 // TODO: Support vector constants.
3508 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3509 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3510 SDLoc SL(N);
3511 // If one side is an fneg/fabs and the other is a constant, we can push the
3512 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3513 SDValue NewLHS = LHS.getOperand(0);
3514 SDValue NewRHS = RHS;
3515
3516 // Careful: if the neg can be folded up, don't try to pull it back down.
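// "Folded up" here means absorbed by the node that produces the operand: an
// fneg can fold into any opcode for which fnegFoldsIntoOp() is true, and an
// fabs can fold into an fmul since |a * b| == |a| * |b|.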
3517 bool ShouldFoldNeg = true;
3518
3519 if (NewLHS.hasOneUse()) {
3520 unsigned Opc = NewLHS.getOpcode();
3521 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3522 ShouldFoldNeg = false;
3523 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3524 ShouldFoldNeg = false;
3525 }
3526
3527 if (ShouldFoldNeg) {
3528 if (LHS.getOpcode() == ISD::FNEG)
3529 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3530 else if (CRHS->isNegative())
3531 return SDValue();
3532
3533 if (Inv)
3534 std::swap(NewLHS, NewRHS);
3535
3536 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3537 Cond, NewLHS, NewRHS);
3538 DCI.AddToWorklist(NewSelect.getNode());
3539 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3540 }
3541 }
3542
3543 return SDValue();
3544 }
3545
3546
3547 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3548 DAGCombinerInfo &DCI) const {
3549 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3550 return Folded;
3551
3552 SDValue Cond = N->getOperand(0);
3553 if (Cond.getOpcode() != ISD::SETCC)
3554 return SDValue();
3555
3556 EVT VT = N->getValueType(0);
3557 SDValue LHS = Cond.getOperand(0);
3558 SDValue RHS = Cond.getOperand(1);
3559 SDValue CC = Cond.getOperand(2);
3560
3561 SDValue True = N->getOperand(1);
3562 SDValue False = N->getOperand(2);
3563
3564 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3565 SelectionDAG &DAG = DCI.DAG;
3566 if (DAG.isConstantValueOfAnyType(True) &&
3567 !DAG.isConstantValueOfAnyType(False)) {
3570 // Swap cmp + select pair to move constant to false input.
3571 // This will allow using VOPC cndmasks more often.
3572 // select (setcc x, y), k, x -> select (setcc y, x), x, k
3573
3574 SDLoc SL(N);
3575 ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
3576 LHS.getValueType().isInteger());
3577
3578 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3579 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3580 }
3581
3582 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3583 SDValue MinMax
3584 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3585 // Revisit this node so we can catch min3/max3/med3 patterns.
3586 //DCI.AddToWorklist(MinMax.getNode());
3587 return MinMax;
3588 }
3589 }
3590
3591 // There's no reason not to do this if the condition has other uses.
3592 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3593 }
3594
3595 static bool isInv2Pi(const APFloat &APF) {
3596 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3597 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3598 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3599
3600 return APF.bitwiseIsEqual(KF16) ||
3601 APF.bitwiseIsEqual(KF32) ||
3602 APF.bitwiseIsEqual(KF64);
3603 }
3604
3605 // The negated forms of +0.0 and 1.0 / (2.0 * pi) do not have inline
3606 // immediates, so there is an additional cost to negate them.
3607 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const { 3608 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) { 3609 if (C->isZero() && !C->isNegative()) 3610 return true; 3611 3612 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF())) 3613 return true; 3614 } 3615 3616 return false; 3617 } 3618 3619 static unsigned inverseMinMax(unsigned Opc) { 3620 switch (Opc) { 3621 case ISD::FMAXNUM: 3622 return ISD::FMINNUM; 3623 case ISD::FMINNUM: 3624 return ISD::FMAXNUM; 3625 case ISD::FMAXNUM_IEEE: 3626 return ISD::FMINNUM_IEEE; 3627 case ISD::FMINNUM_IEEE: 3628 return ISD::FMAXNUM_IEEE; 3629 case AMDGPUISD::FMAX_LEGACY: 3630 return AMDGPUISD::FMIN_LEGACY; 3631 case AMDGPUISD::FMIN_LEGACY: 3632 return AMDGPUISD::FMAX_LEGACY; 3633 default: 3634 llvm_unreachable("invalid min/max opcode"); 3635 } 3636 } 3637 3638 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, 3639 DAGCombinerInfo &DCI) const { 3640 SelectionDAG &DAG = DCI.DAG; 3641 SDValue N0 = N->getOperand(0); 3642 EVT VT = N->getValueType(0); 3643 3644 unsigned Opc = N0.getOpcode(); 3645 3646 // If the input has multiple uses and we can either fold the negate down, or 3647 // the other uses cannot, give up. This both prevents unprofitable 3648 // transformations and infinite loops: we won't repeatedly try to fold around 3649 // a negate that has no 'good' form. 3650 if (N0.hasOneUse()) { 3651 // This may be able to fold into the source, but at a code size cost. Don't 3652 // fold if the fold into the user is free. 3653 if (allUsesHaveSourceMods(N, 0)) 3654 return SDValue(); 3655 } else { 3656 if (fnegFoldsIntoOp(Opc) && 3657 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode()))) 3658 return SDValue(); 3659 } 3660 3661 SDLoc SL(N); 3662 switch (Opc) { 3663 case ISD::FADD: { 3664 if (!mayIgnoreSignedZero(N0)) 3665 return SDValue(); 3666 3667 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) 3668 SDValue LHS = N0.getOperand(0); 3669 SDValue RHS = N0.getOperand(1); 3670 3671 if (LHS.getOpcode() != ISD::FNEG) 3672 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); 3673 else 3674 LHS = LHS.getOperand(0); 3675 3676 if (RHS.getOpcode() != ISD::FNEG) 3677 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 3678 else 3679 RHS = RHS.getOperand(0); 3680 3681 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags()); 3682 if (!N0.hasOneUse()) 3683 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 3684 return Res; 3685 } 3686 case ISD::FMUL: 3687 case AMDGPUISD::FMUL_LEGACY: { 3688 // (fneg (fmul x, y)) -> (fmul x, (fneg y)) 3689 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y)) 3690 SDValue LHS = N0.getOperand(0); 3691 SDValue RHS = N0.getOperand(1); 3692 3693 if (LHS.getOpcode() == ISD::FNEG) 3694 LHS = LHS.getOperand(0); 3695 else if (RHS.getOpcode() == ISD::FNEG) 3696 RHS = RHS.getOperand(0); 3697 else 3698 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 3699 3700 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags()); 3701 if (!N0.hasOneUse()) 3702 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 3703 return Res; 3704 } 3705 case ISD::FMA: 3706 case ISD::FMAD: { 3707 if (!mayIgnoreSignedZero(N0)) 3708 return SDValue(); 3709 3710 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z)) 3711 SDValue LHS = N0.getOperand(0); 3712 SDValue MHS = N0.getOperand(1); 3713 SDValue RHS = N0.getOperand(2); 3714 3715 if (LHS.getOpcode() == ISD::FNEG) 3716 LHS = LHS.getOperand(0); 3717 else if (MHS.getOpcode() == ISD::FNEG) 3718 MHS = MHS.getOperand(0); 3719 else 
3720 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS); 3721 3722 if (RHS.getOpcode() != ISD::FNEG) 3723 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 3724 else 3725 RHS = RHS.getOperand(0); 3726 3727 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); 3728 if (!N0.hasOneUse()) 3729 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 3730 return Res; 3731 } 3732 case ISD::FMAXNUM: 3733 case ISD::FMINNUM: 3734 case ISD::FMAXNUM_IEEE: 3735 case ISD::FMINNUM_IEEE: 3736 case AMDGPUISD::FMAX_LEGACY: 3737 case AMDGPUISD::FMIN_LEGACY: { 3738 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y) 3739 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y) 3740 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y) 3741 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y) 3742 3743 SDValue LHS = N0.getOperand(0); 3744 SDValue RHS = N0.getOperand(1); 3745 3746 // 0 doesn't have a negated inline immediate. 3747 // TODO: This constant check should be generalized to other operations. 3748 if (isConstantCostlierToNegate(RHS)) 3749 return SDValue(); 3750 3751 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); 3752 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 3753 unsigned Opposite = inverseMinMax(Opc); 3754 3755 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags()); 3756 if (!N0.hasOneUse()) 3757 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 3758 return Res; 3759 } 3760 case AMDGPUISD::FMED3: { 3761 SDValue Ops[3]; 3762 for (unsigned I = 0; I < 3; ++I) 3763 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags()); 3764 3765 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags()); 3766 if (!N0.hasOneUse()) 3767 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 3768 return Res; 3769 } 3770 case ISD::FP_EXTEND: 3771 case ISD::FTRUNC: 3772 case ISD::FRINT: 3773 case ISD::FNEARBYINT: // XXX - Should fround be handled? 3774 case ISD::FSIN: 3775 case ISD::FCANONICALIZE: 3776 case AMDGPUISD::RCP: 3777 case AMDGPUISD::RCP_LEGACY: 3778 case AMDGPUISD::RCP_IFLAG: 3779 case AMDGPUISD::SIN_HW: { 3780 SDValue CvtSrc = N0.getOperand(0); 3781 if (CvtSrc.getOpcode() == ISD::FNEG) { 3782 // (fneg (fp_extend (fneg x))) -> (fp_extend x) 3783 // (fneg (rcp (fneg x))) -> (rcp x) 3784 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0)); 3785 } 3786 3787 if (!N0.hasOneUse()) 3788 return SDValue(); 3789 3790 // (fneg (fp_extend x)) -> (fp_extend (fneg x)) 3791 // (fneg (rcp x)) -> (rcp (fneg x)) 3792 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); 3793 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags()); 3794 } 3795 case ISD::FP_ROUND: { 3796 SDValue CvtSrc = N0.getOperand(0); 3797 3798 if (CvtSrc.getOpcode() == ISD::FNEG) { 3799 // (fneg (fp_round (fneg x))) -> (fp_round x) 3800 return DAG.getNode(ISD::FP_ROUND, SL, VT, 3801 CvtSrc.getOperand(0), N0.getOperand(1)); 3802 } 3803 3804 if (!N0.hasOneUse()) 3805 return SDValue(); 3806 3807 // (fneg (fp_round x)) -> (fp_round (fneg x)) 3808 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); 3809 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1)); 3810 } 3811 case ISD::FP16_TO_FP: { 3812 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal 3813 // f16, but legalization of f16 fneg ends up pulling it out of the source. 3814 // Put the fneg back as a legal source operation that can be matched later. 
3815 SDLoc SL(N); 3816 3817 SDValue Src = N0.getOperand(0); 3818 EVT SrcVT = Src.getValueType(); 3819 3820 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000) 3821 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src, 3822 DAG.getConstant(0x8000, SL, SrcVT)); 3823 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg); 3824 } 3825 default: 3826 return SDValue(); 3827 } 3828 } 3829 3830 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N, 3831 DAGCombinerInfo &DCI) const { 3832 SelectionDAG &DAG = DCI.DAG; 3833 SDValue N0 = N->getOperand(0); 3834 3835 if (!N0.hasOneUse()) 3836 return SDValue(); 3837 3838 switch (N0.getOpcode()) { 3839 case ISD::FP16_TO_FP: { 3840 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal"); 3841 SDLoc SL(N); 3842 SDValue Src = N0.getOperand(0); 3843 EVT SrcVT = Src.getValueType(); 3844 3845 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff) 3846 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src, 3847 DAG.getConstant(0x7fff, SL, SrcVT)); 3848 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs); 3849 } 3850 default: 3851 return SDValue(); 3852 } 3853 } 3854 3855 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N, 3856 DAGCombinerInfo &DCI) const { 3857 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); 3858 if (!CFP) 3859 return SDValue(); 3860 3861 // XXX - Should this flush denormals? 3862 const APFloat &Val = CFP->getValueAPF(); 3863 APFloat One(Val.getSemantics(), "1.0"); 3864 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); 3865 } 3866 3867 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, 3868 DAGCombinerInfo &DCI) const { 3869 SelectionDAG &DAG = DCI.DAG; 3870 SDLoc DL(N); 3871 3872 switch(N->getOpcode()) { 3873 default: 3874 break; 3875 case ISD::BITCAST: { 3876 EVT DestVT = N->getValueType(0); 3877 3878 // Push casts through vector builds. This helps avoid emitting a large 3879 // number of copies when materializing floating point vector constants. 3880 // 3881 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) => 3882 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y)) 3883 if (DestVT.isVector()) { 3884 SDValue Src = N->getOperand(0); 3885 if (Src.getOpcode() == ISD::BUILD_VECTOR) { 3886 EVT SrcVT = Src.getValueType(); 3887 unsigned NElts = DestVT.getVectorNumElements(); 3888 3889 if (SrcVT.getVectorNumElements() == NElts) { 3890 EVT DestEltVT = DestVT.getVectorElementType(); 3891 3892 SmallVector<SDValue, 8> CastedElts; 3893 SDLoc SL(N); 3894 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) { 3895 SDValue Elt = Src.getOperand(I); 3896 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt)); 3897 } 3898 3899 return DAG.getBuildVector(DestVT, SL, CastedElts); 3900 } 3901 } 3902 } 3903 3904 if (DestVT.getSizeInBits() != 64 && !DestVT.isVector()) 3905 break; 3906 3907 // Fold bitcasts of constants. 
3908 // 3909 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k) 3910 // TODO: Generalize and move to DAGCombiner 3911 SDValue Src = N->getOperand(0); 3912 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) { 3913 if (Src.getValueType() == MVT::i64) { 3914 SDLoc SL(N); 3915 uint64_t CVal = C->getZExtValue(); 3916 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 3917 DAG.getConstant(Lo_32(CVal), SL, MVT::i32), 3918 DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); 3919 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV); 3920 } 3921 } 3922 3923 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) { 3924 const APInt &Val = C->getValueAPF().bitcastToAPInt(); 3925 SDLoc SL(N); 3926 uint64_t CVal = Val.getZExtValue(); 3927 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 3928 DAG.getConstant(Lo_32(CVal), SL, MVT::i32), 3929 DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); 3930 3931 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec); 3932 } 3933 3934 break; 3935 } 3936 case ISD::SHL: { 3937 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 3938 break; 3939 3940 return performShlCombine(N, DCI); 3941 } 3942 case ISD::SRL: { 3943 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 3944 break; 3945 3946 return performSrlCombine(N, DCI); 3947 } 3948 case ISD::SRA: { 3949 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 3950 break; 3951 3952 return performSraCombine(N, DCI); 3953 } 3954 case ISD::TRUNCATE: 3955 return performTruncateCombine(N, DCI); 3956 case ISD::MUL: 3957 return performMulCombine(N, DCI); 3958 case ISD::MULHS: 3959 return performMulhsCombine(N, DCI); 3960 case ISD::MULHU: 3961 return performMulhuCombine(N, DCI); 3962 case AMDGPUISD::MUL_I24: 3963 case AMDGPUISD::MUL_U24: 3964 case AMDGPUISD::MULHI_I24: 3965 case AMDGPUISD::MULHI_U24: { 3966 if (SDValue V = simplifyI24(N, DCI)) 3967 return V; 3968 return SDValue(); 3969 } 3970 case AMDGPUISD::MUL_LOHI_I24: 3971 case AMDGPUISD::MUL_LOHI_U24: 3972 return performMulLoHi24Combine(N, DCI); 3973 case ISD::SELECT: 3974 return performSelectCombine(N, DCI); 3975 case ISD::FNEG: 3976 return performFNegCombine(N, DCI); 3977 case ISD::FABS: 3978 return performFAbsCombine(N, DCI); 3979 case AMDGPUISD::BFE_I32: 3980 case AMDGPUISD::BFE_U32: { 3981 assert(!N->getValueType(0).isVector() && 3982 "Vector handling of BFE not implemented"); 3983 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); 3984 if (!Width) 3985 break; 3986 3987 uint32_t WidthVal = Width->getZExtValue() & 0x1f; 3988 if (WidthVal == 0) 3989 return DAG.getConstant(0, DL, MVT::i32); 3990 3991 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); 3992 if (!Offset) 3993 break; 3994 3995 SDValue BitsFrom = N->getOperand(0); 3996 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; 3997 3998 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; 3999 4000 if (OffsetVal == 0) { 4001 // This is already sign / zero extended, so try to fold away extra BFEs. 4002 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); 4003 4004 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); 4005 if (OpSignBits >= SignBits) 4006 return BitsFrom; 4007 4008 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); 4009 if (Signed) { 4010 // This is a sign_extend_inreg. Replace it to take advantage of existing 4011 // DAG Combines. If not eliminated, we will match back to BFE during 4012 // selection. 4013 4014 // TODO: The sext_inreg of extended types ends, although we can could 4015 // handle them in a single BFE. 
4016 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, 4017 DAG.getValueType(SmallVT)); 4018 } 4019 4020 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); 4021 } 4022 4023 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) { 4024 if (Signed) { 4025 return constantFoldBFE<int32_t>(DAG, 4026 CVal->getSExtValue(), 4027 OffsetVal, 4028 WidthVal, 4029 DL); 4030 } 4031 4032 return constantFoldBFE<uint32_t>(DAG, 4033 CVal->getZExtValue(), 4034 OffsetVal, 4035 WidthVal, 4036 DL); 4037 } 4038 4039 if ((OffsetVal + WidthVal) >= 32 && 4040 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) { 4041 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); 4042 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, 4043 BitsFrom, ShiftVal); 4044 } 4045 4046 if (BitsFrom.hasOneUse()) { 4047 APInt Demanded = APInt::getBitsSet(32, 4048 OffsetVal, 4049 OffsetVal + WidthVal); 4050 4051 KnownBits Known; 4052 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 4053 !DCI.isBeforeLegalizeOps()); 4054 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 4055 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) || 4056 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) { 4057 DCI.CommitTargetLoweringOpt(TLO); 4058 } 4059 } 4060 4061 break; 4062 } 4063 case ISD::LOAD: 4064 return performLoadCombine(N, DCI); 4065 case ISD::STORE: 4066 return performStoreCombine(N, DCI); 4067 case AMDGPUISD::RCP: 4068 case AMDGPUISD::RCP_IFLAG: 4069 return performRcpCombine(N, DCI); 4070 case ISD::AssertZext: 4071 case ISD::AssertSext: 4072 return performAssertSZExtCombine(N, DCI); 4073 } 4074 return SDValue(); 4075 } 4076 4077 //===----------------------------------------------------------------------===// 4078 // Helper functions 4079 //===----------------------------------------------------------------------===// 4080 4081 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 4082 const TargetRegisterClass *RC, 4083 unsigned Reg, EVT VT, 4084 const SDLoc &SL, 4085 bool RawReg) const { 4086 MachineFunction &MF = DAG.getMachineFunction(); 4087 MachineRegisterInfo &MRI = MF.getRegInfo(); 4088 unsigned VReg; 4089 4090 if (!MRI.isLiveIn(Reg)) { 4091 VReg = MRI.createVirtualRegister(RC); 4092 MRI.addLiveIn(Reg, VReg); 4093 } else { 4094 VReg = MRI.getLiveInVirtReg(Reg); 4095 } 4096 4097 if (RawReg) 4098 return DAG.getRegister(VReg, VT); 4099 4100 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); 4101 } 4102 4103 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG, 4104 EVT VT, 4105 const SDLoc &SL, 4106 int64_t Offset) const { 4107 MachineFunction &MF = DAG.getMachineFunction(); 4108 MachineFrameInfo &MFI = MF.getFrameInfo(); 4109 4110 int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true); 4111 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset); 4112 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32); 4113 4114 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4, 4115 MachineMemOperand::MODereferenceable | 4116 MachineMemOperand::MOInvariant); 4117 } 4118 4119 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG, 4120 const SDLoc &SL, 4121 SDValue Chain, 4122 SDValue ArgVal, 4123 int64_t Offset) const { 4124 MachineFunction &MF = DAG.getMachineFunction(); 4125 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset); 4126 4127 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32); 4128 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4, 4129 
MachineMemOperand::MODereferenceable); 4130 return Store; 4131 } 4132 4133 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, 4134 const TargetRegisterClass *RC, 4135 EVT VT, const SDLoc &SL, 4136 const ArgDescriptor &Arg) const { 4137 assert(Arg && "Attempting to load missing argument"); 4138 4139 if (Arg.isRegister()) 4140 return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL); 4141 return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); 4142 } 4143 4144 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( 4145 const MachineFunction &MF, const ImplicitParameter Param) const { 4146 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); 4147 const AMDGPUSubtarget &ST = 4148 AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction()); 4149 unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction()); 4150 unsigned Alignment = ST.getAlignmentForImplicitArgPtr(); 4151 uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) + 4152 ExplicitArgOffset; 4153 switch (Param) { 4154 case GRID_DIM: 4155 return ArgOffset; 4156 case GRID_OFFSET: 4157 return ArgOffset + 4; 4158 } 4159 llvm_unreachable("unexpected implicit parameter type"); 4160 } 4161 4162 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; 4163 4164 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { 4165 switch ((AMDGPUISD::NodeType)Opcode) { 4166 case AMDGPUISD::FIRST_NUMBER: break; 4167 // AMDIL DAG nodes 4168 NODE_NAME_CASE(UMUL); 4169 NODE_NAME_CASE(BRANCH_COND); 4170 4171 // AMDGPU DAG nodes 4172 NODE_NAME_CASE(IF) 4173 NODE_NAME_CASE(ELSE) 4174 NODE_NAME_CASE(LOOP) 4175 NODE_NAME_CASE(CALL) 4176 NODE_NAME_CASE(TC_RETURN) 4177 NODE_NAME_CASE(TRAP) 4178 NODE_NAME_CASE(RET_FLAG) 4179 NODE_NAME_CASE(RETURN_TO_EPILOG) 4180 NODE_NAME_CASE(ENDPGM) 4181 NODE_NAME_CASE(DWORDADDR) 4182 NODE_NAME_CASE(FRACT) 4183 NODE_NAME_CASE(SETCC) 4184 NODE_NAME_CASE(SETREG) 4185 NODE_NAME_CASE(FMA_W_CHAIN) 4186 NODE_NAME_CASE(FMUL_W_CHAIN) 4187 NODE_NAME_CASE(CLAMP) 4188 NODE_NAME_CASE(COS_HW) 4189 NODE_NAME_CASE(SIN_HW) 4190 NODE_NAME_CASE(FMAX_LEGACY) 4191 NODE_NAME_CASE(FMIN_LEGACY) 4192 NODE_NAME_CASE(FMAX3) 4193 NODE_NAME_CASE(SMAX3) 4194 NODE_NAME_CASE(UMAX3) 4195 NODE_NAME_CASE(FMIN3) 4196 NODE_NAME_CASE(SMIN3) 4197 NODE_NAME_CASE(UMIN3) 4198 NODE_NAME_CASE(FMED3) 4199 NODE_NAME_CASE(SMED3) 4200 NODE_NAME_CASE(UMED3) 4201 NODE_NAME_CASE(FDOT2) 4202 NODE_NAME_CASE(URECIP) 4203 NODE_NAME_CASE(DIV_SCALE) 4204 NODE_NAME_CASE(DIV_FMAS) 4205 NODE_NAME_CASE(DIV_FIXUP) 4206 NODE_NAME_CASE(FMAD_FTZ) 4207 NODE_NAME_CASE(TRIG_PREOP) 4208 NODE_NAME_CASE(RCP) 4209 NODE_NAME_CASE(RSQ) 4210 NODE_NAME_CASE(RCP_LEGACY) 4211 NODE_NAME_CASE(RSQ_LEGACY) 4212 NODE_NAME_CASE(RCP_IFLAG) 4213 NODE_NAME_CASE(FMUL_LEGACY) 4214 NODE_NAME_CASE(RSQ_CLAMP) 4215 NODE_NAME_CASE(LDEXP) 4216 NODE_NAME_CASE(FP_CLASS) 4217 NODE_NAME_CASE(DOT4) 4218 NODE_NAME_CASE(CARRY) 4219 NODE_NAME_CASE(BORROW) 4220 NODE_NAME_CASE(BFE_U32) 4221 NODE_NAME_CASE(BFE_I32) 4222 NODE_NAME_CASE(BFI) 4223 NODE_NAME_CASE(BFM) 4224 NODE_NAME_CASE(FFBH_U32) 4225 NODE_NAME_CASE(FFBH_I32) 4226 NODE_NAME_CASE(FFBL_B32) 4227 NODE_NAME_CASE(MUL_U24) 4228 NODE_NAME_CASE(MUL_I24) 4229 NODE_NAME_CASE(MULHI_U24) 4230 NODE_NAME_CASE(MULHI_I24) 4231 NODE_NAME_CASE(MUL_LOHI_U24) 4232 NODE_NAME_CASE(MUL_LOHI_I24) 4233 NODE_NAME_CASE(MAD_U24) 4234 NODE_NAME_CASE(MAD_I24) 4235 NODE_NAME_CASE(MAD_I64_I32) 4236 NODE_NAME_CASE(MAD_U64_U32) 4237 NODE_NAME_CASE(PERM) 4238 NODE_NAME_CASE(TEXTURE_FETCH) 4239 
NODE_NAME_CASE(EXPORT) 4240 NODE_NAME_CASE(EXPORT_DONE) 4241 NODE_NAME_CASE(R600_EXPORT) 4242 NODE_NAME_CASE(CONST_ADDRESS) 4243 NODE_NAME_CASE(REGISTER_LOAD) 4244 NODE_NAME_CASE(REGISTER_STORE) 4245 NODE_NAME_CASE(SAMPLE) 4246 NODE_NAME_CASE(SAMPLEB) 4247 NODE_NAME_CASE(SAMPLED) 4248 NODE_NAME_CASE(SAMPLEL) 4249 NODE_NAME_CASE(CVT_F32_UBYTE0) 4250 NODE_NAME_CASE(CVT_F32_UBYTE1) 4251 NODE_NAME_CASE(CVT_F32_UBYTE2) 4252 NODE_NAME_CASE(CVT_F32_UBYTE3) 4253 NODE_NAME_CASE(CVT_PKRTZ_F16_F32) 4254 NODE_NAME_CASE(CVT_PKNORM_I16_F32) 4255 NODE_NAME_CASE(CVT_PKNORM_U16_F32) 4256 NODE_NAME_CASE(CVT_PK_I16_I32) 4257 NODE_NAME_CASE(CVT_PK_U16_U32) 4258 NODE_NAME_CASE(FP_TO_FP16) 4259 NODE_NAME_CASE(FP16_ZEXT) 4260 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) 4261 NODE_NAME_CASE(CONST_DATA_PTR) 4262 NODE_NAME_CASE(PC_ADD_REL_OFFSET) 4263 NODE_NAME_CASE(KILL) 4264 NODE_NAME_CASE(DUMMY_CHAIN) 4265 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; 4266 NODE_NAME_CASE(INIT_EXEC) 4267 NODE_NAME_CASE(INIT_EXEC_FROM_INPUT) 4268 NODE_NAME_CASE(SENDMSG) 4269 NODE_NAME_CASE(SENDMSGHALT) 4270 NODE_NAME_CASE(INTERP_MOV) 4271 NODE_NAME_CASE(INTERP_P1) 4272 NODE_NAME_CASE(INTERP_P2) 4273 NODE_NAME_CASE(INTERP_P1LL_F16) 4274 NODE_NAME_CASE(INTERP_P1LV_F16) 4275 NODE_NAME_CASE(INTERP_P2_F16) 4276 NODE_NAME_CASE(LOAD_D16_HI) 4277 NODE_NAME_CASE(LOAD_D16_LO) 4278 NODE_NAME_CASE(LOAD_D16_HI_I8) 4279 NODE_NAME_CASE(LOAD_D16_HI_U8) 4280 NODE_NAME_CASE(LOAD_D16_LO_I8) 4281 NODE_NAME_CASE(LOAD_D16_LO_U8) 4282 NODE_NAME_CASE(STORE_MSKOR) 4283 NODE_NAME_CASE(LOAD_CONSTANT) 4284 NODE_NAME_CASE(TBUFFER_STORE_FORMAT) 4285 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3) 4286 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) 4287 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) 4288 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) 4289 NODE_NAME_CASE(DS_ORDERED_COUNT) 4290 NODE_NAME_CASE(ATOMIC_CMP_SWAP) 4291 NODE_NAME_CASE(ATOMIC_INC) 4292 NODE_NAME_CASE(ATOMIC_DEC) 4293 NODE_NAME_CASE(ATOMIC_LOAD_FMIN) 4294 NODE_NAME_CASE(ATOMIC_LOAD_FMAX) 4295 NODE_NAME_CASE(BUFFER_LOAD) 4296 NODE_NAME_CASE(BUFFER_LOAD_UBYTE) 4297 NODE_NAME_CASE(BUFFER_LOAD_USHORT) 4298 NODE_NAME_CASE(BUFFER_LOAD_BYTE) 4299 NODE_NAME_CASE(BUFFER_LOAD_SHORT) 4300 NODE_NAME_CASE(BUFFER_LOAD_FORMAT) 4301 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) 4302 NODE_NAME_CASE(SBUFFER_LOAD) 4303 NODE_NAME_CASE(BUFFER_STORE) 4304 NODE_NAME_CASE(BUFFER_STORE_BYTE) 4305 NODE_NAME_CASE(BUFFER_STORE_SHORT) 4306 NODE_NAME_CASE(BUFFER_STORE_FORMAT) 4307 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) 4308 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) 4309 NODE_NAME_CASE(BUFFER_ATOMIC_ADD) 4310 NODE_NAME_CASE(BUFFER_ATOMIC_SUB) 4311 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN) 4312 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN) 4313 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX) 4314 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX) 4315 NODE_NAME_CASE(BUFFER_ATOMIC_AND) 4316 NODE_NAME_CASE(BUFFER_ATOMIC_OR) 4317 NODE_NAME_CASE(BUFFER_ATOMIC_XOR) 4318 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) 4319 4320 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; 4321 } 4322 return nullptr; 4323 } 4324 4325 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand, 4326 SelectionDAG &DAG, int Enabled, 4327 int &RefinementSteps, 4328 bool &UseOneConstNR, 4329 bool Reciprocal) const { 4330 EVT VT = Operand.getValueType(); 4331 4332 if (VT == MVT::f32) { 4333 RefinementSteps = 0; 4334 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand); 4335 } 4336 4337 // TODO: There is also f64 rsq instruction, but the documentation is less 4338 // clear on its precision. 
4339
4340 return SDValue();
4341 }
4342
4343 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4344 SelectionDAG &DAG, int Enabled,
4345 int &RefinementSteps) const {
4346 EVT VT = Operand.getValueType();
4347
4348 if (VT == MVT::f32) {
4349 // Reciprocal, < 1 ulp error.
4350 //
4351 // This reciprocal approximation converges to < 0.5 ulp error with one
4352 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
4353
4354 RefinementSteps = 0;
4355 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4356 }
4357
4358 // TODO: There is also an f64 rcp instruction, but the documentation is less
4359 // clear on its precision.
4360
4361 return SDValue();
4362 }
4363
4364 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4365 const SDValue Op, KnownBits &Known,
4366 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4367
4368 Known.resetAll(); // Don't know anything.
4369
4370 unsigned Opc = Op.getOpcode();
4371
4372 switch (Opc) {
4373 default:
4374 break;
4375 case AMDGPUISD::CARRY:
4376 case AMDGPUISD::BORROW: {
4377 Known.Zero = APInt::getHighBitsSet(32, 31);
4378 break;
4379 }
4380
4381 case AMDGPUISD::BFE_I32:
4382 case AMDGPUISD::BFE_U32: {
4383 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4384 if (!CWidth)
4385 return;
4386
4387 uint32_t Width = CWidth->getZExtValue() & 0x1f;
4388
4389 if (Opc == AMDGPUISD::BFE_U32)
4390 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4391
4392 break;
4393 }
4394 case AMDGPUISD::FP_TO_FP16:
4395 case AMDGPUISD::FP16_ZEXT: {
4396 unsigned BitWidth = Known.getBitWidth();
4397
4398 // High bits are zero.
4399 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4400 break;
4401 }
4402 case AMDGPUISD::MUL_U24:
4403 case AMDGPUISD::MUL_I24: {
4404 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4405 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4406 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4407 RHSKnown.countMinTrailingZeros();
4408 Known.Zero.setLowBits(std::min(TrailZ, 32u));
4409
4410 // Truncate to 24 bits.
4411 LHSKnown = LHSKnown.trunc(24); 4412 RHSKnown = RHSKnown.trunc(24); 4413 4414 bool Negative = false; 4415 if (Opc == AMDGPUISD::MUL_I24) { 4416 unsigned LHSValBits = 24 - LHSKnown.countMinSignBits(); 4417 unsigned RHSValBits = 24 - RHSKnown.countMinSignBits(); 4418 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u); 4419 if (MaxValBits >= 32) 4420 break; 4421 bool LHSNegative = LHSKnown.isNegative(); 4422 bool LHSPositive = LHSKnown.isNonNegative(); 4423 bool RHSNegative = RHSKnown.isNegative(); 4424 bool RHSPositive = RHSKnown.isNonNegative(); 4425 if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive)) 4426 break; 4427 Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative); 4428 if (Negative) 4429 Known.One.setHighBits(32 - MaxValBits); 4430 else 4431 Known.Zero.setHighBits(32 - MaxValBits); 4432 } else { 4433 unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros(); 4434 unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros(); 4435 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u); 4436 if (MaxValBits >= 32) 4437 break; 4438 Known.Zero.setHighBits(32 - MaxValBits); 4439 } 4440 break; 4441 } 4442 case AMDGPUISD::PERM: { 4443 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 4444 if (!CMask) 4445 return; 4446 4447 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 4448 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); 4449 unsigned Sel = CMask->getZExtValue(); 4450 4451 for (unsigned I = 0; I < 32; I += 8) { 4452 unsigned SelBits = Sel & 0xff; 4453 if (SelBits < 4) { 4454 SelBits *= 8; 4455 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; 4456 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; 4457 } else if (SelBits < 7) { 4458 SelBits = (SelBits & 3) * 8; 4459 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; 4460 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; 4461 } else if (SelBits == 0x0c) { 4462 Known.Zero |= 0xff << I; 4463 } else if (SelBits > 0x0c) { 4464 Known.One |= 0xff << I; 4465 } 4466 Sel >>= 8; 4467 } 4468 break; 4469 } 4470 case AMDGPUISD::BUFFER_LOAD_UBYTE: { 4471 Known.Zero.setHighBits(24); 4472 break; 4473 } 4474 case AMDGPUISD::BUFFER_LOAD_USHORT: { 4475 Known.Zero.setHighBits(16); 4476 break; 4477 } 4478 case ISD::INTRINSIC_WO_CHAIN: { 4479 unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4480 switch (IID) { 4481 case Intrinsic::amdgcn_mbcnt_lo: 4482 case Intrinsic::amdgcn_mbcnt_hi: { 4483 const GCNSubtarget &ST = 4484 DAG.getMachineFunction().getSubtarget<GCNSubtarget>(); 4485 // These return at most the wavefront size - 1. 4486 unsigned Size = Op.getValueType().getSizeInBits(); 4487 Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2()); 4488 break; 4489 } 4490 default: 4491 break; 4492 } 4493 } 4494 } 4495 } 4496 4497 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( 4498 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, 4499 unsigned Depth) const { 4500 switch (Op.getOpcode()) { 4501 case AMDGPUISD::BFE_I32: { 4502 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 4503 if (!Width) 4504 return 1; 4505 4506 unsigned SignBits = 32 - Width->getZExtValue() + 1; 4507 if (!isNullConstant(Op.getOperand(1))) 4508 return SignBits; 4509 4510 // TODO: Could probably figure something out with non-0 offsets. 
4511 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); 4512 return std::max(SignBits, Op0SignBits); 4513 } 4514 4515 case AMDGPUISD::BFE_U32: { 4516 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 4517 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1; 4518 } 4519 4520 case AMDGPUISD::CARRY: 4521 case AMDGPUISD::BORROW: 4522 return 31; 4523 case AMDGPUISD::BUFFER_LOAD_BYTE: 4524 return 25; 4525 case AMDGPUISD::BUFFER_LOAD_SHORT: 4526 return 17; 4527 case AMDGPUISD::BUFFER_LOAD_UBYTE: 4528 return 24; 4529 case AMDGPUISD::BUFFER_LOAD_USHORT: 4530 return 16; 4531 case AMDGPUISD::FP_TO_FP16: 4532 case AMDGPUISD::FP16_ZEXT: 4533 return 16; 4534 default: 4535 return 1; 4536 } 4537 } 4538 4539 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, 4540 const SelectionDAG &DAG, 4541 bool SNaN, 4542 unsigned Depth) const { 4543 unsigned Opcode = Op.getOpcode(); 4544 switch (Opcode) { 4545 case AMDGPUISD::FMIN_LEGACY: 4546 case AMDGPUISD::FMAX_LEGACY: { 4547 if (SNaN) 4548 return true; 4549 4550 // TODO: Can check no nans on one of the operands for each one, but which 4551 // one? 4552 return false; 4553 } 4554 case AMDGPUISD::FMUL_LEGACY: 4555 case AMDGPUISD::CVT_PKRTZ_F16_F32: { 4556 if (SNaN) 4557 return true; 4558 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && 4559 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); 4560 } 4561 case AMDGPUISD::FMED3: 4562 case AMDGPUISD::FMIN3: 4563 case AMDGPUISD::FMAX3: 4564 case AMDGPUISD::FMAD_FTZ: { 4565 if (SNaN) 4566 return true; 4567 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && 4568 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 4569 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1); 4570 } 4571 case AMDGPUISD::CVT_F32_UBYTE0: 4572 case AMDGPUISD::CVT_F32_UBYTE1: 4573 case AMDGPUISD::CVT_F32_UBYTE2: 4574 case AMDGPUISD::CVT_F32_UBYTE3: 4575 return true; 4576 4577 case AMDGPUISD::RCP: 4578 case AMDGPUISD::RSQ: 4579 case AMDGPUISD::RCP_LEGACY: 4580 case AMDGPUISD::RSQ_LEGACY: 4581 case AMDGPUISD::RSQ_CLAMP: { 4582 if (SNaN) 4583 return true; 4584 4585 // TODO: Need is known positive check. 4586 return false; 4587 } 4588 case AMDGPUISD::LDEXP: 4589 case AMDGPUISD::FRACT: { 4590 if (SNaN) 4591 return true; 4592 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); 4593 } 4594 case AMDGPUISD::DIV_SCALE: 4595 case AMDGPUISD::DIV_FMAS: 4596 case AMDGPUISD::DIV_FIXUP: 4597 case AMDGPUISD::TRIG_PREOP: 4598 // TODO: Refine on operands. 
4599 return SNaN; 4600 case AMDGPUISD::SIN_HW: 4601 case AMDGPUISD::COS_HW: { 4602 // TODO: Need check for infinity 4603 return SNaN; 4604 } 4605 case ISD::INTRINSIC_WO_CHAIN: { 4606 unsigned IntrinsicID 4607 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4608 // TODO: Handle more intrinsics 4609 switch (IntrinsicID) { 4610 case Intrinsic::amdgcn_cubeid: 4611 return true; 4612 4613 case Intrinsic::amdgcn_frexp_mant: { 4614 if (SNaN) 4615 return true; 4616 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); 4617 } 4618 case Intrinsic::amdgcn_cvt_pkrtz: { 4619 if (SNaN) 4620 return true; 4621 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 4622 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1); 4623 } 4624 case Intrinsic::amdgcn_fdot2: 4625 // TODO: Refine on operand 4626 return SNaN; 4627 default: 4628 return false; 4629 } 4630 } 4631 default: 4632 return false; 4633 } 4634 } 4635 4636 TargetLowering::AtomicExpansionKind 4637 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { 4638 switch (RMW->getOperation()) { 4639 case AtomicRMWInst::Nand: 4640 case AtomicRMWInst::FAdd: 4641 case AtomicRMWInst::FSub: 4642 return AtomicExpansionKind::CmpXChg; 4643 default: 4644 return AtomicExpansionKind::None; 4645 } 4646 } 4647