//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "R600MachineFunctionInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  KnownBits Known = DAG.computeKnownBits(Op);
  return VT.getSizeInBits() - Known.countMinLeadingZeros();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  // In order for this to be a signed 24-bit value, bit 23 must
  // be a sign bit.
  return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
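  // For example (illustrative): with the promotions below, a DAG load of f32
  // is legalized into an i32 load whose result is bitcast back to f32,
  //
  //   (f32 (load addr))  -->  (f32 (bitcast (i32 (load addr))))
  //
  // so only the integer load/store patterns need to exist in the .td files.
  // The same applies to the vector and 64-bit cases registered below.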
74 setOperationAction(ISD::LOAD, MVT::f32, Promote); 75 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); 76 77 setOperationAction(ISD::LOAD, MVT::v2f32, Promote); 78 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); 79 80 setOperationAction(ISD::LOAD, MVT::v3f32, Promote); 81 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32); 82 83 setOperationAction(ISD::LOAD, MVT::v4f32, Promote); 84 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); 85 86 setOperationAction(ISD::LOAD, MVT::v5f32, Promote); 87 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32); 88 89 setOperationAction(ISD::LOAD, MVT::v8f32, Promote); 90 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); 91 92 setOperationAction(ISD::LOAD, MVT::v16f32, Promote); 93 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); 94 95 setOperationAction(ISD::LOAD, MVT::v32f32, Promote); 96 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32); 97 98 setOperationAction(ISD::LOAD, MVT::i64, Promote); 99 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); 100 101 setOperationAction(ISD::LOAD, MVT::v2i64, Promote); 102 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); 103 104 setOperationAction(ISD::LOAD, MVT::f64, Promote); 105 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32); 106 107 setOperationAction(ISD::LOAD, MVT::v2f64, Promote); 108 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32); 109 110 setOperationAction(ISD::LOAD, MVT::v4i64, Promote); 111 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32); 112 113 setOperationAction(ISD::LOAD, MVT::v4f64, Promote); 114 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32); 115 116 setOperationAction(ISD::LOAD, MVT::v8i64, Promote); 117 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32); 118 119 setOperationAction(ISD::LOAD, MVT::v8f64, Promote); 120 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32); 121 122 setOperationAction(ISD::LOAD, MVT::v16i64, Promote); 123 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32); 124 125 setOperationAction(ISD::LOAD, MVT::v16f64, Promote); 126 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32); 127 128 // There are no 64-bit extloads. These should be done as a 32-bit extload and 129 // an extension to 64-bit. 
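  // For example (illustrative), an i8 value sign-extended into an i64 register
  // is selected roughly as
  //
  //   (i64 (sextload i8, addr)) --> (i64 (sign_extend (i32 (sextload i8, addr))))
  //
  // rather than as a single 64-bit extending load.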
130 for (MVT VT : MVT::integer_valuetypes()) { 131 setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); 132 setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); 133 setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); 134 } 135 136 for (MVT VT : MVT::integer_valuetypes()) { 137 if (VT == MVT::i64) 138 continue; 139 140 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 141 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); 142 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); 143 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); 144 145 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 146 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); 147 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); 148 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); 149 150 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); 151 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); 152 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); 153 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); 154 } 155 156 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { 157 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); 158 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); 159 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); 160 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); 161 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); 162 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); 163 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); 164 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); 165 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); 166 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand); 167 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand); 168 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand); 169 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); 170 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); 171 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); 172 } 173 174 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); 175 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); 176 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand); 177 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); 178 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); 179 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand); 180 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand); 181 182 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); 183 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); 184 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); 185 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand); 186 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand); 187 188 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); 189 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); 190 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); 191 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); 192 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand); 193 194 setOperationAction(ISD::STORE, MVT::f32, Promote); 195 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); 196 197 setOperationAction(ISD::STORE, MVT::v2f32, Promote); 198 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); 199 200 setOperationAction(ISD::STORE, MVT::v3f32, Promote); 201 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32); 202 203 
setOperationAction(ISD::STORE, MVT::v4f32, Promote); 204 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); 205 206 setOperationAction(ISD::STORE, MVT::v5f32, Promote); 207 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32); 208 209 setOperationAction(ISD::STORE, MVT::v8f32, Promote); 210 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); 211 212 setOperationAction(ISD::STORE, MVT::v16f32, Promote); 213 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); 214 215 setOperationAction(ISD::STORE, MVT::v32f32, Promote); 216 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32); 217 218 setOperationAction(ISD::STORE, MVT::i64, Promote); 219 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); 220 221 setOperationAction(ISD::STORE, MVT::v2i64, Promote); 222 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); 223 224 setOperationAction(ISD::STORE, MVT::f64, Promote); 225 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32); 226 227 setOperationAction(ISD::STORE, MVT::v2f64, Promote); 228 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32); 229 230 setOperationAction(ISD::STORE, MVT::v4i64, Promote); 231 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32); 232 233 setOperationAction(ISD::STORE, MVT::v4f64, Promote); 234 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32); 235 236 setOperationAction(ISD::STORE, MVT::v8i64, Promote); 237 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32); 238 239 setOperationAction(ISD::STORE, MVT::v8f64, Promote); 240 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32); 241 242 setOperationAction(ISD::STORE, MVT::v16i64, Promote); 243 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32); 244 245 setOperationAction(ISD::STORE, MVT::v16f64, Promote); 246 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32); 247 248 setTruncStoreAction(MVT::i64, MVT::i1, Expand); 249 setTruncStoreAction(MVT::i64, MVT::i8, Expand); 250 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 251 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 252 253 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); 254 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand); 255 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand); 256 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); 257 258 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 259 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); 260 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand); 261 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); 262 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); 263 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand); 264 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand); 265 266 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 267 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 268 269 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); 270 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); 271 272 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand); 273 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand); 274 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand); 275 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand); 276 277 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand); 278 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand); 279 280 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand); 281 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand); 282 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand); 283 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand); 284 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand); 285 
setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand); 286 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand); 287 288 setOperationAction(ISD::Constant, MVT::i32, Legal); 289 setOperationAction(ISD::Constant, MVT::i64, Legal); 290 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 291 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 292 293 setOperationAction(ISD::BR_JT, MVT::Other, Expand); 294 setOperationAction(ISD::BRIND, MVT::Other, Expand); 295 296 // This is totally unsupported, just custom lower to produce an error. 297 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 298 299 // Library functions. These default to Expand, but we have instructions 300 // for them. 301 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 302 setOperationAction(ISD::FEXP2, MVT::f32, Legal); 303 setOperationAction(ISD::FPOW, MVT::f32, Legal); 304 setOperationAction(ISD::FLOG2, MVT::f32, Legal); 305 setOperationAction(ISD::FABS, MVT::f32, Legal); 306 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 307 setOperationAction(ISD::FRINT, MVT::f32, Legal); 308 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 309 setOperationAction(ISD::FMINNUM, MVT::f32, Legal); 310 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); 311 312 setOperationAction(ISD::FROUND, MVT::f32, Custom); 313 setOperationAction(ISD::FROUND, MVT::f64, Custom); 314 315 setOperationAction(ISD::FLOG, MVT::f32, Custom); 316 setOperationAction(ISD::FLOG10, MVT::f32, Custom); 317 setOperationAction(ISD::FEXP, MVT::f32, Custom); 318 319 320 setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); 321 setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); 322 323 setOperationAction(ISD::FREM, MVT::f32, Custom); 324 setOperationAction(ISD::FREM, MVT::f64, Custom); 325 326 // Expand to fneg + fadd. 
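  // i.e. (fsub x, y) is rewritten by the generic expansion as (fadd x, (fneg y)).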
327 setOperationAction(ISD::FSUB, MVT::f64, Expand); 328 329 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom); 330 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom); 331 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 332 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); 333 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom); 334 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom); 335 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); 336 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); 337 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); 338 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); 339 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom); 340 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom); 341 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); 342 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); 343 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom); 344 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom); 345 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); 346 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); 347 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom); 348 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom); 349 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom); 350 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom); 351 352 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 353 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); 354 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom); 355 356 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; 357 for (MVT VT : ScalarIntVTs) { 358 // These should use [SU]DIVREM, so set them to expand 359 setOperationAction(ISD::SDIV, VT, Expand); 360 setOperationAction(ISD::UDIV, VT, Expand); 361 setOperationAction(ISD::SREM, VT, Expand); 362 setOperationAction(ISD::UREM, VT, Expand); 363 364 // GPU does not have divrem function for signed or unsigned. 365 setOperationAction(ISD::SDIVREM, VT, Custom); 366 setOperationAction(ISD::UDIVREM, VT, Custom); 367 368 // GPU does not have [S|U]MUL_LOHI functions as a single instruction. 369 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 370 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 371 372 setOperationAction(ISD::BSWAP, VT, Expand); 373 setOperationAction(ISD::CTTZ, VT, Expand); 374 setOperationAction(ISD::CTLZ, VT, Expand); 375 376 // AMDGPU uses ADDC/SUBC/ADDE/SUBE 377 setOperationAction(ISD::ADDC, VT, Legal); 378 setOperationAction(ISD::SUBC, VT, Legal); 379 setOperationAction(ISD::ADDE, VT, Legal); 380 setOperationAction(ISD::SUBE, VT, Legal); 381 } 382 383 // The hardware supports 32-bit FSHR, but not FSHL. 384 setOperationAction(ISD::FSHR, MVT::i32, Legal); 385 386 // The hardware supports 32-bit ROTR, but not ROTL. 
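  // A rotate-left can be phrased in terms of rotate-right, e.g. for 32 bits
  // (rotl x, n) == (rotr x, (32 - n) & 31); marking ROTL as Expand lets the
  // generic legalizer perform an equivalent rewrite (illustrative sketch, not
  // target-specific code).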
387 setOperationAction(ISD::ROTL, MVT::i32, Expand); 388 setOperationAction(ISD::ROTL, MVT::i64, Expand); 389 setOperationAction(ISD::ROTR, MVT::i64, Expand); 390 391 setOperationAction(ISD::MUL, MVT::i64, Expand); 392 setOperationAction(ISD::MULHU, MVT::i64, Expand); 393 setOperationAction(ISD::MULHS, MVT::i64, Expand); 394 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); 395 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); 396 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); 397 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 398 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); 399 400 setOperationAction(ISD::SMIN, MVT::i32, Legal); 401 setOperationAction(ISD::UMIN, MVT::i32, Legal); 402 setOperationAction(ISD::SMAX, MVT::i32, Legal); 403 setOperationAction(ISD::UMAX, MVT::i32, Legal); 404 405 setOperationAction(ISD::CTTZ, MVT::i64, Custom); 406 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom); 407 setOperationAction(ISD::CTLZ, MVT::i64, Custom); 408 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); 409 410 static const MVT::SimpleValueType VectorIntTypes[] = { 411 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32 412 }; 413 414 for (MVT VT : VectorIntTypes) { 415 // Expand the following operations for the current type by default. 416 setOperationAction(ISD::ADD, VT, Expand); 417 setOperationAction(ISD::AND, VT, Expand); 418 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 419 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 420 setOperationAction(ISD::MUL, VT, Expand); 421 setOperationAction(ISD::MULHU, VT, Expand); 422 setOperationAction(ISD::MULHS, VT, Expand); 423 setOperationAction(ISD::OR, VT, Expand); 424 setOperationAction(ISD::SHL, VT, Expand); 425 setOperationAction(ISD::SRA, VT, Expand); 426 setOperationAction(ISD::SRL, VT, Expand); 427 setOperationAction(ISD::ROTL, VT, Expand); 428 setOperationAction(ISD::ROTR, VT, Expand); 429 setOperationAction(ISD::SUB, VT, Expand); 430 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 431 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 432 setOperationAction(ISD::SDIV, VT, Expand); 433 setOperationAction(ISD::UDIV, VT, Expand); 434 setOperationAction(ISD::SREM, VT, Expand); 435 setOperationAction(ISD::UREM, VT, Expand); 436 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 437 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 438 setOperationAction(ISD::SDIVREM, VT, Custom); 439 setOperationAction(ISD::UDIVREM, VT, Expand); 440 setOperationAction(ISD::SELECT, VT, Expand); 441 setOperationAction(ISD::VSELECT, VT, Expand); 442 setOperationAction(ISD::SELECT_CC, VT, Expand); 443 setOperationAction(ISD::XOR, VT, Expand); 444 setOperationAction(ISD::BSWAP, VT, Expand); 445 setOperationAction(ISD::CTPOP, VT, Expand); 446 setOperationAction(ISD::CTTZ, VT, Expand); 447 setOperationAction(ISD::CTLZ, VT, Expand); 448 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); 449 setOperationAction(ISD::SETCC, VT, Expand); 450 } 451 452 static const MVT::SimpleValueType FloatVectorTypes[] = { 453 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32 454 }; 455 456 for (MVT VT : FloatVectorTypes) { 457 setOperationAction(ISD::FABS, VT, Expand); 458 setOperationAction(ISD::FMINNUM, VT, Expand); 459 setOperationAction(ISD::FMAXNUM, VT, Expand); 460 setOperationAction(ISD::FADD, VT, Expand); 461 setOperationAction(ISD::FCEIL, VT, Expand); 462 setOperationAction(ISD::FCOS, VT, Expand); 463 setOperationAction(ISD::FDIV, VT, Expand); 464 setOperationAction(ISD::FEXP2, VT, Expand); 465 setOperationAction(ISD::FEXP, VT, Expand); 
466 setOperationAction(ISD::FLOG2, VT, Expand); 467 setOperationAction(ISD::FREM, VT, Expand); 468 setOperationAction(ISD::FLOG, VT, Expand); 469 setOperationAction(ISD::FLOG10, VT, Expand); 470 setOperationAction(ISD::FPOW, VT, Expand); 471 setOperationAction(ISD::FFLOOR, VT, Expand); 472 setOperationAction(ISD::FTRUNC, VT, Expand); 473 setOperationAction(ISD::FMUL, VT, Expand); 474 setOperationAction(ISD::FMA, VT, Expand); 475 setOperationAction(ISD::FRINT, VT, Expand); 476 setOperationAction(ISD::FNEARBYINT, VT, Expand); 477 setOperationAction(ISD::FSQRT, VT, Expand); 478 setOperationAction(ISD::FSIN, VT, Expand); 479 setOperationAction(ISD::FSUB, VT, Expand); 480 setOperationAction(ISD::FNEG, VT, Expand); 481 setOperationAction(ISD::VSELECT, VT, Expand); 482 setOperationAction(ISD::SELECT_CC, VT, Expand); 483 setOperationAction(ISD::FCOPYSIGN, VT, Expand); 484 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); 485 setOperationAction(ISD::SETCC, VT, Expand); 486 setOperationAction(ISD::FCANONICALIZE, VT, Expand); 487 } 488 489 // This causes using an unrolled select operation rather than expansion with 490 // bit operations. This is in general better, but the alternative using BFI 491 // instructions may be better if the select sources are SGPRs. 492 setOperationAction(ISD::SELECT, MVT::v2f32, Promote); 493 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32); 494 495 setOperationAction(ISD::SELECT, MVT::v3f32, Promote); 496 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32); 497 498 setOperationAction(ISD::SELECT, MVT::v4f32, Promote); 499 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32); 500 501 setOperationAction(ISD::SELECT, MVT::v5f32, Promote); 502 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32); 503 504 // There are no libcalls of any kind. 505 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) 506 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr); 507 508 setSchedulingPreference(Sched::RegPressure); 509 setJumpIsExpensive(true); 510 511 // FIXME: This is only partially true. If we have to do vector compares, any 512 // SGPR pair can be a condition register. If we have a uniform condition, we 513 // are better off doing SALU operations, where there is only one SCC. For now, 514 // we don't have a way of knowing during instruction selection if a condition 515 // will be uniform and we always use vector compares. Assume we are using 516 // vector compares until that is fixed. 517 setHasMultipleConditionRegisters(true); 518 519 setMinCmpXchgSizeInBits(32); 520 setSupportsUnalignedAtomics(false); 521 522 PredictableSelectIsExpensive = false; 523 524 // We want to find all load dependencies for long chains of stores to enable 525 // merging into very wide vectors. The problem is with vectors with > 4 526 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16 527 // vectors are a legal type, even though we have to split the loads 528 // usually. When we can more precisely specify load legality per address 529 // space, we should be able to make FindBetterChain/MergeConsecutiveStores 530 // smarter so that they can figure out what to do in 2 iterations without all 531 // N > 4 stores on the same chain. 532 GatherAllAliasesMaxDepth = 16; 533 534 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry 535 // about these during lowering. 536 MaxStoresPerMemcpy = 0xffffffff; 537 MaxStoresPerMemmove = 0xffffffff; 538 MaxStoresPerMemset = 0xffffffff; 539 540 // The expansion for 64-bit division is enormous. 
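  // addBypassSlowDiv(64, 32) below asks the generic bypass-slow-division
  // transform to emit a runtime check: when both operands of a 64-bit divide
  // happen to fit in 32 bits, a cheap 32-bit divide is executed instead, and
  // the huge 64-bit expansion only runs on the slow path. (A sketch of the
  // generic behavior, not AMDGPU-specific code.)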
541 if (AMDGPUBypassSlowDiv) 542 addBypassSlowDiv(64, 32); 543 544 setTargetDAGCombine(ISD::BITCAST); 545 setTargetDAGCombine(ISD::SHL); 546 setTargetDAGCombine(ISD::SRA); 547 setTargetDAGCombine(ISD::SRL); 548 setTargetDAGCombine(ISD::TRUNCATE); 549 setTargetDAGCombine(ISD::MUL); 550 setTargetDAGCombine(ISD::MULHU); 551 setTargetDAGCombine(ISD::MULHS); 552 setTargetDAGCombine(ISD::SELECT); 553 setTargetDAGCombine(ISD::SELECT_CC); 554 setTargetDAGCombine(ISD::STORE); 555 setTargetDAGCombine(ISD::FADD); 556 setTargetDAGCombine(ISD::FSUB); 557 setTargetDAGCombine(ISD::FNEG); 558 setTargetDAGCombine(ISD::FABS); 559 setTargetDAGCombine(ISD::AssertZext); 560 setTargetDAGCombine(ISD::AssertSext); 561 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 562 } 563 564 //===----------------------------------------------------------------------===// 565 // Target Information 566 //===----------------------------------------------------------------------===// 567 568 LLVM_READNONE 569 static bool fnegFoldsIntoOp(unsigned Opc) { 570 switch (Opc) { 571 case ISD::FADD: 572 case ISD::FSUB: 573 case ISD::FMUL: 574 case ISD::FMA: 575 case ISD::FMAD: 576 case ISD::FMINNUM: 577 case ISD::FMAXNUM: 578 case ISD::FMINNUM_IEEE: 579 case ISD::FMAXNUM_IEEE: 580 case ISD::FSIN: 581 case ISD::FTRUNC: 582 case ISD::FRINT: 583 case ISD::FNEARBYINT: 584 case ISD::FCANONICALIZE: 585 case AMDGPUISD::RCP: 586 case AMDGPUISD::RCP_LEGACY: 587 case AMDGPUISD::RCP_IFLAG: 588 case AMDGPUISD::SIN_HW: 589 case AMDGPUISD::FMUL_LEGACY: 590 case AMDGPUISD::FMIN_LEGACY: 591 case AMDGPUISD::FMAX_LEGACY: 592 case AMDGPUISD::FMED3: 593 return true; 594 default: 595 return false; 596 } 597 } 598 599 /// \p returns true if the operation will definitely need to use a 64-bit 600 /// encoding, and thus will use a VOP3 encoding regardless of the source 601 /// modifiers. 602 LLVM_READONLY 603 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) { 604 return N->getNumOperands() > 2 || VT == MVT::f64; 605 } 606 607 // Most FP instructions support source modifiers, but this could be refined 608 // slightly. 609 LLVM_READONLY 610 static bool hasSourceMods(const SDNode *N) { 611 if (isa<MemSDNode>(N)) 612 return false; 613 614 switch (N->getOpcode()) { 615 case ISD::CopyToReg: 616 case ISD::SELECT: 617 case ISD::FDIV: 618 case ISD::FREM: 619 case ISD::INLINEASM: 620 case ISD::INLINEASM_BR: 621 case AMDGPUISD::DIV_SCALE: 622 case ISD::INTRINSIC_W_CHAIN: 623 624 // TODO: Should really be looking at the users of the bitcast. These are 625 // problematic because bitcasts are used to legalize all stores to integer 626 // types. 627 case ISD::BITCAST: 628 return false; 629 case ISD::INTRINSIC_WO_CHAIN: { 630 switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) { 631 case Intrinsic::amdgcn_interp_p1: 632 case Intrinsic::amdgcn_interp_p2: 633 case Intrinsic::amdgcn_interp_mov: 634 case Intrinsic::amdgcn_interp_p1_f16: 635 case Intrinsic::amdgcn_interp_p2_f16: 636 return false; 637 default: 638 return true; 639 } 640 } 641 default: 642 return true; 643 } 644 } 645 646 bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N, 647 unsigned CostThreshold) { 648 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus 649 // it is truly free to use a source modifier in all cases. If there are 650 // multiple users but for each one will necessitate using VOP3, there will be 651 // a code size increase. Try to avoid increasing code size unless we know it 652 // will save on the instruction count. 
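  // (Illustrative: a VOP2-encoded instruction such as v_add_f32 is 4 bytes and
  // has no neg/abs source-modifier bits, while its VOP3 form is 8 bytes, so
  // folding an fneg into a use that could otherwise stay VOP2 costs 4 bytes.)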
653 unsigned NumMayIncreaseSize = 0; 654 MVT VT = N->getValueType(0).getScalarType().getSimpleVT(); 655 656 // XXX - Should this limit number of uses to check? 657 for (const SDNode *U : N->uses()) { 658 if (!hasSourceMods(U)) 659 return false; 660 661 if (!opMustUseVOP3Encoding(U, VT)) { 662 if (++NumMayIncreaseSize > CostThreshold) 663 return false; 664 } 665 } 666 667 return true; 668 } 669 670 EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, 671 ISD::NodeType ExtendKind) const { 672 assert(!VT.isVector() && "only scalar expected"); 673 674 // Round to the next multiple of 32-bits. 675 unsigned Size = VT.getSizeInBits(); 676 if (Size <= 32) 677 return MVT::i32; 678 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32)); 679 } 680 681 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const { 682 return MVT::i32; 683 } 684 685 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { 686 return true; 687 } 688 689 // The backend supports 32 and 64 bit floating point immediates. 690 // FIXME: Why are we reporting vectors of FP immediates as legal? 691 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, 692 bool ForCodeSize) const { 693 EVT ScalarVT = VT.getScalarType(); 694 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 || 695 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts())); 696 } 697 698 // We don't want to shrink f64 / f32 constants. 699 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { 700 EVT ScalarVT = VT.getScalarType(); 701 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); 702 } 703 704 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, 705 ISD::LoadExtType ExtTy, 706 EVT NewVT) const { 707 // TODO: This may be worth removing. Check regression tests for diffs. 708 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT)) 709 return false; 710 711 unsigned NewSize = NewVT.getStoreSizeInBits(); 712 713 // If we are reducing to a 32-bit load or a smaller multi-dword load, 714 // this is always better. 715 if (NewSize >= 32) 716 return true; 717 718 EVT OldVT = N->getValueType(0); 719 unsigned OldSize = OldVT.getStoreSizeInBits(); 720 721 MemSDNode *MN = cast<MemSDNode>(N); 722 unsigned AS = MN->getAddressSpace(); 723 // Do not shrink an aligned scalar load to sub-dword. 724 // Scalar engine cannot do sub-dword loads. 725 if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 && 726 (AS == AMDGPUAS::CONSTANT_ADDRESS || 727 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || 728 (isa<LoadSDNode>(N) && 729 AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) && 730 AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand())) 731 return false; 732 733 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar 734 // extloads, so doing one requires using a buffer_load. In cases where we 735 // still couldn't use a scalar load, using the wider load shouldn't really 736 // hurt anything. 737 738 // If the old size already had to be an extload, there's no harm in continuing 739 // to reduce the width. 
740 return (OldSize < 32); 741 } 742 743 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy, 744 const SelectionDAG &DAG, 745 const MachineMemOperand &MMO) const { 746 747 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits()); 748 749 if (LoadTy.getScalarType() == MVT::i32) 750 return false; 751 752 unsigned LScalarSize = LoadTy.getScalarSizeInBits(); 753 unsigned CastScalarSize = CastTy.getScalarSizeInBits(); 754 755 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32)) 756 return false; 757 758 bool Fast = false; 759 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 760 CastTy, MMO, &Fast) && 761 Fast; 762 } 763 764 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also 765 // profitable with the expansion for 64-bit since it's generally good to 766 // speculate things. 767 // FIXME: These should really have the size as a parameter. 768 bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const { 769 return true; 770 } 771 772 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { 773 return true; 774 } 775 776 bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const { 777 switch (N->getOpcode()) { 778 default: 779 return false; 780 case ISD::EntryToken: 781 case ISD::TokenFactor: 782 return true; 783 case ISD::INTRINSIC_WO_CHAIN: 784 { 785 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 786 switch (IntrID) { 787 default: 788 return false; 789 case Intrinsic::amdgcn_readfirstlane: 790 case Intrinsic::amdgcn_readlane: 791 return true; 792 } 793 } 794 break; 795 case ISD::LOAD: 796 { 797 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() == 798 AMDGPUAS::CONSTANT_ADDRESS_32BIT) 799 return true; 800 return false; 801 } 802 break; 803 } 804 } 805 806 TargetLowering::NegatibleCost 807 AMDGPUTargetLowering::getNegatibleCost(SDValue Op, SelectionDAG &DAG, 808 bool LegalOperations, bool ForCodeSize, 809 unsigned Depth) const { 810 switch (Op.getOpcode()) { 811 case ISD::FMA: 812 case ISD::FMAD: { 813 // Negating a fma is not free if it has users without source mods. 814 if (!allUsesHaveSourceMods(Op.getNode())) 815 return NegatibleCost::Expensive; 816 break; 817 } 818 default: 819 break; 820 } 821 822 return TargetLowering::getNegatibleCost(Op, DAG, LegalOperations, ForCodeSize, 823 Depth); 824 } 825 826 //===---------------------------------------------------------------------===// 827 // Target Properties 828 //===---------------------------------------------------------------------===// 829 830 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { 831 assert(VT.isFloatingPoint()); 832 833 // Packed operations do not have a fabs modifier. 834 return VT == MVT::f32 || VT == MVT::f64 || 835 (Subtarget->has16BitInsts() && VT == MVT::f16); 836 } 837 838 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { 839 assert(VT.isFloatingPoint()); 840 return VT == MVT::f32 || VT == MVT::f64 || 841 (Subtarget->has16BitInsts() && VT == MVT::f16) || 842 (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16); 843 } 844 845 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, 846 unsigned NumElem, 847 unsigned AS) const { 848 return true; 849 } 850 851 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const { 852 // There are few operations which truly have vector input operands. 
  // Any vector operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov of 0 needed to form the 64-bit value is
  // free. As used, this will enable reducing 64-bit operations to 32-bit, which
  // is always good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  return isZExtFree(Val.getValueType(), VT2);
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, only pairs of 32-bit ones, and there
  // is only a limited number of native 64-bit operations. Shrinking an operation
  // to fit in a single 32-bit register should always be helpful. As currently
  // used, this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32 bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly, we need to get the original type sizes
/// from the LLVM IR Function and fix up the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments().
///
/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
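/// To make the example above concrete (illustrative): for a kernel argument of
/// type v8i8, Ins contains eight entries; each entry has Ins[x].ArgVT == v8i8
/// (the pre-split IR type), the per-part memory type deduced here is i8, and
/// that i8 is passed as the LocVT while Ins[x].VT is passed as the ValVT.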
995 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( 996 CCState &State, 997 const SmallVectorImpl<ISD::InputArg> &Ins) const { 998 const MachineFunction &MF = State.getMachineFunction(); 999 const Function &Fn = MF.getFunction(); 1000 LLVMContext &Ctx = Fn.getParent()->getContext(); 1001 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); 1002 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn); 1003 CallingConv::ID CC = Fn.getCallingConv(); 1004 1005 unsigned MaxAlign = 1; 1006 uint64_t ExplicitArgOffset = 0; 1007 const DataLayout &DL = Fn.getParent()->getDataLayout(); 1008 1009 unsigned InIndex = 0; 1010 1011 for (const Argument &Arg : Fn.args()) { 1012 Type *BaseArgTy = Arg.getType(); 1013 unsigned Align = DL.getABITypeAlignment(BaseArgTy); 1014 MaxAlign = std::max(Align, MaxAlign); 1015 unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy); 1016 1017 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset; 1018 ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize; 1019 1020 // We're basically throwing away everything passed into us and starting over 1021 // to get accurate in-memory offsets. The "PartOffset" is completely useless 1022 // to us as computed in Ins. 1023 // 1024 // We also need to figure out what type legalization is trying to do to get 1025 // the correct memory offsets. 1026 1027 SmallVector<EVT, 16> ValueVTs; 1028 SmallVector<uint64_t, 16> Offsets; 1029 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset); 1030 1031 for (unsigned Value = 0, NumValues = ValueVTs.size(); 1032 Value != NumValues; ++Value) { 1033 uint64_t BasePartOffset = Offsets[Value]; 1034 1035 EVT ArgVT = ValueVTs[Value]; 1036 EVT MemVT = ArgVT; 1037 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT); 1038 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT); 1039 1040 if (NumRegs == 1) { 1041 // This argument is not split, so the IR type is the memory type. 1042 if (ArgVT.isExtended()) { 1043 // We have an extended type, like i24, so we should just use the 1044 // register type. 1045 MemVT = RegisterVT; 1046 } else { 1047 MemVT = ArgVT; 1048 } 1049 } else if (ArgVT.isVector() && RegisterVT.isVector() && 1050 ArgVT.getScalarType() == RegisterVT.getScalarType()) { 1051 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements()); 1052 // We have a vector value which has been split into a vector with 1053 // the same scalar type, but fewer elements. This should handle 1054 // all the floating-point vector types. 1055 MemVT = RegisterVT; 1056 } else if (ArgVT.isVector() && 1057 ArgVT.getVectorNumElements() == NumRegs) { 1058 // This arg has been split so that each element is stored in a separate 1059 // register. 1060 MemVT = ArgVT.getScalarType(); 1061 } else if (ArgVT.isExtended()) { 1062 // We have an extended type, like i65. 1063 MemVT = RegisterVT; 1064 } else { 1065 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs; 1066 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0); 1067 if (RegisterVT.isInteger()) { 1068 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits); 1069 } else if (RegisterVT.isVector()) { 1070 assert(!RegisterVT.getScalarType().isFloatingPoint()); 1071 unsigned NumElements = RegisterVT.getVectorNumElements(); 1072 assert(MemoryBits % NumElements == 0); 1073 // This vector type has been split into another vector type with 1074 // a different elements size. 
1075 EVT ScalarVT = EVT::getIntegerVT(State.getContext(), 1076 MemoryBits / NumElements); 1077 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements); 1078 } else { 1079 llvm_unreachable("cannot deduce memory type."); 1080 } 1081 } 1082 1083 // Convert one element vectors to scalar. 1084 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1) 1085 MemVT = MemVT.getScalarType(); 1086 1087 // Round up vec3/vec5 argument. 1088 if (MemVT.isVector() && !MemVT.isPow2VectorType()) { 1089 assert(MemVT.getVectorNumElements() == 3 || 1090 MemVT.getVectorNumElements() == 5); 1091 MemVT = MemVT.getPow2VectorType(State.getContext()); 1092 } else if (!MemVT.isSimple() && !MemVT.isVector()) { 1093 MemVT = MemVT.getRoundIntegerType(State.getContext()); 1094 } 1095 1096 unsigned PartOffset = 0; 1097 for (unsigned i = 0; i != NumRegs; ++i) { 1098 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT, 1099 BasePartOffset + PartOffset, 1100 MemVT.getSimpleVT(), 1101 CCValAssign::Full)); 1102 PartOffset += MemVT.getStoreSize(); 1103 } 1104 } 1105 } 1106 } 1107 1108 SDValue AMDGPUTargetLowering::LowerReturn( 1109 SDValue Chain, CallingConv::ID CallConv, 1110 bool isVarArg, 1111 const SmallVectorImpl<ISD::OutputArg> &Outs, 1112 const SmallVectorImpl<SDValue> &OutVals, 1113 const SDLoc &DL, SelectionDAG &DAG) const { 1114 // FIXME: Fails for r600 tests 1115 //assert(!isVarArg && Outs.empty() && OutVals.empty() && 1116 // "wave terminate should not have return values"); 1117 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain); 1118 } 1119 1120 //===---------------------------------------------------------------------===// 1121 // Target specific lowering 1122 //===---------------------------------------------------------------------===// 1123 1124 /// Selects the correct CCAssignFn for a given CallingConvention value. 1125 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC, 1126 bool IsVarArg) { 1127 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg); 1128 } 1129 1130 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, 1131 bool IsVarArg) { 1132 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg); 1133 } 1134 1135 SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain, 1136 SelectionDAG &DAG, 1137 MachineFrameInfo &MFI, 1138 int ClobberedFI) const { 1139 SmallVector<SDValue, 8> ArgChains; 1140 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); 1141 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; 1142 1143 // Include the original chain at the beginning of the list. When this is 1144 // used by target LowerCall hooks, this helps legalize find the 1145 // CALLSEQ_BEGIN node. 1146 ArgChains.push_back(Chain); 1147 1148 // Add a chain value for each stack argument corresponding 1149 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), 1150 UE = DAG.getEntryNode().getNode()->use_end(); 1151 U != UE; ++U) { 1152 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) { 1153 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) { 1154 if (FI->getIndex() < 0) { 1155 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); 1156 int64_t InLastByte = InFirstByte; 1157 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; 1158 1159 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || 1160 (FirstByte <= InFirstByte && InFirstByte <= LastByte)) 1161 ArgChains.push_back(SDValue(L, 1)); 1162 } 1163 } 1164 } 1165 } 1166 1167 // Build a tokenfactor for all the chains. 
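  // (A TokenFactor node completes only after all of its chain operands do, so
  // the call sequence built on top of it is ordered after every stack-argument
  // load that might read one of the clobbered slots.)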
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG:
    return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
  case ISD::FLOG10:
    return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
  case ISD::FEXP:
    return lowerFEXP(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  default:
    return;
  }
}

bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) {
  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer())
    return false;

  return !isa<UndefValue>(GVar->getInitializer());
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning);
      DAG.getContext()->diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                        Trap, DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getUNDEF(Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
1315 if (!hasDefinedInitializer(GV)) { 1316 unsigned Offset = MFI->allocateLDSGlobal(DL, *GV); 1317 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType()); 1318 } 1319 } 1320 1321 const Function &Fn = DAG.getMachineFunction().getFunction(); 1322 DiagnosticInfoUnsupported BadInit( 1323 Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc()); 1324 DAG.getContext()->diagnose(BadInit); 1325 return SDValue(); 1326 } 1327 1328 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, 1329 SelectionDAG &DAG) const { 1330 SmallVector<SDValue, 8> Args; 1331 1332 EVT VT = Op.getValueType(); 1333 if (VT == MVT::v4i16 || VT == MVT::v4f16) { 1334 SDLoc SL(Op); 1335 SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0)); 1336 SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1)); 1337 1338 SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi }); 1339 return DAG.getNode(ISD::BITCAST, SL, VT, BV); 1340 } 1341 1342 for (const SDUse &U : Op->ops()) 1343 DAG.ExtractVectorElements(U.get(), Args); 1344 1345 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args); 1346 } 1347 1348 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, 1349 SelectionDAG &DAG) const { 1350 1351 SmallVector<SDValue, 8> Args; 1352 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 1353 EVT VT = Op.getValueType(); 1354 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, 1355 VT.getVectorNumElements()); 1356 1357 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args); 1358 } 1359 1360 /// Generate Min/Max node 1361 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT, 1362 SDValue LHS, SDValue RHS, 1363 SDValue True, SDValue False, 1364 SDValue CC, 1365 DAGCombinerInfo &DCI) const { 1366 if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) 1367 return SDValue(); 1368 1369 SelectionDAG &DAG = DCI.DAG; 1370 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 1371 switch (CCOpcode) { 1372 case ISD::SETOEQ: 1373 case ISD::SETONE: 1374 case ISD::SETUNE: 1375 case ISD::SETNE: 1376 case ISD::SETUEQ: 1377 case ISD::SETEQ: 1378 case ISD::SETFALSE: 1379 case ISD::SETFALSE2: 1380 case ISD::SETTRUE: 1381 case ISD::SETTRUE2: 1382 case ISD::SETUO: 1383 case ISD::SETO: 1384 break; 1385 case ISD::SETULE: 1386 case ISD::SETULT: { 1387 if (LHS == True) 1388 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); 1389 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); 1390 } 1391 case ISD::SETOLE: 1392 case ISD::SETOLT: 1393 case ISD::SETLE: 1394 case ISD::SETLT: { 1395 // Ordered. Assume ordered for undefined. 1396 1397 // Only do this after legalization to avoid interfering with other combines 1398 // which might occur. 1399 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && 1400 !DCI.isCalledByLegalizer()) 1401 return SDValue(); 1402 1403 // We need to permute the operands to get the correct NaN behavior. The 1404 // selected operand is the second one based on the failing compare with NaN, 1405 // so permute it based on the compare type the hardware uses. 
1406 if (LHS == True) 1407 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); 1408 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); 1409 } 1410 case ISD::SETUGE: 1411 case ISD::SETUGT: { 1412 if (LHS == True) 1413 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); 1414 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); 1415 } 1416 case ISD::SETGT: 1417 case ISD::SETGE: 1418 case ISD::SETOGE: 1419 case ISD::SETOGT: { 1420 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && 1421 !DCI.isCalledByLegalizer()) 1422 return SDValue(); 1423 1424 if (LHS == True) 1425 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); 1426 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); 1427 } 1428 case ISD::SETCC_INVALID: 1429 llvm_unreachable("Invalid setcc condcode!"); 1430 } 1431 return SDValue(); 1432 } 1433 1434 std::pair<SDValue, SDValue> 1435 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const { 1436 SDLoc SL(Op); 1437 1438 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); 1439 1440 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 1441 const SDValue One = DAG.getConstant(1, SL, MVT::i32); 1442 1443 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); 1444 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); 1445 1446 return std::make_pair(Lo, Hi); 1447 } 1448 1449 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const { 1450 SDLoc SL(Op); 1451 1452 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); 1453 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 1454 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); 1455 } 1456 1457 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const { 1458 SDLoc SL(Op); 1459 1460 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); 1461 const SDValue One = DAG.getConstant(1, SL, MVT::i32); 1462 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); 1463 } 1464 1465 // Split a vector type into two parts. The first part is a power of two vector. 1466 // The second part is whatever is left over, and is a scalar if it would 1467 // otherwise be a 1-vector. 1468 std::pair<EVT, EVT> 1469 AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const { 1470 EVT LoVT, HiVT; 1471 EVT EltVT = VT.getVectorElementType(); 1472 unsigned NumElts = VT.getVectorNumElements(); 1473 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2); 1474 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts); 1475 HiVT = NumElts - LoNumElts == 1 1476 ? EltVT 1477 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts); 1478 return std::make_pair(LoVT, HiVT); 1479 } 1480 1481 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be 1482 // scalar. 1483 std::pair<SDValue, SDValue> 1484 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL, 1485 const EVT &LoVT, const EVT &HiVT, 1486 SelectionDAG &DAG) const { 1487 assert(LoVT.getVectorNumElements() + 1488 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <= 1489 N.getValueType().getVectorNumElements() && 1490 "More vector elements requested than available!"); 1491 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, 1492 DAG.getVectorIdxConstant(0, DL)); 1493 SDValue Hi = DAG.getNode( 1494 HiVT.isVector() ? 
ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL, 1495 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL)); 1496 return std::make_pair(Lo, Hi); 1497 } 1498 1499 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, 1500 SelectionDAG &DAG) const { 1501 LoadSDNode *Load = cast<LoadSDNode>(Op); 1502 EVT VT = Op.getValueType(); 1503 SDLoc SL(Op); 1504 1505 1506 // If this is a 2 element vector, we really want to scalarize and not create 1507 // weird 1 element vectors. 1508 if (VT.getVectorNumElements() == 2) { 1509 SDValue Ops[2]; 1510 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG); 1511 return DAG.getMergeValues(Ops, SL); 1512 } 1513 1514 SDValue BasePtr = Load->getBasePtr(); 1515 EVT MemVT = Load->getMemoryVT(); 1516 1517 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); 1518 1519 EVT LoVT, HiVT; 1520 EVT LoMemVT, HiMemVT; 1521 SDValue Lo, Hi; 1522 1523 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG); 1524 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG); 1525 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG); 1526 1527 unsigned Size = LoMemVT.getStoreSize(); 1528 unsigned BaseAlign = Load->getAlignment(); 1529 unsigned HiAlign = MinAlign(BaseAlign, Size); 1530 1531 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, 1532 Load->getChain(), BasePtr, SrcValue, LoMemVT, 1533 BaseAlign, Load->getMemOperand()->getFlags()); 1534 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size); 1535 SDValue HiLoad = 1536 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), 1537 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), 1538 HiMemVT, HiAlign, Load->getMemOperand()->getFlags()); 1539 1540 SDValue Join; 1541 if (LoVT == HiVT) { 1542 // This is the case that the vector is power of two so was evenly split. 1543 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad); 1544 } else { 1545 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad, 1546 DAG.getVectorIdxConstant(0, SL)); 1547 Join = DAG.getNode( 1548 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL, 1549 VT, Join, HiLoad, 1550 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL)); 1551 } 1552 1553 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other, 1554 LoLoad.getValue(1), HiLoad.getValue(1))}; 1555 1556 return DAG.getMergeValues(Ops, SL); 1557 } 1558 1559 // Widen a vector load from vec3 to vec4. 
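// E.g. a v3i32 load is emitted below as a v4i32 load of the same chain and
// pointer, and the original three elements are recovered with
// EXTRACT_SUBVECTOR while the extra lane is ignored.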
1560 SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op, 1561 SelectionDAG &DAG) const { 1562 LoadSDNode *Load = cast<LoadSDNode>(Op); 1563 EVT VT = Op.getValueType(); 1564 assert(VT.getVectorNumElements() == 3); 1565 SDValue BasePtr = Load->getBasePtr(); 1566 EVT MemVT = Load->getMemoryVT(); 1567 SDLoc SL(Op); 1568 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); 1569 unsigned BaseAlign = Load->getAlignment(); 1570 1571 EVT WideVT = 1572 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4); 1573 EVT WideMemVT = 1574 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4); 1575 SDValue WideLoad = DAG.getExtLoad( 1576 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue, 1577 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags()); 1578 return DAG.getMergeValues( 1579 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad, 1580 DAG.getVectorIdxConstant(0, SL)), 1581 WideLoad.getValue(1)}, 1582 SL); 1583 } 1584 1585 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, 1586 SelectionDAG &DAG) const { 1587 StoreSDNode *Store = cast<StoreSDNode>(Op); 1588 SDValue Val = Store->getValue(); 1589 EVT VT = Val.getValueType(); 1590 1591 // If this is a 2 element vector, we really want to scalarize and not create 1592 // weird 1 element vectors. 1593 if (VT.getVectorNumElements() == 2) 1594 return scalarizeVectorStore(Store, DAG); 1595 1596 EVT MemVT = Store->getMemoryVT(); 1597 SDValue Chain = Store->getChain(); 1598 SDValue BasePtr = Store->getBasePtr(); 1599 SDLoc SL(Op); 1600 1601 EVT LoVT, HiVT; 1602 EVT LoMemVT, HiMemVT; 1603 SDValue Lo, Hi; 1604 1605 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG); 1606 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG); 1607 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG); 1608 1609 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize()); 1610 1611 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); 1612 unsigned BaseAlign = Store->getAlignment(); 1613 unsigned Size = LoMemVT.getStoreSize(); 1614 unsigned HiAlign = MinAlign(BaseAlign, Size); 1615 1616 SDValue LoStore = 1617 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign, 1618 Store->getMemOperand()->getFlags()); 1619 SDValue HiStore = 1620 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), 1621 HiMemVT, HiAlign, Store->getMemOperand()->getFlags()); 1622 1623 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); 1624 } 1625 1626 // This is a shortcut for integer division because we have fast i32<->f32 1627 // conversions, and fast f32 reciprocal instructions. The fractional part of a 1628 // float is enough to accurately represent up to a 24-bit signed integer. 1629 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, 1630 bool Sign) const { 1631 SDLoc DL(Op); 1632 EVT VT = Op.getValueType(); 1633 SDValue LHS = Op.getOperand(0); 1634 SDValue RHS = Op.getOperand(1); 1635 MVT IntVT = MVT::i32; 1636 MVT FltVT = MVT::f32; 1637 1638 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS); 1639 if (LHSSignBits < 9) 1640 return SDValue(); 1641 1642 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS); 1643 if (RHSSignBits < 9) 1644 return SDValue(); 1645 1646 unsigned BitSize = VT.getSizeInBits(); 1647 unsigned SignBits = std::min(LHSSignBits, RHSSignBits); 1648 unsigned DivBits = BitSize - SignBits; 1649 if (Sign) 1650 ++DivBits; 1651 1652 ISD::NodeType ToFp = Sign ? 
ISD::SINT_TO_FP : ISD::UINT_TO_FP; 1653 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; 1654 1655 SDValue jq = DAG.getConstant(1, DL, IntVT); 1656 1657 if (Sign) { 1658 // char|short jq = ia ^ ib; 1659 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); 1660 1661 // jq = jq >> (bitsize - 2) 1662 jq = DAG.getNode(ISD::SRA, DL, VT, jq, 1663 DAG.getConstant(BitSize - 2, DL, VT)); 1664 1665 // jq = jq | 0x1 1666 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT)); 1667 } 1668 1669 // int ia = (int)LHS; 1670 SDValue ia = LHS; 1671 1672 // int ib, (int)RHS; 1673 SDValue ib = RHS; 1674 1675 // float fa = (float)ia; 1676 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); 1677 1678 // float fb = (float)ib; 1679 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); 1680 1681 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, 1682 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); 1683 1684 // fq = trunc(fq); 1685 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq); 1686 1687 // float fqneg = -fq; 1688 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); 1689 1690 MachineFunction &MF = DAG.getMachineFunction(); 1691 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); 1692 1693 // float fr = mad(fqneg, fb, fa); 1694 unsigned OpCode = !MFI->getMode().allFP32Denormals() ? 1695 (unsigned)ISD::FMAD : 1696 (unsigned)AMDGPUISD::FMAD_FTZ; 1697 1698 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa); 1699 1700 // int iq = (int)fq; 1701 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); 1702 1703 // fr = fabs(fr); 1704 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr); 1705 1706 // fb = fabs(fb); 1707 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb); 1708 1709 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 1710 1711 // int cv = fr >= fb; 1712 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); 1713 1714 // jq = (cv ? jq : 0); 1715 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT)); 1716 1717 // dst = iq + jq; 1718 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); 1719 1720 // Rem needs compensation, it's easier to recompute it 1721 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); 1722 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); 1723 1724 // Truncate to number of bits this divide really is. 
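// E.g. with DivBits == 16 the unsigned mask below is 0xffff, while the
// signed path sign-extends in-register from bit 15 instead.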
1725 if (Sign) { 1726 SDValue InRegSize 1727 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits)); 1728 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize); 1729 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize); 1730 } else { 1731 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT); 1732 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask); 1733 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask); 1734 } 1735 1736 return DAG.getMergeValues({ Div, Rem }, DL); 1737 } 1738 1739 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, 1740 SelectionDAG &DAG, 1741 SmallVectorImpl<SDValue> &Results) const { 1742 SDLoc DL(Op); 1743 EVT VT = Op.getValueType(); 1744 1745 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64"); 1746 1747 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); 1748 1749 SDValue One = DAG.getConstant(1, DL, HalfVT); 1750 SDValue Zero = DAG.getConstant(0, DL, HalfVT); 1751 1752 //HiLo split 1753 SDValue LHS = Op.getOperand(0); 1754 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); 1755 SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One); 1756 1757 SDValue RHS = Op.getOperand(1); 1758 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); 1759 SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One); 1760 1761 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && 1762 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { 1763 1764 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), 1765 LHS_Lo, RHS_Lo); 1766 1767 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero}); 1768 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero}); 1769 1770 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV)); 1771 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM)); 1772 return; 1773 } 1774 1775 if (isTypeLegal(MVT::i64)) { 1776 MachineFunction &MF = DAG.getMachineFunction(); 1777 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1778 1779 // Compute denominator reciprocal. 1780 unsigned FMAD = !MFI->getMode().allFP32Denormals() ? 
1781 (unsigned)ISD::FMAD : 1782 (unsigned)AMDGPUISD::FMAD_FTZ; 1783 1784 1785 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo); 1786 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi); 1787 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi, 1788 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32), 1789 Cvt_Lo); 1790 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1); 1791 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp, 1792 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32)); 1793 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1, 1794 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32)); 1795 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2); 1796 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc, 1797 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32), 1798 Mul1); 1799 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2); 1800 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc); 1801 SDValue Rcp64 = DAG.getBitcast(VT, 1802 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi})); 1803 1804 SDValue Zero64 = DAG.getConstant(0, DL, VT); 1805 SDValue One64 = DAG.getConstant(1, DL, VT); 1806 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1); 1807 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1); 1808 1809 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS); 1810 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64); 1811 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1); 1812 SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, 1813 Zero); 1814 SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, 1815 One); 1816 1817 SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo, 1818 Mulhi1_Lo, Zero1); 1819 SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi, 1820 Mulhi1_Hi, Add1_Lo.getValue(1)); 1821 SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi); 1822 SDValue Add1 = DAG.getBitcast(VT, 1823 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi})); 1824 1825 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1); 1826 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2); 1827 SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, 1828 Zero); 1829 SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, 1830 One); 1831 1832 SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo, 1833 Mulhi2_Lo, Zero1); 1834 SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc, 1835 Mulhi2_Hi, Add1_Lo.getValue(1)); 1836 SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC, 1837 Zero, Add2_Lo.getValue(1)); 1838 SDValue Add2 = DAG.getBitcast(VT, 1839 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi})); 1840 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2); 1841 1842 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3); 1843 1844 SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero); 1845 SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One); 1846 SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo, 1847 Mul3_Lo, Zero1); 1848 SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi, 1849 Mul3_Hi, Sub1_Lo.getValue(1)); 1850 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi); 1851 SDValue Sub1 = 
DAG.getBitcast(VT, 1852 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi})); 1853 1854 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT); 1855 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero, 1856 ISD::SETUGE); 1857 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero, 1858 ISD::SETUGE); 1859 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ); 1860 1861 // TODO: Here and below portions of the code can be enclosed into if/endif. 1862 // Currently control flow is unconditional and we have 4 selects after 1863 // potential endif to substitute PHIs. 1864 1865 // if C3 != 0 ... 1866 SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo, 1867 RHS_Lo, Zero1); 1868 SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi, 1869 RHS_Hi, Sub1_Lo.getValue(1)); 1870 SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi, 1871 Zero, Sub2_Lo.getValue(1)); 1872 SDValue Sub2 = DAG.getBitcast(VT, 1873 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi})); 1874 1875 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64); 1876 1877 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero, 1878 ISD::SETUGE); 1879 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero, 1880 ISD::SETUGE); 1881 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ); 1882 1883 // if (C6 != 0) 1884 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64); 1885 1886 SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo, 1887 RHS_Lo, Zero1); 1888 SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi, 1889 RHS_Hi, Sub2_Lo.getValue(1)); 1890 SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi, 1891 Zero, Sub3_Lo.getValue(1)); 1892 SDValue Sub3 = DAG.getBitcast(VT, 1893 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi})); 1894 1895 // endif C6 1896 // endif C3 1897 1898 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE); 1899 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE); 1900 1901 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE); 1902 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE); 1903 1904 Results.push_back(Div); 1905 Results.push_back(Rem); 1906 1907 return; 1908 } 1909 1910 // r600 expansion.
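// The r600 path below first computes speculative high-half results, then
// runs a restoring shift-subtract loop over the low 32 bits: shift the
// remainder left, bring in the next dividend bit, and conditionally
// subtract RHS while setting the matching quotient bit.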
1911 // Get Speculative values 1912 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); 1913 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); 1914 1915 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ); 1916 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero}); 1917 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM); 1918 1919 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ); 1920 SDValue DIV_Lo = Zero; 1921 1922 const unsigned halfBitWidth = HalfVT.getSizeInBits(); 1923 1924 for (unsigned i = 0; i < halfBitWidth; ++i) { 1925 const unsigned bitPos = halfBitWidth - i - 1; 1926 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT); 1927 // Get value of high bit 1928 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); 1929 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One); 1930 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); 1931 1932 // Shift 1933 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT)); 1934 // Add LHS high bit 1935 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); 1936 1937 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT); 1938 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE); 1939 1940 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); 1941 1942 // Update REM 1943 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); 1944 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); 1945 } 1946 1947 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi}); 1948 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV); 1949 Results.push_back(DIV); 1950 Results.push_back(REM); 1951 } 1952 1953 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, 1954 SelectionDAG &DAG) const { 1955 SDLoc DL(Op); 1956 EVT VT = Op.getValueType(); 1957 1958 if (VT == MVT::i64) { 1959 SmallVector<SDValue, 2> Results; 1960 LowerUDIVREM64(Op, DAG, Results); 1961 return DAG.getMergeValues(Results, DL); 1962 } 1963 1964 if (VT == MVT::i32) { 1965 if (SDValue Res = LowerDIVREM24(Op, DAG, false)) 1966 return Res; 1967 } 1968 1969 SDValue Num = Op.getOperand(0); 1970 SDValue Den = Op.getOperand(1); 1971 1972 // RCP = URECIP(Den) = 2^32 / Den + e 1973 // e is rounding error. 1974 SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); 1975 1976 // RCP_LO = mul(RCP, Den) */ 1977 SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den); 1978 1979 // RCP_HI = mulhu (RCP, Den) */ 1980 SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); 1981 1982 // NEG_RCP_LO = -RCP_LO 1983 SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), 1984 RCP_LO); 1985 1986 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 1987 SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), 1988 NEG_RCP_LO, RCP_LO, 1989 ISD::SETEQ); 1990 // Calculate the rounding error from the URECIP instruction 1991 // E = mulhu(ABS_RCP_LO, RCP) 1992 SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); 1993 1994 // RCP_A_E = RCP + E 1995 SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); 1996 1997 // RCP_S_E = RCP - E 1998 SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); 1999 2000 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_SUB_E) 2001 SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), 2002 RCP_A_E, RCP_S_E, 2003 ISD::SETEQ); 2004 // Quotient = mulhu(Tmp0, Num) 2005 SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); 2006 2007 // Num_S_Remainder = Quotient * Den 2008 SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den); 2009 2010 // Remainder = Num - Num_S_Remainder 2011 SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); 2012 2013 // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) 2014 SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, 2015 DAG.getConstant(-1, DL, VT), 2016 DAG.getConstant(0, DL, VT), 2017 ISD::SETUGE); 2018 // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) 2019 SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, 2020 Num_S_Remainder, 2021 DAG.getConstant(-1, DL, VT), 2022 DAG.getConstant(0, DL, VT), 2023 ISD::SETUGE); 2024 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 2025 SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, 2026 Remainder_GE_Zero); 2027 2028 // Calculate Division result: 2029 2030 // Quotient_A_One = Quotient + 1 2031 SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, 2032 DAG.getConstant(1, DL, VT)); 2033 2034 // Quotient_S_One = Quotient - 1 2035 SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, 2036 DAG.getConstant(1, DL, VT)); 2037 2038 // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) 2039 SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), 2040 Quotient, Quotient_A_One, ISD::SETEQ); 2041 2042 // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) 2043 Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), 2044 Quotient_S_One, Div, ISD::SETEQ); 2045 2046 // Calculate Rem result: 2047 2048 // Remainder_S_Den = Remainder - Den 2049 SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); 2050 2051 // Remainder_A_Den = Remainder + Den 2052 SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); 2053 2054 // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) 2055 SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), 2056 Remainder, Remainder_S_Den, ISD::SETEQ); 2057 2058 // Rem = (Remainder_GE_Zero == 0 ? 
Remainder_A_Den : Rem) 2059 Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), 2060 Remainder_A_Den, Rem, ISD::SETEQ); 2061 SDValue Ops[2] = { 2062 Div, 2063 Rem 2064 }; 2065 return DAG.getMergeValues(Ops, DL); 2066 } 2067 2068 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, 2069 SelectionDAG &DAG) const { 2070 SDLoc DL(Op); 2071 EVT VT = Op.getValueType(); 2072 2073 SDValue LHS = Op.getOperand(0); 2074 SDValue RHS = Op.getOperand(1); 2075 2076 SDValue Zero = DAG.getConstant(0, DL, VT); 2077 SDValue NegOne = DAG.getConstant(-1, DL, VT); 2078 2079 if (VT == MVT::i32) { 2080 if (SDValue Res = LowerDIVREM24(Op, DAG, true)) 2081 return Res; 2082 } 2083 2084 if (VT == MVT::i64 && 2085 DAG.ComputeNumSignBits(LHS) > 32 && 2086 DAG.ComputeNumSignBits(RHS) > 32) { 2087 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); 2088 2089 //HiLo split 2090 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); 2091 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); 2092 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), 2093 LHS_Lo, RHS_Lo); 2094 SDValue Res[2] = { 2095 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), 2096 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) 2097 }; 2098 return DAG.getMergeValues(Res, DL); 2099 } 2100 2101 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); 2102 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); 2103 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); 2104 SDValue RSign = LHSign; // Remainder sign is the same as LHS 2105 2106 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); 2107 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); 2108 2109 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); 2110 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); 2111 2112 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); 2113 SDValue Rem = Div.getValue(1); 2114 2115 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); 2116 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); 2117 2118 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); 2119 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); 2120 2121 SDValue Res[2] = { 2122 Div, 2123 Rem 2124 }; 2125 return DAG.getMergeValues(Res, DL); 2126 } 2127 2128 // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y)) 2129 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { 2130 SDLoc SL(Op); 2131 EVT VT = Op.getValueType(); 2132 SDValue X = Op.getOperand(0); 2133 SDValue Y = Op.getOperand(1); 2134 2135 // TODO: Should this propagate fast-math-flags? 
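// Worked example: frem(5.5, 2.0) -> div = 2.75, trunc = 2.0, mul = 4.0,
// result = 5.5 - 4.0 = 1.5.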
2136 2137 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); 2138 SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); 2139 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); 2140 2141 return DAG.getNode(ISD::FSUB, SL, VT, X, Mul); 2142 } 2143 2144 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { 2145 SDLoc SL(Op); 2146 SDValue Src = Op.getOperand(0); 2147 2148 // result = trunc(src) 2149 // if (src > 0.0 && src != result) 2150 // result += 1.0 2151 2152 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 2153 2154 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); 2155 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); 2156 2157 EVT SetCCVT = 2158 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); 2159 2160 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT); 2161 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); 2162 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); 2163 2164 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); 2165 // TODO: Should this propagate fast-math-flags? 2166 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); 2167 } 2168 2169 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, 2170 SelectionDAG &DAG) { 2171 const unsigned FractBits = 52; 2172 const unsigned ExpBits = 11; 2173 2174 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, 2175 Hi, 2176 DAG.getConstant(FractBits - 32, SL, MVT::i32), 2177 DAG.getConstant(ExpBits, SL, MVT::i32)); 2178 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, 2179 DAG.getConstant(1023, SL, MVT::i32)); 2180 2181 return Exp; 2182 } 2183 2184 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { 2185 SDLoc SL(Op); 2186 SDValue Src = Op.getOperand(0); 2187 2188 assert(Op.getValueType() == MVT::f64); 2189 2190 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 2191 const SDValue One = DAG.getConstant(1, SL, MVT::i32); 2192 2193 SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); 2194 2195 // Extract the upper half, since this is where we will find the sign and 2196 // exponent. 2197 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One); 2198 2199 SDValue Exp = extractF64Exponent(Hi, SL, DAG); 2200 2201 const unsigned FractBits = 52; 2202 2203 // Extract the sign bit. 2204 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32); 2205 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); 2206 2207 // Extend back to 64-bits. 
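// Element 0 of the build_vector is the low word, so the extracted sign bit
// ends up in bit 63 of the i64 value.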
2208 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit}); 2209 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); 2210 2211 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); 2212 const SDValue FractMask 2213 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64); 2214 2215 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); 2216 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); 2217 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); 2218 2219 EVT SetCCVT = 2220 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); 2221 2222 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32); 2223 2224 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); 2225 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); 2226 2227 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0); 2228 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1); 2229 2230 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); 2231 } 2232 2233 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { 2234 SDLoc SL(Op); 2235 SDValue Src = Op.getOperand(0); 2236 2237 assert(Op.getValueType() == MVT::f64); 2238 2239 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2240 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); 2241 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); 2242 2243 // TODO: Should this propagate fast-math-flags? 2244 2245 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); 2246 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); 2247 2248 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); 2249 2250 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2251 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); 2252 2253 EVT SetCCVT = 2254 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); 2255 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); 2256 2257 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); 2258 } 2259 2260 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const { 2261 // FNEARBYINT and FRINT are the same, except in their handling of FP 2262 // exceptions. Those aren't really meaningful for us, and OpenCL only has 2263 // rint, so just treat them as equivalent. 2264 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); 2265 } 2266 2267 // XXX - May require not supporting f32 denormals? 2268 2269 // Don't handle v2f16. The extra instructions to scalarize and repack around the 2270 // compare and vselect end up producing worse code than scalarizing the whole 2271 // operation. 2272 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { 2273 SDLoc SL(Op); 2274 SDValue X = Op.getOperand(0); 2275 EVT VT = Op.getValueType(); 2276 2277 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X); 2278 2279 // TODO: Should this propagate fast-math-flags? 
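// Worked example: x = 2.5 -> T = 2.0, AbsDiff = 0.5, which is >= 0.5, so
// copysign(1.0, x) = 1.0 is added and the result is 3.0; halfway cases round
// away from zero.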
2280 2281 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T); 2282 2283 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff); 2284 2285 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT); 2286 const SDValue One = DAG.getConstantFP(1.0, SL, VT); 2287 const SDValue Half = DAG.getConstantFP(0.5, SL, VT); 2288 2289 SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X); 2290 2291 EVT SetCCVT = 2292 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2293 2294 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); 2295 2296 SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero); 2297 2298 return DAG.getNode(ISD::FADD, SL, VT, T, Sel); 2299 } 2300 2301 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { 2302 SDLoc SL(Op); 2303 SDValue Src = Op.getOperand(0); 2304 2305 // result = trunc(src); 2306 // if (src < 0.0 && src != result) 2307 // result += -1.0. 2308 2309 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 2310 2311 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); 2312 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64); 2313 2314 EVT SetCCVT = 2315 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); 2316 2317 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); 2318 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); 2319 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); 2320 2321 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); 2322 // TODO: Should this propagate fast-math-flags? 2323 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); 2324 } 2325 2326 SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG, 2327 double Log2BaseInverted) const { 2328 EVT VT = Op.getValueType(); 2329 2330 SDLoc SL(Op); 2331 SDValue Operand = Op.getOperand(0); 2332 SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand); 2333 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT); 2334 2335 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand); 2336 } 2337 2338 // exp2(M_LOG2E_F * f); 2339 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { 2340 EVT VT = Op.getValueType(); 2341 SDLoc SL(Op); 2342 SDValue Src = Op.getOperand(0); 2343 2344 const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT); 2345 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags()); 2346 return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags()); 2347 } 2348 2349 static bool isCtlzOpc(unsigned Opc) { 2350 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; 2351 } 2352 2353 static bool isCttzOpc(unsigned Opc) { 2354 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; 2355 } 2356 2357 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const { 2358 SDLoc SL(Op); 2359 SDValue Src = Op.getOperand(0); 2360 bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF || 2361 Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; 2362 2363 unsigned ISDOpc, NewOpc; 2364 if (isCtlzOpc(Op.getOpcode())) { 2365 ISDOpc = ISD::CTLZ_ZERO_UNDEF; 2366 NewOpc = AMDGPUISD::FFBH_U32; 2367 } else if (isCttzOpc(Op.getOpcode())) { 2368 ISDOpc = ISD::CTTZ_ZERO_UNDEF; 2369 NewOpc = AMDGPUISD::FFBL_B32; 2370 } else 2371 llvm_unreachable("Unexpected OPCode!!!"); 2372 2373 2374 if (ZeroUndef && Src.getValueType() == MVT::i32) 2375 return DAG.getNode(NewOpc, SL, MVT::i32, Src); 2376 2377 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, 
Src); 2378 2379 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 2380 const SDValue One = DAG.getConstant(1, SL, MVT::i32); 2381 2382 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); 2383 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); 2384 2385 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), 2386 *DAG.getContext(), MVT::i32); 2387 2388 SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo; 2389 SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ); 2390 2391 SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo); 2392 SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi); 2393 2394 const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32); 2395 SDValue Add, NewOpr; 2396 if (isCtlzOpc(Op.getOpcode())) { 2397 Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32); 2398 // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) 2399 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi); 2400 } else { 2401 Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32); 2402 // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x)) 2403 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo); 2404 } 2405 2406 if (!ZeroUndef) { 2407 // Test if the full 64-bit input is zero. 2408 2409 // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32, 2410 // which we probably don't want. 2411 SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi; 2412 SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ); 2413 SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0); 2414 2415 // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction 2416 // with the same cycles, otherwise it is slower. 2417 // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src, 2418 // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ); 2419 2420 const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32); 2421 2422 // The instruction returns -1 for 0 input, but the defined intrinsic 2423 // behavior is to return the number of bits. 2424 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, 2425 SrcIsZero, Bits32, NewOpr); 2426 } 2427 2428 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr); 2429 } 2430 2431 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, 2432 bool Signed) const { 2433 // Unsigned 2434 // cul2f(ulong u) 2435 //{ 2436 // uint lz = clz(u); 2437 // uint e = (u != 0) ? 127U + 63U - lz : 0; 2438 // u = (u << lz) & 0x7fffffffffffffffUL; 2439 // ulong t = u & 0xffffffffffUL; 2440 // uint v = (e << 23) | (uint)(u >> 40); 2441 // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U); 2442 // return as_float(v + r); 2443 //} 2444 // Signed 2445 // cl2f(long l) 2446 //{ 2447 // long s = l >> 63; 2448 // float r = cul2f((l + s) ^ s); 2449 // return s ? 
-r : r; 2450 //} 2451 2452 SDLoc SL(Op); 2453 SDValue Src = Op.getOperand(0); 2454 SDValue L = Src; 2455 2456 SDValue S; 2457 if (Signed) { 2458 const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64); 2459 S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit); 2460 2461 SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S); 2462 L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S); 2463 } 2464 2465 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), 2466 *DAG.getContext(), MVT::f32); 2467 2468 2469 SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32); 2470 SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64); 2471 SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L); 2472 LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ); 2473 2474 SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32); 2475 SDValue E = DAG.getSelect(SL, MVT::i32, 2476 DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE), 2477 DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ), 2478 ZeroI32); 2479 2480 SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64, 2481 DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ), 2482 DAG.getConstant((-1ULL) >> 1, SL, MVT::i64)); 2483 2484 SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U, 2485 DAG.getConstant(0xffffffffffULL, SL, MVT::i64)); 2486 2487 SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64, 2488 U, DAG.getConstant(40, SL, MVT::i64)); 2489 2490 SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32, 2491 DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)), 2492 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl)); 2493 2494 SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64); 2495 SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT); 2496 SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ); 2497 2498 SDValue One = DAG.getConstant(1, SL, MVT::i32); 2499 2500 SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One); 2501 2502 SDValue R = DAG.getSelect(SL, MVT::i32, 2503 RCmp, 2504 One, 2505 DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32)); 2506 R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R); 2507 R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R); 2508 2509 if (!Signed) 2510 return R; 2511 2512 SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R); 2513 return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R); 2514 } 2515 2516 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, 2517 bool Signed) const { 2518 SDLoc SL(Op); 2519 SDValue Src = Op.getOperand(0); 2520 2521 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); 2522 2523 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, 2524 DAG.getConstant(0, SL, MVT::i32)); 2525 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, 2526 DAG.getConstant(1, SL, MVT::i32)); 2527 2528 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, 2529 SL, MVT::f64, Hi); 2530 2531 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); 2532 2533 SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, 2534 DAG.getConstant(32, SL, MVT::i32)); 2535 // TODO: Should this propagate fast-math-flags? 2536 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); 2537 } 2538 2539 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, 2540 SelectionDAG &DAG) const { 2541 // TODO: Factor out code common with LowerSINT_TO_FP. 
2542 EVT DestVT = Op.getValueType(); 2543 SDValue Src = Op.getOperand(0); 2544 EVT SrcVT = Src.getValueType(); 2545 2546 if (SrcVT == MVT::i16) { 2547 if (DestVT == MVT::f16) 2548 return Op; 2549 SDLoc DL(Op); 2550 2551 // Promote src to i32 2552 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src); 2553 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext); 2554 } 2555 2556 assert(SrcVT == MVT::i64 && "operation should be legal"); 2557 2558 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { 2559 SDLoc DL(Op); 2560 2561 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); 2562 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op)); 2563 SDValue FPRound = 2564 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); 2565 2566 return FPRound; 2567 } 2568 2569 if (DestVT == MVT::f32) 2570 return LowerINT_TO_FP32(Op, DAG, false); 2571 2572 assert(DestVT == MVT::f64); 2573 return LowerINT_TO_FP64(Op, DAG, false); 2574 } 2575 2576 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, 2577 SelectionDAG &DAG) const { 2578 EVT DestVT = Op.getValueType(); 2579 2580 SDValue Src = Op.getOperand(0); 2581 EVT SrcVT = Src.getValueType(); 2582 2583 if (SrcVT == MVT::i16) { 2584 if (DestVT == MVT::f16) 2585 return Op; 2586 2587 SDLoc DL(Op); 2588 // Promote src to i32 2589 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src); 2590 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext); 2591 } 2592 2593 assert(SrcVT == MVT::i64 && "operation should be legal"); 2594 2595 // TODO: Factor out code common with LowerUINT_TO_FP. 2596 2597 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { 2598 SDLoc DL(Op); 2599 SDValue Src = Op.getOperand(0); 2600 2601 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); 2602 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op)); 2603 SDValue FPRound = 2604 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); 2605 2606 return FPRound; 2607 } 2608 2609 if (DestVT == MVT::f32) 2610 return LowerINT_TO_FP32(Op, DAG, true); 2611 2612 assert(DestVT == MVT::f64); 2613 return LowerINT_TO_FP64(Op, DAG, true); 2614 } 2615 2616 SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, 2617 bool Signed) const { 2618 SDLoc SL(Op); 2619 2620 SDValue Src = Op.getOperand(0); 2621 2622 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 2623 2624 SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL, 2625 MVT::f64); 2626 SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, 2627 MVT::f64); 2628 // TODO: Should this propagate fast-math-flags? 2629 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); 2630 2631 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); 2632 2633 2634 SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc); 2635 2636 SDValue Hi = DAG.getNode(Signed ? 
ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL, 2637 MVT::i32, FloorMul); 2638 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); 2639 2640 SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}); 2641 2642 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); 2643 } 2644 2645 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const { 2646 SDLoc DL(Op); 2647 SDValue N0 = Op.getOperand(0); 2648 2649 // Convert to target node to get known bits 2650 if (N0.getValueType() == MVT::f32) 2651 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0); 2652 2653 if (getTargetMachine().Options.UnsafeFPMath) { 2654 // There is a generic expand for FP_TO_FP16 with unsafe fast math. 2655 return SDValue(); 2656 } 2657 2658 assert(N0.getSimpleValueType() == MVT::f64); 2659 2660 // f64 -> f16 conversion using round-to-nearest-even rounding mode. 2661 const unsigned ExpMask = 0x7ff; 2662 const unsigned ExpBiasf64 = 1023; 2663 const unsigned ExpBiasf16 = 15; 2664 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 2665 SDValue One = DAG.getConstant(1, DL, MVT::i32); 2666 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0); 2667 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U, 2668 DAG.getConstant(32, DL, MVT::i64)); 2669 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32); 2670 U = DAG.getZExtOrTrunc(U, DL, MVT::i32); 2671 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 2672 DAG.getConstant(20, DL, MVT::i64)); 2673 E = DAG.getNode(ISD::AND, DL, MVT::i32, E, 2674 DAG.getConstant(ExpMask, DL, MVT::i32)); 2675 // Subtract the fp64 exponent bias (1023) to get the real exponent and 2676 // add the f16 bias (15) to get the biased exponent for the f16 format. 2677 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E, 2678 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32)); 2679 2680 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 2681 DAG.getConstant(8, DL, MVT::i32)); 2682 M = DAG.getNode(ISD::AND, DL, MVT::i32, M, 2683 DAG.getConstant(0xffe, DL, MVT::i32)); 2684 2685 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH, 2686 DAG.getConstant(0x1ff, DL, MVT::i32)); 2687 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U); 2688 2689 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ); 2690 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set); 2691 2692 // (M != 0 ? 
0x0200 : 0) | 0x7c00; 2693 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32, 2694 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32), 2695 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32)); 2696 2697 // N = M | (E << 12); 2698 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M, 2699 DAG.getNode(ISD::SHL, DL, MVT::i32, E, 2700 DAG.getConstant(12, DL, MVT::i32))); 2701 2702 // B = clamp(1-E, 0, 13); 2703 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32, 2704 One, E); 2705 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero); 2706 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B, 2707 DAG.getConstant(13, DL, MVT::i32)); 2708 2709 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M, 2710 DAG.getConstant(0x1000, DL, MVT::i32)); 2711 2712 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B); 2713 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B); 2714 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE); 2715 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1); 2716 2717 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT); 2718 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V, 2719 DAG.getConstant(0x7, DL, MVT::i32)); 2720 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V, 2721 DAG.getConstant(2, DL, MVT::i32)); 2722 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32), 2723 One, Zero, ISD::SETEQ); 2724 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32), 2725 One, Zero, ISD::SETGT); 2726 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1); 2727 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1); 2728 2729 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32), 2730 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT); 2731 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32), 2732 I, V, ISD::SETEQ); 2733 2734 // Extract the sign bit. 2735 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 2736 DAG.getConstant(16, DL, MVT::i32)); 2737 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign, 2738 DAG.getConstant(0x8000, DL, MVT::i32)); 2739 2740 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V); 2741 return DAG.getZExtOrTrunc(V, DL, Op.getValueType()); 2742 } 2743 2744 SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, 2745 SelectionDAG &DAG) const { 2746 SDValue Src = Op.getOperand(0); 2747 2748 // TODO: Factor out code common with LowerFP_TO_UINT. 2749 2750 EVT SrcVT = Src.getValueType(); 2751 if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) { 2752 SDLoc DL(Op); 2753 2754 SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); 2755 SDValue FpToInt32 = 2756 DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend); 2757 2758 return FpToInt32; 2759 } 2760 2761 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) 2762 return LowerFP64_TO_INT(Op, DAG, true); 2763 2764 return SDValue(); 2765 } 2766 2767 SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, 2768 SelectionDAG &DAG) const { 2769 SDValue Src = Op.getOperand(0); 2770 2771 // TODO: Factor out code common with LowerFP_TO_SINT. 
2772 2773 EVT SrcVT = Src.getValueType(); 2774 if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) { 2775 SDLoc DL(Op); 2776 2777 SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); 2778 SDValue FpToInt32 = 2779 DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend); 2780 2781 return FpToInt32; 2782 } 2783 2784 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) 2785 return LowerFP64_TO_INT(Op, DAG, false); 2786 2787 return SDValue(); 2788 } 2789 2790 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 2791 SelectionDAG &DAG) const { 2792 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 2793 MVT VT = Op.getSimpleValueType(); 2794 MVT ScalarVT = VT.getScalarType(); 2795 2796 assert(VT.isVector()); 2797 2798 SDValue Src = Op.getOperand(0); 2799 SDLoc DL(Op); 2800 2801 // TODO: Don't scalarize on Evergreen? 2802 unsigned NElts = VT.getVectorNumElements(); 2803 SmallVector<SDValue, 8> Args; 2804 DAG.ExtractVectorElements(Src, Args, 0, NElts); 2805 2806 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); 2807 for (unsigned I = 0; I < NElts; ++I) 2808 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); 2809 2810 return DAG.getBuildVector(VT, DL, Args); 2811 } 2812 2813 //===----------------------------------------------------------------------===// 2814 // Custom DAG optimizations 2815 //===----------------------------------------------------------------------===// 2816 2817 static bool isU24(SDValue Op, SelectionDAG &DAG) { 2818 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24; 2819 } 2820 2821 static bool isI24(SDValue Op, SelectionDAG &DAG) { 2822 EVT VT = Op.getValueType(); 2823 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated 2824 // as unsigned 24-bit values. 2825 AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24; 2826 } 2827 2828 static SDValue simplifyI24(SDNode *Node24, 2829 TargetLowering::DAGCombinerInfo &DCI) { 2830 SelectionDAG &DAG = DCI.DAG; 2831 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 2832 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN; 2833 2834 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0); 2835 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1); 2836 unsigned NewOpcode = Node24->getOpcode(); 2837 if (IsIntrin) { 2838 unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue(); 2839 NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ? 2840 AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; 2841 } 2842 2843 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24); 2844 2845 // First try to simplify using SimplifyMultipleUseDemandedBits which allows 2846 // the operands to have other uses, but will only perform simplifications that 2847 // involve bypassing some nodes for this user. 2848 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG); 2849 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG); 2850 if (DemandedLHS || DemandedRHS) 2851 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(), 2852 DemandedLHS ? DemandedLHS : LHS, 2853 DemandedRHS ? DemandedRHS : RHS); 2854 2855 // Now try SimplifyDemandedBits which can simplify the nodes used by our 2856 // operands if this node is the only user. 
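// When SimplifyDemandedBits succeeds it has already rewritten the operand
// through DCI, so returning SDValue(Node24, 0) simply reports the node as
// changed.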
2857 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI)) 2858 return SDValue(Node24, 0); 2859 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI)) 2860 return SDValue(Node24, 0); 2861 2862 return SDValue(); 2863 } 2864 2865 template <typename IntTy> 2866 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, 2867 uint32_t Width, const SDLoc &DL) { 2868 if (Width + Offset < 32) { 2869 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); 2870 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); 2871 return DAG.getConstant(Result, DL, MVT::i32); 2872 } 2873 2874 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); 2875 } 2876 2877 static bool hasVolatileUser(SDNode *Val) { 2878 for (SDNode *U : Val->uses()) { 2879 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) { 2880 if (M->isVolatile()) 2881 return true; 2882 } 2883 } 2884 2885 return false; 2886 } 2887 2888 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const { 2889 // i32 vectors are the canonical memory type. 2890 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT)) 2891 return false; 2892 2893 if (!VT.isByteSized()) 2894 return false; 2895 2896 unsigned Size = VT.getStoreSize(); 2897 2898 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector()) 2899 return false; 2900 2901 if (Size == 3 || (Size > 4 && (Size % 4 != 0))) 2902 return false; 2903 2904 return true; 2905 } 2906 2907 // Replace load of an illegal type with a store of a bitcast to a friendlier 2908 // type. 2909 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, 2910 DAGCombinerInfo &DCI) const { 2911 if (!DCI.isBeforeLegalize()) 2912 return SDValue(); 2913 2914 LoadSDNode *LN = cast<LoadSDNode>(N); 2915 if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN)) 2916 return SDValue(); 2917 2918 SDLoc SL(N); 2919 SelectionDAG &DAG = DCI.DAG; 2920 EVT VT = LN->getMemoryVT(); 2921 2922 unsigned Size = VT.getStoreSize(); 2923 unsigned Align = LN->getAlignment(); 2924 if (Align < Size && isTypeLegal(VT)) { 2925 bool IsFast; 2926 unsigned AS = LN->getAddressSpace(); 2927 2928 // Expand unaligned loads earlier than legalization. Due to visitation order 2929 // problems during legalization, the emitted instructions to pack and unpack 2930 // the bytes again are not eliminated in the case of an unaligned copy. 2931 if (!allowsMisalignedMemoryAccesses( 2932 VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) { 2933 SDValue Ops[2]; 2934 2935 if (VT.isVector()) 2936 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG); 2937 else 2938 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); 2939 2940 return DAG.getMergeValues(Ops, SDLoc(N)); 2941 } 2942 2943 if (!IsFast) 2944 return SDValue(); 2945 } 2946 2947 if (!shouldCombineMemoryType(VT)) 2948 return SDValue(); 2949 2950 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); 2951 2952 SDValue NewLoad 2953 = DAG.getLoad(NewVT, SL, LN->getChain(), 2954 LN->getBasePtr(), LN->getMemOperand()); 2955 2956 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad); 2957 DCI.CombineTo(N, BC, NewLoad.getValue(1)); 2958 return SDValue(N, 0); 2959 } 2960 2961 // Replace store of an illegal type with a store of a bitcast to a friendlier 2962 // type. 
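// E.g. a v8i8 store can be rewritten as a store of the value bitcast to
// v2i32, matching the canonical i32-based memory types chosen by
// getEquivalentMemType().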
2963 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, 2964 DAGCombinerInfo &DCI) const { 2965 if (!DCI.isBeforeLegalize()) 2966 return SDValue(); 2967 2968 StoreSDNode *SN = cast<StoreSDNode>(N); 2969 if (SN->isVolatile() || !ISD::isNormalStore(SN)) 2970 return SDValue(); 2971 2972 EVT VT = SN->getMemoryVT(); 2973 unsigned Size = VT.getStoreSize(); 2974 2975 SDLoc SL(N); 2976 SelectionDAG &DAG = DCI.DAG; 2977 unsigned Align = SN->getAlignment(); 2978 if (Align < Size && isTypeLegal(VT)) { 2979 bool IsFast; 2980 unsigned AS = SN->getAddressSpace(); 2981 2982 // Expand unaligned stores earlier than legalization. Due to visitation 2983 // order problems during legalization, the emitted instructions to pack and 2984 // unpack the bytes again are not eliminated in the case of an unaligned 2985 // copy. 2986 if (!allowsMisalignedMemoryAccesses( 2987 VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) { 2988 if (VT.isVector()) 2989 return scalarizeVectorStore(SN, DAG); 2990 2991 return expandUnalignedStore(SN, DAG); 2992 } 2993 2994 if (!IsFast) 2995 return SDValue(); 2996 } 2997 2998 if (!shouldCombineMemoryType(VT)) 2999 return SDValue(); 3000 3001 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); 3002 SDValue Val = SN->getValue(); 3003 3004 //DCI.AddToWorklist(Val.getNode()); 3005 3006 bool OtherUses = !Val.hasOneUse(); 3007 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val); 3008 if (OtherUses) { 3009 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal); 3010 DAG.ReplaceAllUsesOfValueWith(Val, CastBack); 3011 } 3012 3013 return DAG.getStore(SN->getChain(), SL, CastVal, 3014 SN->getBasePtr(), SN->getMemOperand()); 3015 } 3016 3017 // FIXME: This should go in generic DAG combiner with an isTruncateFree check, 3018 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU 3019 // issues. 3020 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, 3021 DAGCombinerInfo &DCI) const { 3022 SelectionDAG &DAG = DCI.DAG; 3023 SDValue N0 = N->getOperand(0); 3024 3025 // (vt2 (assertzext (truncate vt0:x), vt1)) -> 3026 // (vt2 (truncate (assertzext vt0:x, vt1))) 3027 if (N0.getOpcode() == ISD::TRUNCATE) { 3028 SDValue N1 = N->getOperand(1); 3029 EVT ExtVT = cast<VTSDNode>(N1)->getVT(); 3030 SDLoc SL(N); 3031 3032 SDValue Src = N0.getOperand(0); 3033 EVT SrcVT = Src.getValueType(); 3034 if (SrcVT.bitsGE(ExtVT)) { 3035 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1); 3036 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg); 3037 } 3038 } 3039 3040 return SDValue(); 3041 } 3042 3043 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( 3044 SDNode *N, DAGCombinerInfo &DCI) const { 3045 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 3046 switch (IID) { 3047 case Intrinsic::amdgcn_mul_i24: 3048 case Intrinsic::amdgcn_mul_u24: 3049 return simplifyI24(N, DCI); 3050 case Intrinsic::amdgcn_fract: 3051 case Intrinsic::amdgcn_rsq: 3052 case Intrinsic::amdgcn_rcp_legacy: 3053 case Intrinsic::amdgcn_rsq_legacy: 3054 case Intrinsic::amdgcn_rsq_clamp: 3055 case Intrinsic::amdgcn_ldexp: { 3056 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted 3057 SDValue Src = N->getOperand(1); 3058 return Src.isUndef() ? Src : SDValue(); 3059 } 3060 default: 3061 return SDValue(); 3062 } 3063 } 3064 3065 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the 3066 /// binary operation \p Opc to it with the corresponding constant operands. 
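/// For example, an i64 AND against 0x00000000ffffffff becomes an AND of the
/// low half with -1 and of the high half with 0; both halves are revisited so
/// trivial results can fold away.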
3067 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( 3068 DAGCombinerInfo &DCI, const SDLoc &SL, 3069 unsigned Opc, SDValue LHS, 3070 uint32_t ValLo, uint32_t ValHi) const { 3071 SelectionDAG &DAG = DCI.DAG; 3072 SDValue Lo, Hi; 3073 std::tie(Lo, Hi) = split64BitValue(LHS, DAG); 3074 3075 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32); 3076 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32); 3077 3078 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS); 3079 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS); 3080 3081 // Re-visit the ands. It's possible we eliminated one of them and it could 3082 // simplify the vector. 3083 DCI.AddToWorklist(Lo.getNode()); 3084 DCI.AddToWorklist(Hi.getNode()); 3085 3086 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd}); 3087 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 3088 } 3089 3090 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, 3091 DAGCombinerInfo &DCI) const { 3092 EVT VT = N->getValueType(0); 3093 3094 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 3095 if (!RHS) 3096 return SDValue(); 3097 3098 SDValue LHS = N->getOperand(0); 3099 unsigned RHSVal = RHS->getZExtValue(); 3100 if (!RHSVal) 3101 return LHS; 3102 3103 SDLoc SL(N); 3104 SelectionDAG &DAG = DCI.DAG; 3105 3106 switch (LHS->getOpcode()) { 3107 default: 3108 break; 3109 case ISD::ZERO_EXTEND: 3110 case ISD::SIGN_EXTEND: 3111 case ISD::ANY_EXTEND: { 3112 SDValue X = LHS->getOperand(0); 3113 3114 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 && 3115 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) { 3116 // Prefer build_vector as the canonical form if packed types are legal. 3117 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x 3118 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL, 3119 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) }); 3120 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); 3121 } 3122 3123 // shl (ext x) => zext (shl x), if shift does not overflow int 3124 if (VT != MVT::i64) 3125 break; 3126 KnownBits Known = DAG.computeKnownBits(X); 3127 unsigned LZ = Known.countMinLeadingZeros(); 3128 if (LZ < RHSVal) 3129 break; 3130 EVT XVT = X.getValueType(); 3131 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0)); 3132 return DAG.getZExtOrTrunc(Shl, SL, VT); 3133 } 3134 } 3135 3136 if (VT != MVT::i64) 3137 return SDValue(); 3138 3139 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32)) 3140 3141 // On some subtargets, 64-bit shift is a quarter rate instruction. In the 3142 // common case, splitting this into a move and a 32-bit shift is faster and 3143 // the same code size. 
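  // For illustration: (shl i64:x, 40) becomes
  //   (i64 (bitcast (build_vector (i32 0), (shl (i32 (trunc x)), 8))))
  // so the only real work left is a single 32-bit shift.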
3144 if (RHSVal < 32) 3145 return SDValue(); 3146 3147 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32); 3148 3149 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); 3150 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt); 3151 3152 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 3153 3154 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift}); 3155 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 3156 } 3157 3158 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, 3159 DAGCombinerInfo &DCI) const { 3160 if (N->getValueType(0) != MVT::i64) 3161 return SDValue(); 3162 3163 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 3164 if (!RHS) 3165 return SDValue(); 3166 3167 SelectionDAG &DAG = DCI.DAG; 3168 SDLoc SL(N); 3169 unsigned RHSVal = RHS->getZExtValue(); 3170 3171 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31) 3172 if (RHSVal == 32) { 3173 SDValue Hi = getHiHalf64(N->getOperand(0), DAG); 3174 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi, 3175 DAG.getConstant(31, SL, MVT::i32)); 3176 3177 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift}); 3178 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec); 3179 } 3180 3181 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31) 3182 if (RHSVal == 63) { 3183 SDValue Hi = getHiHalf64(N->getOperand(0), DAG); 3184 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi, 3185 DAG.getConstant(31, SL, MVT::i32)); 3186 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift}); 3187 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec); 3188 } 3189 3190 return SDValue(); 3191 } 3192 3193 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, 3194 DAGCombinerInfo &DCI) const { 3195 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 3196 if (!RHS) 3197 return SDValue(); 3198 3199 EVT VT = N->getValueType(0); 3200 SDValue LHS = N->getOperand(0); 3201 unsigned ShiftAmt = RHS->getZExtValue(); 3202 SelectionDAG &DAG = DCI.DAG; 3203 SDLoc SL(N); 3204 3205 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1) 3206 // this improves the ability to match BFE patterns in isel. 
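  // A concrete instance of the fold above: (srl (and x, 0xff00), 8)
  // -> (and (srl x, 8), 0xff), which isel can then match as a BFE of
  // width 8 at offset 8.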
3207 if (LHS.getOpcode() == ISD::AND) { 3208 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) { 3209 if (Mask->getAPIntValue().isShiftedMask() && 3210 Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) { 3211 return DAG.getNode( 3212 ISD::AND, SL, VT, 3213 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)), 3214 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1))); 3215 } 3216 } 3217 } 3218 3219 if (VT != MVT::i64) 3220 return SDValue(); 3221 3222 if (ShiftAmt < 32) 3223 return SDValue(); 3224 3225 // srl i64:x, C for C >= 32 3226 // => 3227 // build_pair (srl hi_32(x), C - 32), 0 3228 SDValue One = DAG.getConstant(1, SL, MVT::i32); 3229 SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 3230 3231 SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS); 3232 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One); 3233 3234 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32); 3235 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst); 3236 3237 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero}); 3238 3239 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair); 3240 } 3241 3242 SDValue AMDGPUTargetLowering::performTruncateCombine( 3243 SDNode *N, DAGCombinerInfo &DCI) const { 3244 SDLoc SL(N); 3245 SelectionDAG &DAG = DCI.DAG; 3246 EVT VT = N->getValueType(0); 3247 SDValue Src = N->getOperand(0); 3248 3249 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x) 3250 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) { 3251 SDValue Vec = Src.getOperand(0); 3252 if (Vec.getOpcode() == ISD::BUILD_VECTOR) { 3253 SDValue Elt0 = Vec.getOperand(0); 3254 EVT EltVT = Elt0.getValueType(); 3255 if (VT.getSizeInBits() <= EltVT.getSizeInBits()) { 3256 if (EltVT.isFloatingPoint()) { 3257 Elt0 = DAG.getNode(ISD::BITCAST, SL, 3258 EltVT.changeTypeToInteger(), Elt0); 3259 } 3260 3261 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0); 3262 } 3263 } 3264 } 3265 3266 // Equivalent of above for accessing the high element of a vector as an 3267 // integer operation. 3268 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y) 3269 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) { 3270 if (auto K = isConstOrConstSplat(Src.getOperand(1))) { 3271 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) { 3272 SDValue BV = stripBitcast(Src.getOperand(0)); 3273 if (BV.getOpcode() == ISD::BUILD_VECTOR && 3274 BV.getValueType().getVectorNumElements() == 2) { 3275 SDValue SrcElt = BV.getOperand(1); 3276 EVT SrcEltVT = SrcElt.getValueType(); 3277 if (SrcEltVT.isFloatingPoint()) { 3278 SrcElt = DAG.getNode(ISD::BITCAST, SL, 3279 SrcEltVT.changeTypeToInteger(), SrcElt); 3280 } 3281 3282 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt); 3283 } 3284 } 3285 } 3286 } 3287 3288 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit. 
3289 // 3290 // i16 (trunc (srl i64:x, K)), K <= 16 -> 3291 // i16 (trunc (srl (i32 (trunc x), K))) 3292 if (VT.getScalarSizeInBits() < 32) { 3293 EVT SrcVT = Src.getValueType(); 3294 if (SrcVT.getScalarSizeInBits() > 32 && 3295 (Src.getOpcode() == ISD::SRL || 3296 Src.getOpcode() == ISD::SRA || 3297 Src.getOpcode() == ISD::SHL)) { 3298 SDValue Amt = Src.getOperand(1); 3299 KnownBits Known = DAG.computeKnownBits(Amt); 3300 unsigned Size = VT.getScalarSizeInBits(); 3301 if ((Known.isConstant() && Known.getConstant().ule(Size)) || 3302 (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) { 3303 EVT MidVT = VT.isVector() ? 3304 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 3305 VT.getVectorNumElements()) : MVT::i32; 3306 3307 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout()); 3308 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT, 3309 Src.getOperand(0)); 3310 DCI.AddToWorklist(Trunc.getNode()); 3311 3312 if (Amt.getValueType() != NewShiftVT) { 3313 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT); 3314 DCI.AddToWorklist(Amt.getNode()); 3315 } 3316 3317 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT, 3318 Trunc, Amt); 3319 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift); 3320 } 3321 } 3322 } 3323 3324 return SDValue(); 3325 } 3326 3327 // We need to specifically handle i64 mul here to avoid unnecessary conversion 3328 // instructions. If we only match on the legalized i64 mul expansion, 3329 // SimplifyDemandedBits will be unable to remove them because there will be 3330 // multiple uses due to the separate mul + mulh[su]. 3331 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, 3332 SDValue N0, SDValue N1, unsigned Size, bool Signed) { 3333 if (Size <= 32) { 3334 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; 3335 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1); 3336 } 3337 3338 // Because we want to eliminate extension instructions before the 3339 // operation, we need to create a single user here (i.e. not the separate 3340 // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it. 3341 3342 unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24; 3343 3344 SDValue Mul = DAG.getNode(MulOpc, SL, 3345 DAG.getVTList(MVT::i32, MVT::i32), N0, N1); 3346 3347 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, 3348 Mul.getValue(0), Mul.getValue(1)); 3349 } 3350 3351 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, 3352 DAGCombinerInfo &DCI) const { 3353 EVT VT = N->getValueType(0); 3354 3355 unsigned Size = VT.getSizeInBits(); 3356 if (VT.isVector() || Size > 64) 3357 return SDValue(); 3358 3359 // There are i16 integer mul/mad. 3360 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16)) 3361 return SDValue(); 3362 3363 SelectionDAG &DAG = DCI.DAG; 3364 SDLoc DL(N); 3365 3366 SDValue N0 = N->getOperand(0); 3367 SDValue N1 = N->getOperand(1); 3368 3369 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends 3370 // in the source into any_extends if the result of the mul is truncated. Since 3371 // we can assume the high bits are whatever we want, use the underlying value 3372 // to avoid the unknown high bits from interfering. 
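  // Illustration: for (mul (any_extend i32:a), (any_extend i32:b)) the
  // extends are stripped here, and if a and b provably fit in 24 bits the
  // multiply is emitted as MUL_U24/MUL_I24 (or the MUL_LOHI_* form for
  // 64-bit results) directly on the narrow operands.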
3373 if (N0.getOpcode() == ISD::ANY_EXTEND) 3374 N0 = N0.getOperand(0); 3375 3376 if (N1.getOpcode() == ISD::ANY_EXTEND) 3377 N1 = N1.getOperand(0); 3378 3379 SDValue Mul; 3380 3381 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { 3382 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 3383 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 3384 Mul = getMul24(DAG, DL, N0, N1, Size, false); 3385 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { 3386 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 3387 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 3388 Mul = getMul24(DAG, DL, N0, N1, Size, true); 3389 } else { 3390 return SDValue(); 3391 } 3392 3393 // We need to use sext even for MUL_U24, because MUL_U24 is used 3394 // for signed multiply of 8 and 16-bit types. 3395 return DAG.getSExtOrTrunc(Mul, DL, VT); 3396 } 3397 3398 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N, 3399 DAGCombinerInfo &DCI) const { 3400 EVT VT = N->getValueType(0); 3401 3402 if (!Subtarget->hasMulI24() || VT.isVector()) 3403 return SDValue(); 3404 3405 SelectionDAG &DAG = DCI.DAG; 3406 SDLoc DL(N); 3407 3408 SDValue N0 = N->getOperand(0); 3409 SDValue N1 = N->getOperand(1); 3410 3411 if (!isI24(N0, DAG) || !isI24(N1, DAG)) 3412 return SDValue(); 3413 3414 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 3415 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 3416 3417 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1); 3418 DCI.AddToWorklist(Mulhi.getNode()); 3419 return DAG.getSExtOrTrunc(Mulhi, DL, VT); 3420 } 3421 3422 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, 3423 DAGCombinerInfo &DCI) const { 3424 EVT VT = N->getValueType(0); 3425 3426 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32) 3427 return SDValue(); 3428 3429 SelectionDAG &DAG = DCI.DAG; 3430 SDLoc DL(N); 3431 3432 SDValue N0 = N->getOperand(0); 3433 SDValue N1 = N->getOperand(1); 3434 3435 if (!isU24(N0, DAG) || !isU24(N1, DAG)) 3436 return SDValue(); 3437 3438 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 3439 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 3440 3441 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1); 3442 DCI.AddToWorklist(Mulhi.getNode()); 3443 return DAG.getZExtOrTrunc(Mulhi, DL, VT); 3444 } 3445 3446 SDValue AMDGPUTargetLowering::performMulLoHi24Combine( 3447 SDNode *N, DAGCombinerInfo &DCI) const { 3448 SelectionDAG &DAG = DCI.DAG; 3449 3450 // Simplify demanded bits before splitting into multiple users. 3451 if (SDValue V = simplifyI24(N, DCI)) 3452 return V; 3453 3454 SDValue N0 = N->getOperand(0); 3455 SDValue N1 = N->getOperand(1); 3456 3457 bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24); 3458 3459 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; 3460 unsigned MulHiOpc = Signed ? 
AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24; 3461 3462 SDLoc SL(N); 3463 3464 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1); 3465 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1); 3466 return DAG.getMergeValues({ MulLo, MulHi }, SL); 3467 } 3468 3469 static bool isNegativeOne(SDValue Val) { 3470 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) 3471 return C->isAllOnesValue(); 3472 return false; 3473 } 3474 3475 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG, 3476 SDValue Op, 3477 const SDLoc &DL, 3478 unsigned Opc) const { 3479 EVT VT = Op.getValueType(); 3480 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT); 3481 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() && 3482 LegalVT != MVT::i16)) 3483 return SDValue(); 3484 3485 if (VT != MVT::i32) 3486 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); 3487 3488 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op); 3489 if (VT != MVT::i32) 3490 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX); 3491 3492 return FFBX; 3493 } 3494 3495 // The native instructions return -1 on 0 input. Optimize out a select that 3496 // produces -1 on 0. 3497 // 3498 // TODO: If zero is not undef, we could also do this if the output is compared 3499 // against the bitwidth. 3500 // 3501 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly. 3502 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, 3503 SDValue LHS, SDValue RHS, 3504 DAGCombinerInfo &DCI) const { 3505 ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 3506 if (!CmpRhs || !CmpRhs->isNullValue()) 3507 return SDValue(); 3508 3509 SelectionDAG &DAG = DCI.DAG; 3510 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 3511 SDValue CmpLHS = Cond.getOperand(0); 3512 3513 unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : 3514 AMDGPUISD::FFBH_U32; 3515 3516 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x 3517 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x 3518 if (CCOpcode == ISD::SETEQ && 3519 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && 3520 RHS.getOperand(0) == CmpLHS && 3521 isNegativeOne(LHS)) { 3522 return getFFBX_U32(DAG, CmpLHS, SL, Opc); 3523 } 3524 3525 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x 3526 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x 3527 if (CCOpcode == ISD::SETNE && 3528 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && 3529 LHS.getOperand(0) == CmpLHS && 3530 isNegativeOne(RHS)) { 3531 return getFFBX_U32(DAG, CmpLHS, SL, Opc); 3532 } 3533 3534 return SDValue(); 3535 } 3536 3537 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, 3538 unsigned Op, 3539 const SDLoc &SL, 3540 SDValue Cond, 3541 SDValue N1, 3542 SDValue N2) { 3543 SelectionDAG &DAG = DCI.DAG; 3544 EVT VT = N1.getValueType(); 3545 3546 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, 3547 N1.getOperand(0), N2.getOperand(0)); 3548 DCI.AddToWorklist(NewSelect.getNode()); 3549 return DAG.getNode(Op, SL, VT, NewSelect); 3550 } 3551 3552 // Pull a free FP operation out of a select so it may fold into uses. 
3553 // 3554 // select c, (fneg x), (fneg y) -> fneg (select c, x, y) 3555 // select c, (fneg x), k -> fneg (select c, x, (fneg k)) 3556 // 3557 // select c, (fabs x), (fabs y) -> fabs (select c, x, y) 3558 // select c, (fabs x), +k -> fabs (select c, x, k) 3559 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, 3560 SDValue N) { 3561 SelectionDAG &DAG = DCI.DAG; 3562 SDValue Cond = N.getOperand(0); 3563 SDValue LHS = N.getOperand(1); 3564 SDValue RHS = N.getOperand(2); 3565 3566 EVT VT = N.getValueType(); 3567 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) || 3568 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) { 3569 return distributeOpThroughSelect(DCI, LHS.getOpcode(), 3570 SDLoc(N), Cond, LHS, RHS); 3571 } 3572 3573 bool Inv = false; 3574 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) { 3575 std::swap(LHS, RHS); 3576 Inv = true; 3577 } 3578 3579 // TODO: Support vector constants. 3580 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 3581 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) { 3582 SDLoc SL(N); 3583 // If one side is an fneg/fabs and the other is a constant, we can push the 3584 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative. 3585 SDValue NewLHS = LHS.getOperand(0); 3586 SDValue NewRHS = RHS; 3587 3588 // Careful: if the neg can be folded up, don't try to pull it back down. 3589 bool ShouldFoldNeg = true; 3590 3591 if (NewLHS.hasOneUse()) { 3592 unsigned Opc = NewLHS.getOpcode(); 3593 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc)) 3594 ShouldFoldNeg = false; 3595 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL) 3596 ShouldFoldNeg = false; 3597 } 3598 3599 if (ShouldFoldNeg) { 3600 if (LHS.getOpcode() == ISD::FNEG) 3601 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 3602 else if (CRHS->isNegative()) 3603 return SDValue(); 3604 3605 if (Inv) 3606 std::swap(NewLHS, NewRHS); 3607 3608 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, 3609 Cond, NewLHS, NewRHS); 3610 DCI.AddToWorklist(NewSelect.getNode()); 3611 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect); 3612 } 3613 } 3614 3615 return SDValue(); 3616 } 3617 3618 3619 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, 3620 DAGCombinerInfo &DCI) const { 3621 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) 3622 return Folded; 3623 3624 SDValue Cond = N->getOperand(0); 3625 if (Cond.getOpcode() != ISD::SETCC) 3626 return SDValue(); 3627 3628 EVT VT = N->getValueType(0); 3629 SDValue LHS = Cond.getOperand(0); 3630 SDValue RHS = Cond.getOperand(1); 3631 SDValue CC = Cond.getOperand(2); 3632 3633 SDValue True = N->getOperand(1); 3634 SDValue False = N->getOperand(2); 3635 3636 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses. 3637 SelectionDAG &DAG = DCI.DAG; 3638 if (DAG.isConstantValueOfAnyType(True) && 3639 !DAG.isConstantValueOfAnyType(False)) { 3640 // Swap cmp + select pair to move constant to false input. 3641 // This will allow using VOPC cndmasks more often. 
3642 // select (setcc x, y), k, x -> select (setccinv x, y), x, k 3643 3644 SDLoc SL(N); 3645 ISD::CondCode NewCC = 3646 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType()); 3647 3648 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC); 3649 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True); 3650 } 3651 3652 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) { 3653 SDValue MinMax 3654 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); 3655 // Revisit this node so we can catch min3/max3/med3 patterns. 3656 //DCI.AddToWorklist(MinMax.getNode()); 3657 return MinMax; 3658 } 3659 } 3660 3661 // There's no reason to not do this if the condition has other uses. 3662 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI); 3663 } 3664 3665 static bool isInv2Pi(const APFloat &APF) { 3666 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118)); 3667 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983)); 3668 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882)); 3669 3670 return APF.bitwiseIsEqual(KF16) || 3671 APF.bitwiseIsEqual(KF32) || 3672 APF.bitwiseIsEqual(KF64); 3673 } 3674 3675 // 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an 3676 // additional cost to negate them. 3677 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const { 3678 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) { 3679 if (C->isZero() && !C->isNegative()) 3680 return true; 3681 3682 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF())) 3683 return true; 3684 } 3685 3686 return false; 3687 } 3688 3689 static unsigned inverseMinMax(unsigned Opc) { 3690 switch (Opc) { 3691 case ISD::FMAXNUM: 3692 return ISD::FMINNUM; 3693 case ISD::FMINNUM: 3694 return ISD::FMAXNUM; 3695 case ISD::FMAXNUM_IEEE: 3696 return ISD::FMINNUM_IEEE; 3697 case ISD::FMINNUM_IEEE: 3698 return ISD::FMAXNUM_IEEE; 3699 case AMDGPUISD::FMAX_LEGACY: 3700 return AMDGPUISD::FMIN_LEGACY; 3701 case AMDGPUISD::FMIN_LEGACY: 3702 return AMDGPUISD::FMAX_LEGACY; 3703 default: 3704 llvm_unreachable("invalid min/max opcode"); 3705 } 3706 } 3707 3708 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, 3709 DAGCombinerInfo &DCI) const { 3710 SelectionDAG &DAG = DCI.DAG; 3711 SDValue N0 = N->getOperand(0); 3712 EVT VT = N->getValueType(0); 3713 3714 unsigned Opc = N0.getOpcode(); 3715 3716 // If the input has multiple uses and we can either fold the negate down, or 3717 // the other uses cannot, give up. This both prevents unprofitable 3718 // transformations and infinite loops: we won't repeatedly try to fold around 3719 // a negate that has no 'good' form. 3720 if (N0.hasOneUse()) { 3721 // This may be able to fold into the source, but at a code size cost. Don't 3722 // fold if the fold into the user is free. 
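    // Illustrative case: if every user of this fneg can take it for free as
    // a VOP source modifier (e.g. folding into something like
    // v_mul_f32 -v0, v1), pushing the fneg into N0 would only add code, so
    // keep it where it is.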
3723 if (allUsesHaveSourceMods(N, 0)) 3724 return SDValue(); 3725 } else { 3726 if (fnegFoldsIntoOp(Opc) && 3727 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode()))) 3728 return SDValue(); 3729 } 3730 3731 SDLoc SL(N); 3732 switch (Opc) { 3733 case ISD::FADD: { 3734 if (!mayIgnoreSignedZero(N0)) 3735 return SDValue(); 3736 3737 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) 3738 SDValue LHS = N0.getOperand(0); 3739 SDValue RHS = N0.getOperand(1); 3740 3741 if (LHS.getOpcode() != ISD::FNEG) 3742 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); 3743 else 3744 LHS = LHS.getOperand(0); 3745 3746 if (RHS.getOpcode() != ISD::FNEG) 3747 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 3748 else 3749 RHS = RHS.getOperand(0); 3750 3751 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags()); 3752 if (Res.getOpcode() != ISD::FADD) 3753 return SDValue(); // Op got folded away. 3754 if (!N0.hasOneUse()) 3755 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 3756 return Res; 3757 } 3758 case ISD::FMUL: 3759 case AMDGPUISD::FMUL_LEGACY: { 3760 // (fneg (fmul x, y)) -> (fmul x, (fneg y)) 3761 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y)) 3762 SDValue LHS = N0.getOperand(0); 3763 SDValue RHS = N0.getOperand(1); 3764 3765 if (LHS.getOpcode() == ISD::FNEG) 3766 LHS = LHS.getOperand(0); 3767 else if (RHS.getOpcode() == ISD::FNEG) 3768 RHS = RHS.getOperand(0); 3769 else 3770 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 3771 3772 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags()); 3773 if (Res.getOpcode() != Opc) 3774 return SDValue(); // Op got folded away. 3775 if (!N0.hasOneUse()) 3776 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 3777 return Res; 3778 } 3779 case ISD::FMA: 3780 case ISD::FMAD: { 3781 if (!mayIgnoreSignedZero(N0)) 3782 return SDValue(); 3783 3784 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z)) 3785 SDValue LHS = N0.getOperand(0); 3786 SDValue MHS = N0.getOperand(1); 3787 SDValue RHS = N0.getOperand(2); 3788 3789 if (LHS.getOpcode() == ISD::FNEG) 3790 LHS = LHS.getOperand(0); 3791 else if (MHS.getOpcode() == ISD::FNEG) 3792 MHS = MHS.getOperand(0); 3793 else 3794 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS); 3795 3796 if (RHS.getOpcode() != ISD::FNEG) 3797 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 3798 else 3799 RHS = RHS.getOperand(0); 3800 3801 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); 3802 if (Res.getOpcode() != Opc) 3803 return SDValue(); // Op got folded away. 3804 if (!N0.hasOneUse()) 3805 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 3806 return Res; 3807 } 3808 case ISD::FMAXNUM: 3809 case ISD::FMINNUM: 3810 case ISD::FMAXNUM_IEEE: 3811 case ISD::FMINNUM_IEEE: 3812 case AMDGPUISD::FMAX_LEGACY: 3813 case AMDGPUISD::FMIN_LEGACY: { 3814 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y) 3815 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y) 3816 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y) 3817 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y) 3818 3819 SDValue LHS = N0.getOperand(0); 3820 SDValue RHS = N0.getOperand(1); 3821 3822 // 0 doesn't have a negated inline immediate. 3823 // TODO: This constant check should be generalized to other operations. 
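    // For example, fneg (fminnum x, 1.0) -> fmaxnum (fneg x), -1.0 stays
    // cheap because -1.0 is still an inline immediate, whereas negating +0.0
    // (or 1/(2*pi) on subtargets with that inline constant) would require a
    // literal, so those cases are rejected below.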
3824 if (isConstantCostlierToNegate(RHS)) 3825 return SDValue(); 3826 3827 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); 3828 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 3829 unsigned Opposite = inverseMinMax(Opc); 3830 3831 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags()); 3832 if (Res.getOpcode() != Opposite) 3833 return SDValue(); // Op got folded away. 3834 if (!N0.hasOneUse()) 3835 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 3836 return Res; 3837 } 3838 case AMDGPUISD::FMED3: { 3839 SDValue Ops[3]; 3840 for (unsigned I = 0; I < 3; ++I) 3841 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags()); 3842 3843 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags()); 3844 if (Res.getOpcode() != AMDGPUISD::FMED3) 3845 return SDValue(); // Op got folded away. 3846 if (!N0.hasOneUse()) 3847 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 3848 return Res; 3849 } 3850 case ISD::FP_EXTEND: 3851 case ISD::FTRUNC: 3852 case ISD::FRINT: 3853 case ISD::FNEARBYINT: // XXX - Should fround be handled? 3854 case ISD::FSIN: 3855 case ISD::FCANONICALIZE: 3856 case AMDGPUISD::RCP: 3857 case AMDGPUISD::RCP_LEGACY: 3858 case AMDGPUISD::RCP_IFLAG: 3859 case AMDGPUISD::SIN_HW: { 3860 SDValue CvtSrc = N0.getOperand(0); 3861 if (CvtSrc.getOpcode() == ISD::FNEG) { 3862 // (fneg (fp_extend (fneg x))) -> (fp_extend x) 3863 // (fneg (rcp (fneg x))) -> (rcp x) 3864 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0)); 3865 } 3866 3867 if (!N0.hasOneUse()) 3868 return SDValue(); 3869 3870 // (fneg (fp_extend x)) -> (fp_extend (fneg x)) 3871 // (fneg (rcp x)) -> (rcp (fneg x)) 3872 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); 3873 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags()); 3874 } 3875 case ISD::FP_ROUND: { 3876 SDValue CvtSrc = N0.getOperand(0); 3877 3878 if (CvtSrc.getOpcode() == ISD::FNEG) { 3879 // (fneg (fp_round (fneg x))) -> (fp_round x) 3880 return DAG.getNode(ISD::FP_ROUND, SL, VT, 3881 CvtSrc.getOperand(0), N0.getOperand(1)); 3882 } 3883 3884 if (!N0.hasOneUse()) 3885 return SDValue(); 3886 3887 // (fneg (fp_round x)) -> (fp_round (fneg x)) 3888 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); 3889 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1)); 3890 } 3891 case ISD::FP16_TO_FP: { 3892 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal 3893 // f16, but legalization of f16 fneg ends up pulling it out of the source. 3894 // Put the fneg back as a legal source operation that can be matched later. 
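    // The f16 payload lives in the low 16 bits of the integer operand, so
    // flipping bit 15 in the integer domain is an exact fneg and remains a
    // legal operation even when f16 itself is not.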
3895 SDLoc SL(N); 3896 3897 SDValue Src = N0.getOperand(0); 3898 EVT SrcVT = Src.getValueType(); 3899 3900 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000) 3901 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src, 3902 DAG.getConstant(0x8000, SL, SrcVT)); 3903 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg); 3904 } 3905 default: 3906 return SDValue(); 3907 } 3908 } 3909 3910 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N, 3911 DAGCombinerInfo &DCI) const { 3912 SelectionDAG &DAG = DCI.DAG; 3913 SDValue N0 = N->getOperand(0); 3914 3915 if (!N0.hasOneUse()) 3916 return SDValue(); 3917 3918 switch (N0.getOpcode()) { 3919 case ISD::FP16_TO_FP: { 3920 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal"); 3921 SDLoc SL(N); 3922 SDValue Src = N0.getOperand(0); 3923 EVT SrcVT = Src.getValueType(); 3924 3925 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff) 3926 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src, 3927 DAG.getConstant(0x7fff, SL, SrcVT)); 3928 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs); 3929 } 3930 default: 3931 return SDValue(); 3932 } 3933 } 3934 3935 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N, 3936 DAGCombinerInfo &DCI) const { 3937 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); 3938 if (!CFP) 3939 return SDValue(); 3940 3941 // XXX - Should this flush denormals? 3942 const APFloat &Val = CFP->getValueAPF(); 3943 APFloat One(Val.getSemantics(), "1.0"); 3944 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); 3945 } 3946 3947 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, 3948 DAGCombinerInfo &DCI) const { 3949 SelectionDAG &DAG = DCI.DAG; 3950 SDLoc DL(N); 3951 3952 switch(N->getOpcode()) { 3953 default: 3954 break; 3955 case ISD::BITCAST: { 3956 EVT DestVT = N->getValueType(0); 3957 3958 // Push casts through vector builds. This helps avoid emitting a large 3959 // number of copies when materializing floating point vector constants. 3960 // 3961 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) => 3962 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y)) 3963 if (DestVT.isVector()) { 3964 SDValue Src = N->getOperand(0); 3965 if (Src.getOpcode() == ISD::BUILD_VECTOR) { 3966 EVT SrcVT = Src.getValueType(); 3967 unsigned NElts = DestVT.getVectorNumElements(); 3968 3969 if (SrcVT.getVectorNumElements() == NElts) { 3970 EVT DestEltVT = DestVT.getVectorElementType(); 3971 3972 SmallVector<SDValue, 8> CastedElts; 3973 SDLoc SL(N); 3974 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) { 3975 SDValue Elt = Src.getOperand(I); 3976 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt)); 3977 } 3978 3979 return DAG.getBuildVector(DestVT, SL, CastedElts); 3980 } 3981 } 3982 } 3983 3984 if (DestVT.getSizeInBits() != 64 && !DestVT.isVector()) 3985 break; 3986 3987 // Fold bitcasts of constants. 
3988 // 3989 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k) 3990 // TODO: Generalize and move to DAGCombiner 3991 SDValue Src = N->getOperand(0); 3992 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) { 3993 if (Src.getValueType() == MVT::i64) { 3994 SDLoc SL(N); 3995 uint64_t CVal = C->getZExtValue(); 3996 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 3997 DAG.getConstant(Lo_32(CVal), SL, MVT::i32), 3998 DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); 3999 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV); 4000 } 4001 } 4002 4003 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) { 4004 const APInt &Val = C->getValueAPF().bitcastToAPInt(); 4005 SDLoc SL(N); 4006 uint64_t CVal = Val.getZExtValue(); 4007 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 4008 DAG.getConstant(Lo_32(CVal), SL, MVT::i32), 4009 DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); 4010 4011 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec); 4012 } 4013 4014 break; 4015 } 4016 case ISD::SHL: { 4017 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 4018 break; 4019 4020 return performShlCombine(N, DCI); 4021 } 4022 case ISD::SRL: { 4023 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 4024 break; 4025 4026 return performSrlCombine(N, DCI); 4027 } 4028 case ISD::SRA: { 4029 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 4030 break; 4031 4032 return performSraCombine(N, DCI); 4033 } 4034 case ISD::TRUNCATE: 4035 return performTruncateCombine(N, DCI); 4036 case ISD::MUL: 4037 return performMulCombine(N, DCI); 4038 case ISD::MULHS: 4039 return performMulhsCombine(N, DCI); 4040 case ISD::MULHU: 4041 return performMulhuCombine(N, DCI); 4042 case AMDGPUISD::MUL_I24: 4043 case AMDGPUISD::MUL_U24: 4044 case AMDGPUISD::MULHI_I24: 4045 case AMDGPUISD::MULHI_U24: { 4046 if (SDValue V = simplifyI24(N, DCI)) 4047 return V; 4048 return SDValue(); 4049 } 4050 case AMDGPUISD::MUL_LOHI_I24: 4051 case AMDGPUISD::MUL_LOHI_U24: 4052 return performMulLoHi24Combine(N, DCI); 4053 case ISD::SELECT: 4054 return performSelectCombine(N, DCI); 4055 case ISD::FNEG: 4056 return performFNegCombine(N, DCI); 4057 case ISD::FABS: 4058 return performFAbsCombine(N, DCI); 4059 case AMDGPUISD::BFE_I32: 4060 case AMDGPUISD::BFE_U32: { 4061 assert(!N->getValueType(0).isVector() && 4062 "Vector handling of BFE not implemented"); 4063 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); 4064 if (!Width) 4065 break; 4066 4067 uint32_t WidthVal = Width->getZExtValue() & 0x1f; 4068 if (WidthVal == 0) 4069 return DAG.getConstant(0, DL, MVT::i32); 4070 4071 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); 4072 if (!Offset) 4073 break; 4074 4075 SDValue BitsFrom = N->getOperand(0); 4076 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; 4077 4078 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; 4079 4080 if (OffsetVal == 0) { 4081 // This is already sign / zero extended, so try to fold away extra BFEs. 4082 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); 4083 4084 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); 4085 if (OpSignBits >= SignBits) 4086 return BitsFrom; 4087 4088 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); 4089 if (Signed) { 4090 // This is a sign_extend_inreg. Replace it to take advantage of existing 4091 // DAG Combines. If not eliminated, we will match back to BFE during 4092 // selection. 4093 4094 // TODO: The sext_inreg of extended types ends, although we can could 4095 // handle them in a single BFE. 
4096 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, 4097 DAG.getValueType(SmallVT)); 4098 } 4099 4100 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); 4101 } 4102 4103 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) { 4104 if (Signed) { 4105 return constantFoldBFE<int32_t>(DAG, 4106 CVal->getSExtValue(), 4107 OffsetVal, 4108 WidthVal, 4109 DL); 4110 } 4111 4112 return constantFoldBFE<uint32_t>(DAG, 4113 CVal->getZExtValue(), 4114 OffsetVal, 4115 WidthVal, 4116 DL); 4117 } 4118 4119 if ((OffsetVal + WidthVal) >= 32 && 4120 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) { 4121 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); 4122 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, 4123 BitsFrom, ShiftVal); 4124 } 4125 4126 if (BitsFrom.hasOneUse()) { 4127 APInt Demanded = APInt::getBitsSet(32, 4128 OffsetVal, 4129 OffsetVal + WidthVal); 4130 4131 KnownBits Known; 4132 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 4133 !DCI.isBeforeLegalizeOps()); 4134 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 4135 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) || 4136 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) { 4137 DCI.CommitTargetLoweringOpt(TLO); 4138 } 4139 } 4140 4141 break; 4142 } 4143 case ISD::LOAD: 4144 return performLoadCombine(N, DCI); 4145 case ISD::STORE: 4146 return performStoreCombine(N, DCI); 4147 case AMDGPUISD::RCP: 4148 case AMDGPUISD::RCP_IFLAG: 4149 return performRcpCombine(N, DCI); 4150 case ISD::AssertZext: 4151 case ISD::AssertSext: 4152 return performAssertSZExtCombine(N, DCI); 4153 case ISD::INTRINSIC_WO_CHAIN: 4154 return performIntrinsicWOChainCombine(N, DCI); 4155 } 4156 return SDValue(); 4157 } 4158 4159 //===----------------------------------------------------------------------===// 4160 // Helper functions 4161 //===----------------------------------------------------------------------===// 4162 4163 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 4164 const TargetRegisterClass *RC, 4165 Register Reg, EVT VT, 4166 const SDLoc &SL, 4167 bool RawReg) const { 4168 MachineFunction &MF = DAG.getMachineFunction(); 4169 MachineRegisterInfo &MRI = MF.getRegInfo(); 4170 Register VReg; 4171 4172 if (!MRI.isLiveIn(Reg)) { 4173 VReg = MRI.createVirtualRegister(RC); 4174 MRI.addLiveIn(Reg, VReg); 4175 } else { 4176 VReg = MRI.getLiveInVirtReg(Reg); 4177 } 4178 4179 if (RawReg) 4180 return DAG.getRegister(VReg, VT); 4181 4182 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); 4183 } 4184 4185 // This may be called multiple times, and nothing prevents creating multiple 4186 // objects at the same offset. See if we already defined this object. 
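// Fixed objects live at negative frame indices, so the loop below only has
// to scan [getObjectIndexBegin(), 0) before falling back to creating one.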
4187 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, 4188 int64_t Offset) { 4189 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) { 4190 if (MFI.getObjectOffset(I) == Offset) { 4191 assert(MFI.getObjectSize(I) == Size); 4192 return I; 4193 } 4194 } 4195 4196 return MFI.CreateFixedObject(Size, Offset, true); 4197 } 4198 4199 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG, 4200 EVT VT, 4201 const SDLoc &SL, 4202 int64_t Offset) const { 4203 MachineFunction &MF = DAG.getMachineFunction(); 4204 MachineFrameInfo &MFI = MF.getFrameInfo(); 4205 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset); 4206 4207 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset); 4208 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32); 4209 4210 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4, 4211 MachineMemOperand::MODereferenceable | 4212 MachineMemOperand::MOInvariant); 4213 } 4214 4215 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG, 4216 const SDLoc &SL, 4217 SDValue Chain, 4218 SDValue ArgVal, 4219 int64_t Offset) const { 4220 MachineFunction &MF = DAG.getMachineFunction(); 4221 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset); 4222 4223 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32); 4224 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4, 4225 MachineMemOperand::MODereferenceable); 4226 return Store; 4227 } 4228 4229 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, 4230 const TargetRegisterClass *RC, 4231 EVT VT, const SDLoc &SL, 4232 const ArgDescriptor &Arg) const { 4233 assert(Arg && "Attempting to load missing argument"); 4234 4235 SDValue V = Arg.isRegister() ? 4236 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) : 4237 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); 4238 4239 if (!Arg.isMasked()) 4240 return V; 4241 4242 unsigned Mask = Arg.getMask(); 4243 unsigned Shift = countTrailingZeros<unsigned>(Mask); 4244 V = DAG.getNode(ISD::SRL, SL, VT, V, 4245 DAG.getShiftAmountConstant(Shift, VT, SL)); 4246 return DAG.getNode(ISD::AND, SL, VT, V, 4247 DAG.getConstant(Mask >> Shift, SL, VT)); 4248 } 4249 4250 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( 4251 const MachineFunction &MF, const ImplicitParameter Param) const { 4252 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); 4253 const AMDGPUSubtarget &ST = 4254 AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction()); 4255 unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction()); 4256 const Align Alignment = ST.getAlignmentForImplicitArgPtr(); 4257 uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) + 4258 ExplicitArgOffset; 4259 switch (Param) { 4260 case GRID_DIM: 4261 return ArgOffset; 4262 case GRID_OFFSET: 4263 return ArgOffset + 4; 4264 } 4265 llvm_unreachable("unexpected implicit parameter type"); 4266 } 4267 4268 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; 4269 4270 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { 4271 switch ((AMDGPUISD::NodeType)Opcode) { 4272 case AMDGPUISD::FIRST_NUMBER: break; 4273 // AMDIL DAG nodes 4274 NODE_NAME_CASE(UMUL); 4275 NODE_NAME_CASE(BRANCH_COND); 4276 4277 // AMDGPU DAG nodes 4278 NODE_NAME_CASE(IF) 4279 NODE_NAME_CASE(ELSE) 4280 NODE_NAME_CASE(LOOP) 4281 NODE_NAME_CASE(CALL) 4282 NODE_NAME_CASE(TC_RETURN) 4283 NODE_NAME_CASE(TRAP) 4284 NODE_NAME_CASE(RET_FLAG) 4285 NODE_NAME_CASE(RETURN_TO_EPILOG) 4286 
NODE_NAME_CASE(ENDPGM) 4287 NODE_NAME_CASE(DWORDADDR) 4288 NODE_NAME_CASE(FRACT) 4289 NODE_NAME_CASE(SETCC) 4290 NODE_NAME_CASE(SETREG) 4291 NODE_NAME_CASE(DENORM_MODE) 4292 NODE_NAME_CASE(FMA_W_CHAIN) 4293 NODE_NAME_CASE(FMUL_W_CHAIN) 4294 NODE_NAME_CASE(CLAMP) 4295 NODE_NAME_CASE(COS_HW) 4296 NODE_NAME_CASE(SIN_HW) 4297 NODE_NAME_CASE(FMAX_LEGACY) 4298 NODE_NAME_CASE(FMIN_LEGACY) 4299 NODE_NAME_CASE(FMAX3) 4300 NODE_NAME_CASE(SMAX3) 4301 NODE_NAME_CASE(UMAX3) 4302 NODE_NAME_CASE(FMIN3) 4303 NODE_NAME_CASE(SMIN3) 4304 NODE_NAME_CASE(UMIN3) 4305 NODE_NAME_CASE(FMED3) 4306 NODE_NAME_CASE(SMED3) 4307 NODE_NAME_CASE(UMED3) 4308 NODE_NAME_CASE(FDOT2) 4309 NODE_NAME_CASE(URECIP) 4310 NODE_NAME_CASE(DIV_SCALE) 4311 NODE_NAME_CASE(DIV_FMAS) 4312 NODE_NAME_CASE(DIV_FIXUP) 4313 NODE_NAME_CASE(FMAD_FTZ) 4314 NODE_NAME_CASE(TRIG_PREOP) 4315 NODE_NAME_CASE(RCP) 4316 NODE_NAME_CASE(RSQ) 4317 NODE_NAME_CASE(RCP_LEGACY) 4318 NODE_NAME_CASE(RCP_IFLAG) 4319 NODE_NAME_CASE(FMUL_LEGACY) 4320 NODE_NAME_CASE(RSQ_CLAMP) 4321 NODE_NAME_CASE(LDEXP) 4322 NODE_NAME_CASE(FP_CLASS) 4323 NODE_NAME_CASE(DOT4) 4324 NODE_NAME_CASE(CARRY) 4325 NODE_NAME_CASE(BORROW) 4326 NODE_NAME_CASE(BFE_U32) 4327 NODE_NAME_CASE(BFE_I32) 4328 NODE_NAME_CASE(BFI) 4329 NODE_NAME_CASE(BFM) 4330 NODE_NAME_CASE(FFBH_U32) 4331 NODE_NAME_CASE(FFBH_I32) 4332 NODE_NAME_CASE(FFBL_B32) 4333 NODE_NAME_CASE(MUL_U24) 4334 NODE_NAME_CASE(MUL_I24) 4335 NODE_NAME_CASE(MULHI_U24) 4336 NODE_NAME_CASE(MULHI_I24) 4337 NODE_NAME_CASE(MUL_LOHI_U24) 4338 NODE_NAME_CASE(MUL_LOHI_I24) 4339 NODE_NAME_CASE(MAD_U24) 4340 NODE_NAME_CASE(MAD_I24) 4341 NODE_NAME_CASE(MAD_I64_I32) 4342 NODE_NAME_CASE(MAD_U64_U32) 4343 NODE_NAME_CASE(PERM) 4344 NODE_NAME_CASE(TEXTURE_FETCH) 4345 NODE_NAME_CASE(R600_EXPORT) 4346 NODE_NAME_CASE(CONST_ADDRESS) 4347 NODE_NAME_CASE(REGISTER_LOAD) 4348 NODE_NAME_CASE(REGISTER_STORE) 4349 NODE_NAME_CASE(SAMPLE) 4350 NODE_NAME_CASE(SAMPLEB) 4351 NODE_NAME_CASE(SAMPLED) 4352 NODE_NAME_CASE(SAMPLEL) 4353 NODE_NAME_CASE(CVT_F32_UBYTE0) 4354 NODE_NAME_CASE(CVT_F32_UBYTE1) 4355 NODE_NAME_CASE(CVT_F32_UBYTE2) 4356 NODE_NAME_CASE(CVT_F32_UBYTE3) 4357 NODE_NAME_CASE(CVT_PKRTZ_F16_F32) 4358 NODE_NAME_CASE(CVT_PKNORM_I16_F32) 4359 NODE_NAME_CASE(CVT_PKNORM_U16_F32) 4360 NODE_NAME_CASE(CVT_PK_I16_I32) 4361 NODE_NAME_CASE(CVT_PK_U16_U32) 4362 NODE_NAME_CASE(FP_TO_FP16) 4363 NODE_NAME_CASE(FP16_ZEXT) 4364 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) 4365 NODE_NAME_CASE(CONST_DATA_PTR) 4366 NODE_NAME_CASE(PC_ADD_REL_OFFSET) 4367 NODE_NAME_CASE(LDS) 4368 NODE_NAME_CASE(DUMMY_CHAIN) 4369 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; 4370 NODE_NAME_CASE(LOAD_D16_HI) 4371 NODE_NAME_CASE(LOAD_D16_LO) 4372 NODE_NAME_CASE(LOAD_D16_HI_I8) 4373 NODE_NAME_CASE(LOAD_D16_HI_U8) 4374 NODE_NAME_CASE(LOAD_D16_LO_I8) 4375 NODE_NAME_CASE(LOAD_D16_LO_U8) 4376 NODE_NAME_CASE(STORE_MSKOR) 4377 NODE_NAME_CASE(LOAD_CONSTANT) 4378 NODE_NAME_CASE(TBUFFER_STORE_FORMAT) 4379 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) 4380 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) 4381 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) 4382 NODE_NAME_CASE(DS_ORDERED_COUNT) 4383 NODE_NAME_CASE(ATOMIC_CMP_SWAP) 4384 NODE_NAME_CASE(ATOMIC_INC) 4385 NODE_NAME_CASE(ATOMIC_DEC) 4386 NODE_NAME_CASE(ATOMIC_LOAD_FMIN) 4387 NODE_NAME_CASE(ATOMIC_LOAD_FMAX) 4388 NODE_NAME_CASE(BUFFER_LOAD) 4389 NODE_NAME_CASE(BUFFER_LOAD_UBYTE) 4390 NODE_NAME_CASE(BUFFER_LOAD_USHORT) 4391 NODE_NAME_CASE(BUFFER_LOAD_BYTE) 4392 NODE_NAME_CASE(BUFFER_LOAD_SHORT) 4393 NODE_NAME_CASE(BUFFER_LOAD_FORMAT) 4394 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) 4395 
NODE_NAME_CASE(SBUFFER_LOAD) 4396 NODE_NAME_CASE(BUFFER_STORE) 4397 NODE_NAME_CASE(BUFFER_STORE_BYTE) 4398 NODE_NAME_CASE(BUFFER_STORE_SHORT) 4399 NODE_NAME_CASE(BUFFER_STORE_FORMAT) 4400 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) 4401 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) 4402 NODE_NAME_CASE(BUFFER_ATOMIC_ADD) 4403 NODE_NAME_CASE(BUFFER_ATOMIC_SUB) 4404 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN) 4405 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN) 4406 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX) 4407 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX) 4408 NODE_NAME_CASE(BUFFER_ATOMIC_AND) 4409 NODE_NAME_CASE(BUFFER_ATOMIC_OR) 4410 NODE_NAME_CASE(BUFFER_ATOMIC_XOR) 4411 NODE_NAME_CASE(BUFFER_ATOMIC_INC) 4412 NODE_NAME_CASE(BUFFER_ATOMIC_DEC) 4413 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) 4414 NODE_NAME_CASE(BUFFER_ATOMIC_FADD) 4415 NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD) 4416 NODE_NAME_CASE(ATOMIC_PK_FADD) 4417 4418 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; 4419 } 4420 return nullptr; 4421 } 4422 4423 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand, 4424 SelectionDAG &DAG, int Enabled, 4425 int &RefinementSteps, 4426 bool &UseOneConstNR, 4427 bool Reciprocal) const { 4428 EVT VT = Operand.getValueType(); 4429 4430 if (VT == MVT::f32) { 4431 RefinementSteps = 0; 4432 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand); 4433 } 4434 4435 // TODO: There is also f64 rsq instruction, but the documentation is less 4436 // clear on its precision. 4437 4438 return SDValue(); 4439 } 4440 4441 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, 4442 SelectionDAG &DAG, int Enabled, 4443 int &RefinementSteps) const { 4444 EVT VT = Operand.getValueType(); 4445 4446 if (VT == MVT::f32) { 4447 // Reciprocal, < 1 ulp error. 4448 // 4449 // This reciprocal approximation converges to < 0.5 ulp error with one 4450 // newton rhapson performed with two fused multiple adds (FMAs). 4451 4452 RefinementSteps = 0; 4453 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand); 4454 } 4455 4456 // TODO: There is also f64 rcp instruction, but the documentation is less 4457 // clear on its precision. 4458 4459 return SDValue(); 4460 } 4461 4462 void AMDGPUTargetLowering::computeKnownBitsForTargetNode( 4463 const SDValue Op, KnownBits &Known, 4464 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { 4465 4466 Known.resetAll(); // Don't know anything. 4467 4468 unsigned Opc = Op.getOpcode(); 4469 4470 switch (Opc) { 4471 default: 4472 break; 4473 case AMDGPUISD::CARRY: 4474 case AMDGPUISD::BORROW: { 4475 Known.Zero = APInt::getHighBitsSet(32, 31); 4476 break; 4477 } 4478 4479 case AMDGPUISD::BFE_I32: 4480 case AMDGPUISD::BFE_U32: { 4481 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 4482 if (!CWidth) 4483 return; 4484 4485 uint32_t Width = CWidth->getZExtValue() & 0x1f; 4486 4487 if (Opc == AMDGPUISD::BFE_U32) 4488 Known.Zero = APInt::getHighBitsSet(32, 32 - Width); 4489 4490 break; 4491 } 4492 case AMDGPUISD::FP_TO_FP16: 4493 case AMDGPUISD::FP16_ZEXT: { 4494 unsigned BitWidth = Known.getBitWidth(); 4495 4496 // High bits are zero. 
4497 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16); 4498 break; 4499 } 4500 case AMDGPUISD::MUL_U24: 4501 case AMDGPUISD::MUL_I24: { 4502 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 4503 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); 4504 unsigned TrailZ = LHSKnown.countMinTrailingZeros() + 4505 RHSKnown.countMinTrailingZeros(); 4506 Known.Zero.setLowBits(std::min(TrailZ, 32u)); 4507 // Skip extra check if all bits are known zeros. 4508 if (TrailZ >= 32) 4509 break; 4510 4511 // Truncate to 24 bits. 4512 LHSKnown = LHSKnown.trunc(24); 4513 RHSKnown = RHSKnown.trunc(24); 4514 4515 if (Opc == AMDGPUISD::MUL_I24) { 4516 unsigned LHSValBits = 24 - LHSKnown.countMinSignBits(); 4517 unsigned RHSValBits = 24 - RHSKnown.countMinSignBits(); 4518 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u); 4519 if (MaxValBits >= 32) 4520 break; 4521 bool LHSNegative = LHSKnown.isNegative(); 4522 bool LHSNonNegative = LHSKnown.isNonNegative(); 4523 bool LHSPositive = LHSKnown.isStrictlyPositive(); 4524 bool RHSNegative = RHSKnown.isNegative(); 4525 bool RHSNonNegative = RHSKnown.isNonNegative(); 4526 bool RHSPositive = RHSKnown.isStrictlyPositive(); 4527 4528 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative)) 4529 Known.Zero.setHighBits(32 - MaxValBits); 4530 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative)) 4531 Known.One.setHighBits(32 - MaxValBits); 4532 } else { 4533 unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros(); 4534 unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros(); 4535 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u); 4536 if (MaxValBits >= 32) 4537 break; 4538 Known.Zero.setHighBits(32 - MaxValBits); 4539 } 4540 break; 4541 } 4542 case AMDGPUISD::PERM: { 4543 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 4544 if (!CMask) 4545 return; 4546 4547 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 4548 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); 4549 unsigned Sel = CMask->getZExtValue(); 4550 4551 for (unsigned I = 0; I < 32; I += 8) { 4552 unsigned SelBits = Sel & 0xff; 4553 if (SelBits < 4) { 4554 SelBits *= 8; 4555 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; 4556 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; 4557 } else if (SelBits < 7) { 4558 SelBits = (SelBits & 3) * 8; 4559 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; 4560 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; 4561 } else if (SelBits == 0x0c) { 4562 Known.Zero |= 0xFFull << I; 4563 } else if (SelBits > 0x0c) { 4564 Known.One |= 0xFFull << I; 4565 } 4566 Sel >>= 8; 4567 } 4568 break; 4569 } 4570 case AMDGPUISD::BUFFER_LOAD_UBYTE: { 4571 Known.Zero.setHighBits(24); 4572 break; 4573 } 4574 case AMDGPUISD::BUFFER_LOAD_USHORT: { 4575 Known.Zero.setHighBits(16); 4576 break; 4577 } 4578 case AMDGPUISD::LDS: { 4579 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode()); 4580 unsigned Align = GA->getGlobal()->getAlignment(); 4581 4582 Known.Zero.setHighBits(16); 4583 if (Align) 4584 Known.Zero.setLowBits(Log2_32(Align)); 4585 break; 4586 } 4587 case ISD::INTRINSIC_WO_CHAIN: { 4588 unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4589 switch (IID) { 4590 case Intrinsic::amdgcn_mbcnt_lo: 4591 case Intrinsic::amdgcn_mbcnt_hi: { 4592 const GCNSubtarget &ST = 4593 
DAG.getMachineFunction().getSubtarget<GCNSubtarget>(); 4594 // These return at most the wavefront size - 1. 4595 unsigned Size = Op.getValueType().getSizeInBits(); 4596 Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2()); 4597 break; 4598 } 4599 default: 4600 break; 4601 } 4602 } 4603 } 4604 } 4605 4606 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( 4607 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, 4608 unsigned Depth) const { 4609 switch (Op.getOpcode()) { 4610 case AMDGPUISD::BFE_I32: { 4611 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 4612 if (!Width) 4613 return 1; 4614 4615 unsigned SignBits = 32 - Width->getZExtValue() + 1; 4616 if (!isNullConstant(Op.getOperand(1))) 4617 return SignBits; 4618 4619 // TODO: Could probably figure something out with non-0 offsets. 4620 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); 4621 return std::max(SignBits, Op0SignBits); 4622 } 4623 4624 case AMDGPUISD::BFE_U32: { 4625 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 4626 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1; 4627 } 4628 4629 case AMDGPUISD::CARRY: 4630 case AMDGPUISD::BORROW: 4631 return 31; 4632 case AMDGPUISD::BUFFER_LOAD_BYTE: 4633 return 25; 4634 case AMDGPUISD::BUFFER_LOAD_SHORT: 4635 return 17; 4636 case AMDGPUISD::BUFFER_LOAD_UBYTE: 4637 return 24; 4638 case AMDGPUISD::BUFFER_LOAD_USHORT: 4639 return 16; 4640 case AMDGPUISD::FP_TO_FP16: 4641 case AMDGPUISD::FP16_ZEXT: 4642 return 16; 4643 default: 4644 return 1; 4645 } 4646 } 4647 4648 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr( 4649 GISelKnownBits &Analysis, Register R, 4650 const APInt &DemandedElts, const MachineRegisterInfo &MRI, 4651 unsigned Depth) const { 4652 const MachineInstr *MI = MRI.getVRegDef(R); 4653 if (!MI) 4654 return 1; 4655 4656 // TODO: Check range metadata on MMO. 4657 switch (MI->getOpcode()) { 4658 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 4659 return 25; 4660 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 4661 return 17; 4662 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 4663 return 24; 4664 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 4665 return 16; 4666 default: 4667 return 1; 4668 } 4669 } 4670 4671 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, 4672 const SelectionDAG &DAG, 4673 bool SNaN, 4674 unsigned Depth) const { 4675 unsigned Opcode = Op.getOpcode(); 4676 switch (Opcode) { 4677 case AMDGPUISD::FMIN_LEGACY: 4678 case AMDGPUISD::FMAX_LEGACY: { 4679 if (SNaN) 4680 return true; 4681 4682 // TODO: Can check no nans on one of the operands for each one, but which 4683 // one? 
4684 return false; 4685 } 4686 case AMDGPUISD::FMUL_LEGACY: 4687 case AMDGPUISD::CVT_PKRTZ_F16_F32: { 4688 if (SNaN) 4689 return true; 4690 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && 4691 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); 4692 } 4693 case AMDGPUISD::FMED3: 4694 case AMDGPUISD::FMIN3: 4695 case AMDGPUISD::FMAX3: 4696 case AMDGPUISD::FMAD_FTZ: { 4697 if (SNaN) 4698 return true; 4699 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && 4700 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 4701 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1); 4702 } 4703 case AMDGPUISD::CVT_F32_UBYTE0: 4704 case AMDGPUISD::CVT_F32_UBYTE1: 4705 case AMDGPUISD::CVT_F32_UBYTE2: 4706 case AMDGPUISD::CVT_F32_UBYTE3: 4707 return true; 4708 4709 case AMDGPUISD::RCP: 4710 case AMDGPUISD::RSQ: 4711 case AMDGPUISD::RCP_LEGACY: 4712 case AMDGPUISD::RSQ_CLAMP: { 4713 if (SNaN) 4714 return true; 4715 4716 // TODO: Need is known positive check. 4717 return false; 4718 } 4719 case AMDGPUISD::LDEXP: 4720 case AMDGPUISD::FRACT: { 4721 if (SNaN) 4722 return true; 4723 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); 4724 } 4725 case AMDGPUISD::DIV_SCALE: 4726 case AMDGPUISD::DIV_FMAS: 4727 case AMDGPUISD::DIV_FIXUP: 4728 case AMDGPUISD::TRIG_PREOP: 4729 // TODO: Refine on operands. 4730 return SNaN; 4731 case AMDGPUISD::SIN_HW: 4732 case AMDGPUISD::COS_HW: { 4733 // TODO: Need check for infinity 4734 return SNaN; 4735 } 4736 case ISD::INTRINSIC_WO_CHAIN: { 4737 unsigned IntrinsicID 4738 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4739 // TODO: Handle more intrinsics 4740 switch (IntrinsicID) { 4741 case Intrinsic::amdgcn_cubeid: 4742 return true; 4743 4744 case Intrinsic::amdgcn_frexp_mant: { 4745 if (SNaN) 4746 return true; 4747 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); 4748 } 4749 case Intrinsic::amdgcn_cvt_pkrtz: { 4750 if (SNaN) 4751 return true; 4752 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 4753 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1); 4754 } 4755 case Intrinsic::amdgcn_rcp: 4756 case Intrinsic::amdgcn_rsq: 4757 case Intrinsic::amdgcn_rcp_legacy: 4758 case Intrinsic::amdgcn_rsq_legacy: 4759 case Intrinsic::amdgcn_rsq_clamp: { 4760 if (SNaN) 4761 return true; 4762 4763 // TODO: Need is known positive check. 4764 return false; 4765 } 4766 case Intrinsic::amdgcn_fdot2: 4767 // TODO: Refine on operand 4768 return SNaN; 4769 default: 4770 return false; 4771 } 4772 } 4773 default: 4774 return false; 4775 } 4776 } 4777 4778 TargetLowering::AtomicExpansionKind 4779 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { 4780 switch (RMW->getOperation()) { 4781 case AtomicRMWInst::Nand: 4782 case AtomicRMWInst::FAdd: 4783 case AtomicRMWInst::FSub: 4784 return AtomicExpansionKind::CmpXChg; 4785 default: 4786 return AtomicExpansionKind::None; 4787 } 4788 } 4789