//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), })
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
    .scalarize(0)
    .alwaysLegal();

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

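  // G_PTRTOINT uses the same approach as G_INTTOPTR above, but adjusts the
  // integer result (type index 0) to the pointer's size instead of the
  // integer source.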
  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return IsLoad ? 512 : 128;
    default:
      return 128;
    }
  };

  const auto needToSplitMemOp = [=](const LegalityQuery &Query, bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
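    // Each entry below is {register type, pointer type, memory size in bits,
    // minimum alignment in bits}. An alignment of 0 (the GlobalAlign* values
    // on subtargets with unaligned buffer access) accepts any alignment.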
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
      .customIf(typeIs(1, Constant32Ptr))
      .narrowScalarIf(
        [=](const LegalityQuery &Query) -> bool {
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          const unsigned DstSize = DstTy.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;

          // Split extloads.
          if (DstSize > MemSize)
            return std::make_pair(0, LLT::scalar(MemSize));

          if (DstSize > 32 && (DstSize % 32 != 0)) {
            // FIXME: Need a way to specify non-extload of larger size if
            // suitably aligned.
            return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
          }

          unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);
          if (MemSize > MaxSize)
            return std::make_pair(0, LLT::scalar(MaxSize));

          unsigned Align = Query.MMODescrs[0].AlignInBits;
          return std::make_pair(0, LLT::scalar(Align));
        })
      .fewerElementsIf(
        [=](const LegalityQuery &Query) -> bool {
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          LLT EltTy = DstTy.getElementType();
          unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                 Op == G_LOAD);

          // Split if it's too large for the address space.
          if (Query.MMODescrs[0].SizeInBits > MaxSize) {
            unsigned NumElts = DstTy.getNumElements();
            unsigned EltSize = EltTy.getSizeInBits();

            if (MaxSize % EltSize == 0) {
              return std::make_pair(
                0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
            }

            unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

            // FIXME: Refine when odd breakdowns handled
            // The scalars will need to be re-legalized.
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::make_pair(0, EltTy);

            return std::make_pair(0,
                                  LLT::vector(NumElts / NumPieces, EltTy));
          }

          // Need to split because of alignment.
          unsigned Align = Query.MMODescrs[0].AlignInBits;
          unsigned EltSize = EltTy.getSizeInBits();
          if (EltSize > Align &&
              (EltSize / Align < DstTy.getNumElements())) {
            return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
          }

          // May need relegalization for the scalars.
          return std::make_pair(0, EltTy);
        })
      .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
      .legalIf([=](const LegalityQuery &Query) {
        const LLT Ty0 = Query.Types[0];
        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        unsigned Align = Query.MMODescrs[0].AlignInBits;

        // FIXME: Widening store from alignment not valid.
        if (MemSize < Size)
          MemSize = std::max(MemSize, Align);

        // No extending vector loads.
        if (Size > MemSize && Ty0.isVector())
          return false;

        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;
        case 96:
          return ST.hasDwordx3LoadStores();
        case 256:
        case 512:
          return true;
        default:
          return false;
        }
      })
      .widenScalarToNextPow2(0)
      // TODO: v3s32->v4s32 with alignment
      .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                     .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                {S32, GlobalPtr, 16, 2 * 8},
                                                {S32, LocalPtr, 8, 8},
                                                {S32, LocalPtr, 16, 16},
                                                {S32, PrivatePtr, 8, 8},
                                                {S32, PrivatePtr, 16, 16},
                                                {S32, ConstantPtr, 8, 8},
                                                {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
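  // Uniform selects are selected to S_CSELECT, whose condition comes from SCC
  // and is copied through an s32 value; divergent selects become V_CNDMASK
  // with an s1 condition in VCC.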
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    // TODO: Support 16-bit shift amounts
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)

      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 once above 128,
        // whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into a
    // pointer of another address space. Merge operands are required to be the
    // same type, but creating an extra ptrtoint would be kind of pointless.
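    // Form the full 64-bit constant address by placing the 32-bit source in
    // the low half and the known high bits (AddrHiVal) in the high half.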
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
    B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
    B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
    B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
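  // Adding and then subtracting 2^52 (with Src's sign applied by the copysign
  // above) forces rounding to the nearest integer in double precision. Inputs
  // with |Src| > 0x1.fffffffffffffp+51 have no fractional bits, so the
  // compare/select below returns them unchanged.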
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  // amdgcn.ubfe takes (source, offset, width); the high half of the f64 is the
  // source the biased exponent is extracted from.
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
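  // The truncation clears the (52 - Exp) low fraction bits of Src by AND'ing
  // with the inverse of FractMask >> Exp. Exp < 0 means |Src| < 1, so the
  // result is the signed zero built below; Exp > 51 means Src is already an
  // integer and is returned unchanged.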
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  // Convert each 32-bit half separately, scale the converted high half by
  // 2^32 with ldexp, and add the unsigned conversion of the low half.
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // K0 = 2^-32 and K1 = -2^32. FloorMul recovers the high 32 bits of the
  // truncated value; the fma then computes the remaining low 32 bits as
  // Trunc - FloorMul * 2^32.
  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));

  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);

  auto Hi = Signed ?
    B.buildFPTOSI(S32, FloorMul) :
    B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  B.buildMerge(Dst, { Lo.getReg(0), Hi.getReg(0) });
  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
  assert(Mask.size() == 2);

  // If one half is undef, the other is trivially in the same reg.
1673 if (Mask[0] == -1 || Mask[1] == -1) 1674 return true; 1675 return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) || 1676 ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3)); 1677 } 1678 1679 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1680 MachineInstr &MI, MachineRegisterInfo &MRI, 1681 MachineIRBuilder &B) const { 1682 const LLT V2S16 = LLT::vector(2, 16); 1683 1684 Register Dst = MI.getOperand(0).getReg(); 1685 Register Src0 = MI.getOperand(1).getReg(); 1686 LLT DstTy = MRI.getType(Dst); 1687 LLT SrcTy = MRI.getType(Src0); 1688 1689 if (SrcTy == V2S16 && DstTy == V2S16 && 1690 isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1691 return true; 1692 1693 MachineIRBuilder HelperBuilder(MI); 1694 GISelObserverWrapper DummyObserver; 1695 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1696 HelperBuilder.setInstr(MI); 1697 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1698 } 1699 1700 bool AMDGPULegalizerInfo::legalizeSinCos( 1701 MachineInstr &MI, MachineRegisterInfo &MRI, 1702 MachineIRBuilder &B) const { 1703 B.setInstr(MI); 1704 1705 Register DstReg = MI.getOperand(0).getReg(); 1706 Register SrcReg = MI.getOperand(1).getReg(); 1707 LLT Ty = MRI.getType(DstReg); 1708 unsigned Flags = MI.getFlags(); 1709 1710 Register TrigVal; 1711 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1712 if (ST.hasTrigReducedRange()) { 1713 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1714 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1715 .addUse(MulVal.getReg(0)) 1716 .setMIFlags(Flags).getReg(0); 1717 } else 1718 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1719 1720 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1721 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1722 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1723 .addUse(TrigVal) 1724 .setMIFlags(Flags); 1725 MI.eraseFromParent(); 1726 return true; 1727 } 1728 1729 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1730 Register DstReg, LLT PtrTy, 1731 MachineIRBuilder &B, const GlobalValue *GV, 1732 unsigned Offset, unsigned GAFlags) const { 1733 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1734 // to the following code sequence: 1735 // 1736 // For constant address space: 1737 // s_getpc_b64 s[0:1] 1738 // s_add_u32 s0, s0, $symbol 1739 // s_addc_u32 s1, s1, 0 1740 // 1741 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1742 // a fixup or relocation is emitted to replace $symbol with a literal 1743 // constant, which is a pc-relative offset from the encoding of the $symbol 1744 // operand to the global variable. 1745 // 1746 // For global address space: 1747 // s_getpc_b64 s[0:1] 1748 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1749 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1750 // 1751 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1752 // fixups or relocations are emitted to replace $symbol@*@lo and 1753 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1754 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1755 // operand to the global variable. 
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
      MachineMemOperand::MOInvariant,
      8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
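    // The GOT entry itself is a 64-bit constant-address pointer, so load the
    // full pointer and then extract the low 32 bits for the narrower result
    // type.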
1852 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1853 B.buildExtract(DstReg, Load, 0); 1854 } else 1855 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1856 1857 MI.eraseFromParent(); 1858 return true; 1859 } 1860 1861 bool AMDGPULegalizerInfo::legalizeLoad( 1862 MachineInstr &MI, MachineRegisterInfo &MRI, 1863 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 1864 B.setInstr(MI); 1865 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1866 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 1867 Observer.changingInstr(MI); 1868 MI.getOperand(1).setReg(Cast.getReg(0)); 1869 Observer.changedInstr(MI); 1870 return true; 1871 } 1872 1873 bool AMDGPULegalizerInfo::legalizeFMad( 1874 MachineInstr &MI, MachineRegisterInfo &MRI, 1875 MachineIRBuilder &B) const { 1876 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 1877 assert(Ty.isScalar()); 1878 1879 MachineFunction &MF = B.getMF(); 1880 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1881 1882 // TODO: Always legal with future ftz flag. 1883 // FIXME: Do we need just output? 1884 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 1885 return true; 1886 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 1887 return true; 1888 1889 MachineIRBuilder HelperBuilder(MI); 1890 GISelObserverWrapper DummyObserver; 1891 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1892 HelperBuilder.setMBB(*MI.getParent()); 1893 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 1894 } 1895 1896 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 1897 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 1898 Register DstReg = MI.getOperand(0).getReg(); 1899 Register PtrReg = MI.getOperand(1).getReg(); 1900 Register CmpVal = MI.getOperand(2).getReg(); 1901 Register NewVal = MI.getOperand(3).getReg(); 1902 1903 assert(SITargetLowering::isFlatGlobalAddrSpace( 1904 MRI.getType(PtrReg).getAddressSpace()) && 1905 "this should not have been custom lowered"); 1906 1907 LLT ValTy = MRI.getType(CmpVal); 1908 LLT VecTy = LLT::vector(2, ValTy); 1909 1910 B.setInstr(MI); 1911 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 1912 1913 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 1914 .addDef(DstReg) 1915 .addUse(PtrReg) 1916 .addUse(PackedVal) 1917 .setMemRefs(MI.memoperands()); 1918 1919 MI.eraseFromParent(); 1920 return true; 1921 } 1922 1923 bool AMDGPULegalizerInfo::legalizeFlog( 1924 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 1925 Register Dst = MI.getOperand(0).getReg(); 1926 Register Src = MI.getOperand(1).getReg(); 1927 LLT Ty = B.getMRI()->getType(Dst); 1928 unsigned Flags = MI.getFlags(); 1929 B.setInstr(MI); 1930 1931 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 1932 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 1933 1934 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 1935 MI.eraseFromParent(); 1936 return true; 1937 } 1938 1939 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 1940 MachineIRBuilder &B) const { 1941 Register Dst = MI.getOperand(0).getReg(); 1942 Register Src = MI.getOperand(1).getReg(); 1943 unsigned Flags = MI.getFlags(); 1944 LLT Ty = B.getMRI()->getType(Dst); 1945 B.setInstr(MI); 1946 1947 auto K = B.buildFConstant(Ty, numbers::log2e); 1948 auto Mul = B.buildFMul(Ty, Src, K, Flags); 1949 B.buildFExp2(Dst, Mul, Flags); 1950 1951 MI.eraseFromParent(); 1952 return true; 1953 } 1954 1955 // Return the use branch 
instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineInstr *&Br) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  if (UseMI.getParent() != MI.getParent() ||
      UseMI.getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR
  MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
  if (Next != MI.getParent()->end()) {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
  }

  return &UseMI;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
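  // If no copy from the physical argument register exists yet, materialize one
  // at the top of the entry block so every use of the live-in vreg is
  // dominated by it, then restore the original insertion point.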
2021 if (!MRI.getVRegDef(LiveIn)) { 2022 // FIXME: Should have scoped insert pt 2023 MachineBasicBlock &OrigInsBB = B.getMBB(); 2024 auto OrigInsPt = B.getInsertPt(); 2025 2026 MachineBasicBlock &EntryMBB = B.getMF().front(); 2027 EntryMBB.addLiveIn(Arg->getRegister()); 2028 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2029 B.buildCopy(LiveIn, Arg->getRegister()); 2030 2031 B.setInsertPt(OrigInsBB, OrigInsPt); 2032 } 2033 2034 return true; 2035 } 2036 2037 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2038 MachineInstr &MI, 2039 MachineRegisterInfo &MRI, 2040 MachineIRBuilder &B, 2041 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2042 B.setInstr(MI); 2043 2044 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2045 2046 const ArgDescriptor *Arg; 2047 const TargetRegisterClass *RC; 2048 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2049 if (!Arg) { 2050 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2051 return false; 2052 } 2053 2054 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 2055 MI.eraseFromParent(); 2056 return true; 2057 } 2058 2059 return false; 2060 } 2061 2062 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2063 MachineRegisterInfo &MRI, 2064 MachineIRBuilder &B) const { 2065 B.setInstr(MI); 2066 Register Dst = MI.getOperand(0).getReg(); 2067 LLT DstTy = MRI.getType(Dst); 2068 LLT S16 = LLT::scalar(16); 2069 LLT S32 = LLT::scalar(32); 2070 LLT S64 = LLT::scalar(64); 2071 2072 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2073 return true; 2074 2075 if (DstTy == S16) 2076 return legalizeFDIV16(MI, MRI, B); 2077 if (DstTy == S32) 2078 return legalizeFDIV32(MI, MRI, B); 2079 if (DstTy == S64) 2080 return legalizeFDIV64(MI, MRI, B); 2081 2082 return false; 2083 } 2084 2085 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2086 MachineRegisterInfo &MRI, 2087 MachineIRBuilder &B) const { 2088 Register Res = MI.getOperand(0).getReg(); 2089 Register LHS = MI.getOperand(1).getReg(); 2090 Register RHS = MI.getOperand(2).getReg(); 2091 2092 uint16_t Flags = MI.getFlags(); 2093 2094 LLT ResTy = MRI.getType(Res); 2095 LLT S32 = LLT::scalar(32); 2096 LLT S64 = LLT::scalar(64); 2097 2098 const MachineFunction &MF = B.getMF(); 2099 bool Unsafe = 2100 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2101 2102 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2103 return false; 2104 2105 if (!Unsafe && ResTy == S32 && 2106 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2107 return false; 2108 2109 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2110 // 1 / x -> RCP(x) 2111 if (CLHS->isExactlyValue(1.0)) { 2112 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2113 .addUse(RHS) 2114 .setMIFlags(Flags); 2115 2116 MI.eraseFromParent(); 2117 return true; 2118 } 2119 2120 // -1 / x -> RCP( FNEG(x) ) 2121 if (CLHS->isExactlyValue(-1.0)) { 2122 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2123 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2124 .addUse(FNeg.getReg(0)) 2125 .setMIFlags(Flags); 2126 2127 MI.eraseFromParent(); 2128 return true; 2129 } 2130 } 2131 2132 // x / y -> x * (1.0 / y) 2133 if (Unsafe) { 2134 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2135 .addUse(RHS) 2136 .setMIFlags(Flags); 2137 B.buildFMul(Res, LHS, RCP, Flags); 2138 2139 MI.eraseFromParent(); 2140 return true; 2141 } 2142 2143 return false; 2144 } 2145 2146 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2147 MachineRegisterInfo &MRI, 2148 
MachineIRBuilder &B) const { 2149 B.setInstr(MI); 2150 Register Res = MI.getOperand(0).getReg(); 2151 Register LHS = MI.getOperand(1).getReg(); 2152 Register RHS = MI.getOperand(2).getReg(); 2153 2154 uint16_t Flags = MI.getFlags(); 2155 2156 LLT S16 = LLT::scalar(16); 2157 LLT S32 = LLT::scalar(32); 2158 2159 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2160 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2161 2162 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2163 .addUse(RHSExt.getReg(0)) 2164 .setMIFlags(Flags); 2165 2166 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2167 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2168 2169 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2170 .addUse(RDst.getReg(0)) 2171 .addUse(RHS) 2172 .addUse(LHS) 2173 .setMIFlags(Flags); 2174 2175 MI.eraseFromParent(); 2176 return true; 2177 } 2178 2179 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2180 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2181 static void toggleSPDenormMode(bool Enable, 2182 MachineIRBuilder &B, 2183 const GCNSubtarget &ST, 2184 AMDGPU::SIModeRegisterDefaults Mode) { 2185 // Set SP denorm mode to this value. 2186 unsigned SPDenormMode = 2187 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2188 2189 if (ST.hasDenormModeInst()) { 2190 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2191 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2192 2193 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2194 B.buildInstr(AMDGPU::S_DENORM_MODE) 2195 .addImm(NewDenormModeValue); 2196 2197 } else { 2198 // Select FP32 bit field in mode register. 2199 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2200 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2201 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2202 2203 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2204 .addImm(SPDenormMode) 2205 .addImm(SPDenormModeBitField); 2206 } 2207 } 2208 2209 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2210 MachineRegisterInfo &MRI, 2211 MachineIRBuilder &B) const { 2212 B.setInstr(MI); 2213 Register Res = MI.getOperand(0).getReg(); 2214 Register LHS = MI.getOperand(1).getReg(); 2215 Register RHS = MI.getOperand(2).getReg(); 2216 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2217 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2218 2219 uint16_t Flags = MI.getFlags(); 2220 2221 LLT S32 = LLT::scalar(32); 2222 LLT S1 = LLT::scalar(1); 2223 2224 auto One = B.buildFConstant(S32, 1.0f); 2225 2226 auto DenominatorScaled = 2227 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2228 .addUse(RHS) 2229 .addUse(LHS) 2230 .addImm(1) 2231 .setMIFlags(Flags); 2232 auto NumeratorScaled = 2233 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2234 .addUse(LHS) 2235 .addUse(RHS) 2236 .addImm(0) 2237 .setMIFlags(Flags); 2238 2239 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2240 .addUse(DenominatorScaled.getReg(0)) 2241 .setMIFlags(Flags); 2242 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2243 2244 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2245 // aren't modeled as reading it. 
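  // The sequence below is the usual div_scale/div_fmas/div_fixup expansion.
  // Roughly, with n = NumeratorScaled, d = DenominatorScaled and r ~= 1/d from
  // the rcp approximation:
  //   e   = 1 - d * r        (Fma0)
  //   r'  = r + r * e        (Fma1)
  //   q   = n * r'           (Mul)
  //   err = n - d * q        (Fma2, recomputed as Fma4 after the refinement)
  //   q'  = q + err * r'     (Fma3)
  // div_fmas and div_fixup then produce the final result and handle the
  // special cases. FP32 denormals are temporarily enabled around the fma chain
  // so the intermediate values are not flushed.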
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.
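    // Recover the condition manually: compare the high 32 bits of the inputs
    // against the high 32 bits of the two div_scale results, and use the XOR
    // of the two compares as the scale operand for div_fmas.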
2321 2322 LLT S32 = LLT::scalar(32); 2323 2324 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2325 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2326 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2327 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2328 2329 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2330 Scale1Unmerge.getReg(1)); 2331 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2332 Scale0Unmerge.getReg(1)); 2333 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2334 } else { 2335 Scale = DivScale1.getReg(1); 2336 } 2337 2338 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2339 .addUse(Fma4.getReg(0)) 2340 .addUse(Fma3.getReg(0)) 2341 .addUse(Mul.getReg(0)) 2342 .addUse(Scale) 2343 .setMIFlags(Flags); 2344 2345 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2346 .addUse(Fmas.getReg(0)) 2347 .addUse(RHS) 2348 .addUse(LHS) 2349 .setMIFlags(Flags); 2350 2351 MI.eraseFromParent(); 2352 return true; 2353 } 2354 2355 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2356 MachineRegisterInfo &MRI, 2357 MachineIRBuilder &B) const { 2358 B.setInstr(MI); 2359 Register Res = MI.getOperand(0).getReg(); 2360 Register LHS = MI.getOperand(2).getReg(); 2361 Register RHS = MI.getOperand(3).getReg(); 2362 uint16_t Flags = MI.getFlags(); 2363 2364 LLT S32 = LLT::scalar(32); 2365 LLT S1 = LLT::scalar(1); 2366 2367 auto Abs = B.buildFAbs(S32, RHS, Flags); 2368 const APFloat C0Val(1.0f); 2369 2370 auto C0 = B.buildConstant(S32, 0x6f800000); 2371 auto C1 = B.buildConstant(S32, 0x2f800000); 2372 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2373 2374 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2375 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2376 2377 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2378 2379 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2380 .addUse(Mul0.getReg(0)) 2381 .setMIFlags(Flags); 2382 2383 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2384 2385 B.buildFMul(Res, Sel, Mul1, Flags); 2386 2387 MI.eraseFromParent(); 2388 return true; 2389 } 2390 2391 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2392 MachineRegisterInfo &MRI, 2393 MachineIRBuilder &B) const { 2394 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2395 if (!MFI->isEntryFunction()) { 2396 return legalizePreloadedArgIntrin(MI, MRI, B, 2397 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2398 } 2399 2400 B.setInstr(MI); 2401 2402 uint64_t Offset = 2403 ST.getTargetLowering()->getImplicitParameterOffset( 2404 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2405 Register DstReg = MI.getOperand(0).getReg(); 2406 LLT DstTy = MRI.getType(DstReg); 2407 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2408 2409 const ArgDescriptor *Arg; 2410 const TargetRegisterClass *RC; 2411 std::tie(Arg, RC) 2412 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2413 if (!Arg) 2414 return false; 2415 2416 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2417 if (!loadInputValue(KernargPtrReg, B, Arg)) 2418 return false; 2419 2420 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2421 MI.eraseFromParent(); 2422 return true; 2423 } 2424 2425 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2426 MachineRegisterInfo &MRI, 2427 MachineIRBuilder &B, 2428 unsigned AddrSpace) const { 2429 B.setInstr(MI); 2430 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2431 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2432 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2433 MI.eraseFromParent(); 2434 return true; 2435 } 2436 2437 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2438 // offset (the offset that is included in bounds checking and swizzling, to be 2439 // split between the instruction's voffset and immoffset fields) and soffset 2440 // (the offset that is excluded from bounds checking and swizzling, to go in 2441 // the instruction's soffset field). This function takes the first kind of 2442 // offset and figures out how to split it between voffset and immoffset. 2443 std::tuple<Register, unsigned, unsigned> 2444 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2445 Register OrigOffset) const { 2446 const unsigned MaxImm = 4095; 2447 Register BaseReg; 2448 unsigned TotalConstOffset; 2449 MachineInstr *OffsetDef; 2450 const LLT S32 = LLT::scalar(32); 2451 2452 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2453 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2454 2455 unsigned ImmOffset = TotalConstOffset; 2456 2457 // If the immediate value is too big for the immoffset field, put the value 2458 // and -4096 into the immoffset field so that the value that is copied/added 2459 // for the voffset field is a multiple of 4096, and it stands more chance 2460 // of being CSEd with the copy/add for another similar load/store. 2461 // However, do not do that rounding down to a multiple of 4096 if that is a 2462 // negative number, as it appears to be illegal to have a negative offset 2463 // in the vgpr, even if adding the immediate offset makes it positive. 2464 unsigned Overflow = ImmOffset & ~MaxImm; 2465 ImmOffset -= Overflow; 2466 if ((int32_t)Overflow < 0) { 2467 Overflow += ImmOffset; 2468 ImmOffset = 0; 2469 } 2470 2471 if (Overflow != 0) { 2472 if (!BaseReg) { 2473 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2474 } else { 2475 auto OverflowVal = B.buildConstant(S32, Overflow); 2476 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2477 } 2478 } 2479 2480 if (!BaseReg) 2481 BaseReg = B.buildConstant(S32, 0).getReg(0); 2482 2483 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2484 } 2485 2486 /// Handle register layout difference for f16 images for some subtargets. 2487 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2488 MachineRegisterInfo &MRI, 2489 Register Reg) const { 2490 if (!ST.hasUnpackedD16VMem()) 2491 return Reg; 2492 2493 const LLT S16 = LLT::scalar(16); 2494 const LLT S32 = LLT::scalar(32); 2495 LLT StoreVT = MRI.getType(Reg); 2496 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2497 2498 auto Unmerge = B.buildUnmerge(S16, Reg); 2499 2500 SmallVector<Register, 4> WideRegs; 2501 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2502 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2503 2504 int NumElts = StoreVT.getNumElements(); 2505 2506 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2507 } 2508 2509 Register AMDGPULegalizerInfo::fixStoreSourceType( 2510 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2511 MachineRegisterInfo *MRI = B.getMRI(); 2512 LLT Ty = MRI->getType(VData); 2513 2514 const LLT S16 = LLT::scalar(16); 2515 2516 // Fixup illegal register types for i8 stores. 
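  // 8-bit and 16-bit scalar stores take a 32-bit source register; the buffer
  // instruction only uses the low bits, so an any-extend of the value is
  // sufficient here.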
2517 if (Ty == LLT::scalar(8) || Ty == S16) { 2518 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2519 return AnyExt; 2520 } 2521 2522 if (Ty.isVector()) { 2523 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2524 if (IsFormat) 2525 return handleD16VData(B, *MRI, VData); 2526 } 2527 } 2528 2529 return VData; 2530 } 2531 2532 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2533 MachineRegisterInfo &MRI, 2534 MachineIRBuilder &B, 2535 bool IsTyped, 2536 bool IsFormat) const { 2537 B.setInstr(MI); 2538 2539 Register VData = MI.getOperand(1).getReg(); 2540 LLT Ty = MRI.getType(VData); 2541 LLT EltTy = Ty.getScalarType(); 2542 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2543 const LLT S32 = LLT::scalar(32); 2544 2545 VData = fixStoreSourceType(B, VData, IsFormat); 2546 Register RSrc = MI.getOperand(2).getReg(); 2547 2548 MachineMemOperand *MMO = *MI.memoperands_begin(); 2549 const int MemSize = MMO->getSize(); 2550 2551 unsigned ImmOffset; 2552 unsigned TotalOffset; 2553 2554 // The typed intrinsics add an immediate after the registers. 2555 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2556 2557 // The struct intrinsic variants add one additional operand over raw. 2558 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2559 Register VIndex; 2560 int OpOffset = 0; 2561 if (HasVIndex) { 2562 VIndex = MI.getOperand(3).getReg(); 2563 OpOffset = 1; 2564 } 2565 2566 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2567 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2568 2569 unsigned Format = 0; 2570 if (IsTyped) { 2571 Format = MI.getOperand(5 + OpOffset).getImm(); 2572 ++OpOffset; 2573 } 2574 2575 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2576 2577 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2578 if (TotalOffset != 0) 2579 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2580 2581 unsigned Opc; 2582 if (IsTyped) { 2583 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 2584 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 2585 } else if (IsFormat) { 2586 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 2587 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 2588 } else { 2589 switch (MemSize) { 2590 case 1: 2591 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 2592 break; 2593 case 2: 2594 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 2595 break; 2596 default: 2597 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 2598 break; 2599 } 2600 } 2601 2602 if (!VIndex) 2603 VIndex = B.buildConstant(S32, 0).getReg(0); 2604 2605 auto MIB = B.buildInstr(Opc) 2606 .addUse(VData) // vdata 2607 .addUse(RSrc) // rsrc 2608 .addUse(VIndex) // vindex 2609 .addUse(VOffset) // voffset 2610 .addUse(SOffset) // soffset 2611 .addImm(ImmOffset); // offset(imm) 2612 2613 if (IsTyped) 2614 MIB.addImm(Format); 2615 2616 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2617 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2618 .addMemOperand(MMO); 2619 2620 MI.eraseFromParent(); 2621 return true; 2622 } 2623 2624 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 2625 MachineRegisterInfo &MRI, 2626 MachineIRBuilder &B, 2627 bool IsFormat, 2628 bool IsTyped) const { 2629 B.setInstr(MI); 2630 2631 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
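  // The intrinsic is rewritten to one of the target buffer-load pseudos below:
  // the combined offset is split into a voffset register plus a 12-bit
  // immediate, and sub-dword or unpacked d16 results are loaded into a wider
  // temporary register that is narrowed back afterwards.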
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)  // vdata
    .addUse(RSrc)        // rsrc
    .addUse(VIndex)      // vindex
    .addUse(VOffset)     // voffset
    .addUse(SOffset)     // soffset
    .addImm(ImmOffset);  // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0)  // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The load result was written to a widened register; narrow it back to the
    // originally requested type.
2729 if (IsExtLoad) 2730 B.buildTrunc(Dst, LoadDstReg); 2731 else { 2732 // Repack to original 16-bit vector result 2733 // FIXME: G_TRUNC should work, but legalization currently fails 2734 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 2735 SmallVector<Register, 4> Repack; 2736 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 2737 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 2738 B.buildMerge(Dst, Repack); 2739 } 2740 } 2741 2742 MI.eraseFromParent(); 2743 return true; 2744 } 2745 2746 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 2747 MachineIRBuilder &B, 2748 bool IsInc) const { 2749 B.setInstr(MI); 2750 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 2751 AMDGPU::G_AMDGPU_ATOMIC_DEC; 2752 B.buildInstr(Opc) 2753 .addDef(MI.getOperand(0).getReg()) 2754 .addUse(MI.getOperand(2).getReg()) 2755 .addUse(MI.getOperand(3).getReg()) 2756 .cloneMemRefs(MI); 2757 MI.eraseFromParent(); 2758 return true; 2759 } 2760 2761 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 2762 switch (IntrID) { 2763 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 2764 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 2765 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 2766 case Intrinsic::amdgcn_raw_buffer_atomic_add: 2767 case Intrinsic::amdgcn_struct_buffer_atomic_add: 2768 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 2769 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 2770 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 2771 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 2772 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 2773 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 2774 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 2775 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 2776 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 2777 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 2778 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 2779 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 2780 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 2781 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 2782 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 2783 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 2784 case Intrinsic::amdgcn_raw_buffer_atomic_and: 2785 case Intrinsic::amdgcn_struct_buffer_atomic_and: 2786 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 2787 case Intrinsic::amdgcn_raw_buffer_atomic_or: 2788 case Intrinsic::amdgcn_struct_buffer_atomic_or: 2789 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 2790 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 2791 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 2792 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 2793 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 2794 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 2795 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 2796 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 2797 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 2798 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 2799 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 2800 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 2801 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 2802 default: 2803 llvm_unreachable("unhandled atomic opcode"); 2804 } 2805 } 2806 2807 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 2808 MachineIRBuilder &B, 2809 Intrinsic::ID IID) const { 2810 B.setInstr(MI); 2811 2812 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 2813 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 2814 2815 Register Dst = MI.getOperand(0).getReg(); 2816 Register VData = 
MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout.
  if (!ST.hasUnpackedD16VMem())
    return true;

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  if (BaseOpcode->Atomic) // No d16 atomics
    return true;

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  if (BaseOpcode->Store) {
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
    Observer.changedInstr(MI);
    return true;
  }

  // Must be an image load.
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  if (!Ty.isVector() || Ty.getElementType() != S16)
    return true;

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  LLT WidenedTy = Ty.changeElementType(S32);
  Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);

  Observer.changingInstr(MI);
  MI.getOperand(0).setReg(WideDstReg);
  Observer.changedInstr(MI);

  // FIXME: Just a vector trunc should be sufficient, but legalization is
  // currently broken.
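  // Unmerge the widened (unpacked) result into 32-bit pieces, truncate each
  // piece to s16, and rebuild the vector in the original narrow type.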
2922 auto Unmerge = B.buildUnmerge(S32, WideDstReg); 2923 2924 int NumOps = Unmerge->getNumOperands() - 1; 2925 SmallVector<Register, 4> RemergeParts(NumOps); 2926 for (int I = 0; I != NumOps; ++I) 2927 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0); 2928 2929 B.buildBuildVector(DstReg, RemergeParts); 2930 return true; 2931 } 2932 2933 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 2934 MachineIRBuilder &B, 2935 GISelChangeObserver &Observer) const { 2936 MachineRegisterInfo &MRI = *B.getMRI(); 2937 2938 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 2939 auto IntrID = MI.getIntrinsicID(); 2940 switch (IntrID) { 2941 case Intrinsic::amdgcn_if: 2942 case Intrinsic::amdgcn_else: { 2943 MachineInstr *Br = nullptr; 2944 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 2945 const SIRegisterInfo *TRI 2946 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2947 2948 B.setInstr(*BrCond); 2949 Register Def = MI.getOperand(1).getReg(); 2950 Register Use = MI.getOperand(3).getReg(); 2951 2952 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 2953 if (Br) 2954 BrTarget = Br->getOperand(0).getMBB(); 2955 2956 if (IntrID == Intrinsic::amdgcn_if) { 2957 B.buildInstr(AMDGPU::SI_IF) 2958 .addDef(Def) 2959 .addUse(Use) 2960 .addMBB(BrTarget); 2961 } else { 2962 B.buildInstr(AMDGPU::SI_ELSE) 2963 .addDef(Def) 2964 .addUse(Use) 2965 .addMBB(BrTarget) 2966 .addImm(0); 2967 } 2968 2969 if (Br) 2970 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 2971 2972 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 2973 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 2974 MI.eraseFromParent(); 2975 BrCond->eraseFromParent(); 2976 return true; 2977 } 2978 2979 return false; 2980 } 2981 case Intrinsic::amdgcn_loop: { 2982 MachineInstr *Br = nullptr; 2983 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 2984 const SIRegisterInfo *TRI 2985 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2986 2987 B.setInstr(*BrCond); 2988 2989 // FIXME: Need to adjust branch targets based on unconditional branch. 
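      // Rewrite the conditional branch as SI_LOOP on the mask operand of the
      // intrinsic, branching to the target of the original G_BRCOND.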
2990 Register Reg = MI.getOperand(2).getReg(); 2991 B.buildInstr(AMDGPU::SI_LOOP) 2992 .addUse(Reg) 2993 .addMBB(BrCond->getOperand(1).getMBB()); 2994 MI.eraseFromParent(); 2995 BrCond->eraseFromParent(); 2996 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 2997 return true; 2998 } 2999 3000 return false; 3001 } 3002 case Intrinsic::amdgcn_kernarg_segment_ptr: 3003 return legalizePreloadedArgIntrin( 3004 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3005 case Intrinsic::amdgcn_implicitarg_ptr: 3006 return legalizeImplicitArgPtr(MI, MRI, B); 3007 case Intrinsic::amdgcn_workitem_id_x: 3008 return legalizePreloadedArgIntrin(MI, MRI, B, 3009 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 3010 case Intrinsic::amdgcn_workitem_id_y: 3011 return legalizePreloadedArgIntrin(MI, MRI, B, 3012 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 3013 case Intrinsic::amdgcn_workitem_id_z: 3014 return legalizePreloadedArgIntrin(MI, MRI, B, 3015 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 3016 case Intrinsic::amdgcn_workgroup_id_x: 3017 return legalizePreloadedArgIntrin(MI, MRI, B, 3018 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 3019 case Intrinsic::amdgcn_workgroup_id_y: 3020 return legalizePreloadedArgIntrin(MI, MRI, B, 3021 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 3022 case Intrinsic::amdgcn_workgroup_id_z: 3023 return legalizePreloadedArgIntrin(MI, MRI, B, 3024 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 3025 case Intrinsic::amdgcn_dispatch_ptr: 3026 return legalizePreloadedArgIntrin(MI, MRI, B, 3027 AMDGPUFunctionArgInfo::DISPATCH_PTR); 3028 case Intrinsic::amdgcn_queue_ptr: 3029 return legalizePreloadedArgIntrin(MI, MRI, B, 3030 AMDGPUFunctionArgInfo::QUEUE_PTR); 3031 case Intrinsic::amdgcn_implicit_buffer_ptr: 3032 return legalizePreloadedArgIntrin( 3033 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 3034 case Intrinsic::amdgcn_dispatch_id: 3035 return legalizePreloadedArgIntrin(MI, MRI, B, 3036 AMDGPUFunctionArgInfo::DISPATCH_ID); 3037 case Intrinsic::amdgcn_fdiv_fast: 3038 return legalizeFDIVFastIntrin(MI, MRI, B); 3039 case Intrinsic::amdgcn_is_shared: 3040 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 3041 case Intrinsic::amdgcn_is_private: 3042 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 3043 case Intrinsic::amdgcn_wavefrontsize: { 3044 B.setInstr(MI); 3045 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 3046 MI.eraseFromParent(); 3047 return true; 3048 } 3049 case Intrinsic::amdgcn_raw_buffer_store: 3050 case Intrinsic::amdgcn_struct_buffer_store: 3051 return legalizeBufferStore(MI, MRI, B, false, false); 3052 case Intrinsic::amdgcn_raw_buffer_store_format: 3053 case Intrinsic::amdgcn_struct_buffer_store_format: 3054 return legalizeBufferStore(MI, MRI, B, false, true); 3055 case Intrinsic::amdgcn_raw_tbuffer_store: 3056 case Intrinsic::amdgcn_struct_tbuffer_store: 3057 return legalizeBufferStore(MI, MRI, B, true, true); 3058 case Intrinsic::amdgcn_raw_buffer_load: 3059 case Intrinsic::amdgcn_struct_buffer_load: 3060 return legalizeBufferLoad(MI, MRI, B, false, false); 3061 case Intrinsic::amdgcn_raw_buffer_load_format: 3062 case Intrinsic::amdgcn_struct_buffer_load_format: 3063 return legalizeBufferLoad(MI, MRI, B, true, false); 3064 case Intrinsic::amdgcn_raw_tbuffer_load: 3065 case Intrinsic::amdgcn_struct_tbuffer_load: 3066 return legalizeBufferLoad(MI, MRI, B, true, true); 3067 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3068 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3069 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3070 case 
Intrinsic::amdgcn_struct_buffer_atomic_add: 3071 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3072 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3073 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3074 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3075 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3076 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3077 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3078 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3079 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3080 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3081 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3082 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3083 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3084 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3085 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3086 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3087 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3088 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3089 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3090 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3091 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3092 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3093 return legalizeBufferAtomic(MI, B, IntrID); 3094 case Intrinsic::amdgcn_atomic_inc: 3095 return legalizeAtomicIncDec(MI, B, true); 3096 case Intrinsic::amdgcn_atomic_dec: 3097 return legalizeAtomicIncDec(MI, B, false); 3098 default: { 3099 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 3100 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 3101 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 3102 return true; 3103 } 3104 } 3105 3106 return true; 3107 } 3108