1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the MachineLegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #if defined(_MSC_VER) || defined(__MINGW32__) 15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI 16 // from the Visual C++ cmath / math.h headers: 17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 18 #define _USE_MATH_DEFINES 19 #endif 20 21 #include "AMDGPU.h" 22 #include "AMDGPULegalizerInfo.h" 23 #include "AMDGPUTargetMachine.h" 24 #include "SIMachineFunctionInfo.h" 25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/CodeGen/TargetOpcodes.h" 28 #include "llvm/CodeGen/ValueTypes.h" 29 #include "llvm/IR/DerivedTypes.h" 30 #include "llvm/IR/DiagnosticInfo.h" 31 #include "llvm/IR/Type.h" 32 #include "llvm/Support/Debug.h" 33 34 #define DEBUG_TYPE "amdgpu-legalinfo" 35 36 using namespace llvm; 37 using namespace LegalizeActions; 38 using namespace LegalizeMutations; 39 using namespace LegalityPredicates; 40 41 42 static LegalityPredicate isMultiple32(unsigned TypeIdx, 43 unsigned MaxSize = 1024) { 44 return [=](const LegalityQuery &Query) { 45 const LLT Ty = Query.Types[TypeIdx]; 46 const LLT EltTy = Ty.getScalarType(); 47 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; 48 }; 49 } 50 51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { 52 return [=](const LegalityQuery &Query) { 53 return Query.Types[TypeIdx].getSizeInBits() == Size; 54 }; 55 } 56 57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 58 return [=](const LegalityQuery &Query) { 59 const LLT Ty = Query.Types[TypeIdx]; 60 return Ty.isVector() && 61 Ty.getNumElements() % 2 != 0 && 62 Ty.getElementType().getSizeInBits() < 32 && 63 Ty.getSizeInBits() % 32 != 0; 64 }; 65 } 66 67 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 68 return [=](const LegalityQuery &Query) { 69 const LLT Ty = Query.Types[TypeIdx]; 70 const LLT EltTy = Ty.getScalarType(); 71 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 72 }; 73 } 74 75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 76 return [=](const LegalityQuery &Query) { 77 const LLT Ty = Query.Types[TypeIdx]; 78 const LLT EltTy = Ty.getElementType(); 79 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 80 }; 81 } 82 83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 84 return [=](const LegalityQuery &Query) { 85 const LLT Ty = Query.Types[TypeIdx]; 86 const LLT EltTy = Ty.getElementType(); 87 unsigned Size = Ty.getSizeInBits(); 88 unsigned Pieces = (Size + 63) / 64; 89 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 90 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 91 }; 92 } 93 94 // Increase the number of vector elements to reach the next multiple of 32-bit 95 // type.
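// For example, a v3s16 (48 bits) is widened to v4s16 (64 bits), the next
// vector of the same element type whose total size is a multiple of 32 bits.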
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 97 return [=](const LegalityQuery &Query) { 98 const LLT Ty = Query.Types[TypeIdx]; 99 100 const LLT EltTy = Ty.getElementType(); 101 const int Size = Ty.getSizeInBits(); 102 const int EltSize = EltTy.getSizeInBits(); 103 const int NextMul32 = (Size + 31) / 32; 104 105 assert(EltSize < 32); 106 107 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 108 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 109 }; 110 } 111 112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 113 return [=](const LegalityQuery &Query) { 114 const LLT QueryTy = Query.Types[TypeIdx]; 115 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 116 }; 117 } 118 119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 120 return [=](const LegalityQuery &Query) { 121 const LLT QueryTy = Query.Types[TypeIdx]; 122 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 123 }; 124 } 125 126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 127 return [=](const LegalityQuery &Query) { 128 const LLT QueryTy = Query.Types[TypeIdx]; 129 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 130 }; 131 } 132 133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 134 // v2s16. 135 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 136 return [=](const LegalityQuery &Query) { 137 const LLT Ty = Query.Types[TypeIdx]; 138 if (Ty.isVector()) { 139 const int EltSize = Ty.getElementType().getSizeInBits(); 140 return EltSize == 32 || EltSize == 64 || 141 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 142 EltSize == 128 || EltSize == 256; 143 } 144 145 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 146 }; 147 } 148 149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 150 return [=](const LegalityQuery &Query) { 151 return Query.Types[TypeIdx].getElementType() == Type; 152 }; 153 } 154 155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 156 return [=](const LegalityQuery &Query) { 157 const LLT Ty = Query.Types[TypeIdx]; 158 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 159 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 160 }; 161 } 162 163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 164 const GCNTargetMachine &TM) 165 : ST(ST_) { 166 using namespace TargetOpcode; 167 168 auto GetAddrSpacePtr = [&TM](unsigned AS) { 169 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 170 }; 171 172 const LLT S1 = LLT::scalar(1); 173 const LLT S8 = LLT::scalar(8); 174 const LLT S16 = LLT::scalar(16); 175 const LLT S32 = LLT::scalar(32); 176 const LLT S64 = LLT::scalar(64); 177 const LLT S96 = LLT::scalar(96); 178 const LLT S128 = LLT::scalar(128); 179 const LLT S256 = LLT::scalar(256); 180 const LLT S1024 = LLT::scalar(1024); 181 182 const LLT V2S16 = LLT::vector(2, 16); 183 const LLT V4S16 = LLT::vector(4, 16); 184 185 const LLT V2S32 = LLT::vector(2, 32); 186 const LLT V3S32 = LLT::vector(3, 32); 187 const LLT V4S32 = LLT::vector(4, 32); 188 const LLT V5S32 = LLT::vector(5, 32); 189 const LLT V6S32 = LLT::vector(6, 32); 190 const LLT V7S32 = LLT::vector(7, 32); 191 const LLT V8S32 = LLT::vector(8, 32); 192 const LLT V9S32 = LLT::vector(9, 32); 193 const LLT V10S32 = LLT::vector(10, 32); 194 const LLT V11S32 = LLT::vector(11, 32); 195 const LLT V12S32 = LLT::vector(12, 32); 196 const LLT V13S32 = LLT::vector(13, 32); 197 const LLT V14S32 = 
LLT::vector(14, 32); 198 const LLT V15S32 = LLT::vector(15, 32); 199 const LLT V16S32 = LLT::vector(16, 32); 200 const LLT V32S32 = LLT::vector(32, 32); 201 202 const LLT V2S64 = LLT::vector(2, 64); 203 const LLT V3S64 = LLT::vector(3, 64); 204 const LLT V4S64 = LLT::vector(4, 64); 205 const LLT V5S64 = LLT::vector(5, 64); 206 const LLT V6S64 = LLT::vector(6, 64); 207 const LLT V7S64 = LLT::vector(7, 64); 208 const LLT V8S64 = LLT::vector(8, 64); 209 const LLT V16S64 = LLT::vector(16, 64); 210 211 std::initializer_list<LLT> AllS32Vectors = 212 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 213 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 214 std::initializer_list<LLT> AllS64Vectors = 215 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 216 217 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 218 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 219 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 220 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 221 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 222 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 223 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 224 225 const LLT CodePtr = FlatPtr; 226 227 const std::initializer_list<LLT> AddrSpaces64 = { 228 GlobalPtr, ConstantPtr, FlatPtr 229 }; 230 231 const std::initializer_list<LLT> AddrSpaces32 = { 232 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 233 }; 234 235 const std::initializer_list<LLT> FPTypesBase = { 236 S32, S64 237 }; 238 239 const std::initializer_list<LLT> FPTypes16 = { 240 S32, S64, S16 241 }; 242 243 const std::initializer_list<LLT> FPTypesPK16 = { 244 S32, S64, S16, V2S16 245 }; 246 247 setAction({G_BRCOND, S1}, Legal); // VCC branches 248 setAction({G_BRCOND, S32}, Legal); // SCC branches 249 250 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 251 // elements for v3s16 252 getActionDefinitionsBuilder(G_PHI) 253 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 254 .legalFor(AllS32Vectors) 255 .legalFor(AllS64Vectors) 256 .legalFor(AddrSpaces64) 257 .legalFor(AddrSpaces32) 258 .clampScalar(0, S32, S256) 259 .widenScalarToNextPow2(0, 32) 260 .clampMaxNumElements(0, S32, 16) 261 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 262 .legalIf(isPointer(0)); 263 264 if (ST.has16BitInsts()) { 265 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 266 .legalFor({S32, S16}) 267 .clampScalar(0, S16, S32) 268 .scalarize(0); 269 } else { 270 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 271 .legalFor({S32}) 272 .clampScalar(0, S32, S32) 273 .scalarize(0); 274 } 275 276 // FIXME: Not really legal. Placeholder for custom lowering. 277 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 278 .legalFor({S32, S64}) 279 .clampScalar(0, S32, S64) 280 .widenScalarToNextPow2(0, 32) 281 .scalarize(0); 282 283 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 284 .legalFor({S32}) 285 .clampScalar(0, S32, S32) 286 .scalarize(0); 287 288 // Report legal for any types we can handle anywhere. For the cases only legal 289 // on the SALU, RegBankSelect will be able to re-legalize. 
290 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 291 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 292 .clampScalar(0, S32, S64) 293 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 294 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 295 .widenScalarToNextPow2(0) 296 .scalarize(0); 297 298 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 299 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 300 .legalFor({{S32, S1}, {S32, S32}}) 301 .clampScalar(0, S32, S32) 302 .scalarize(0); // TODO: Implement. 303 304 getActionDefinitionsBuilder({G_SADDO, G_SSUBO}) 305 .lower(); 306 307 getActionDefinitionsBuilder(G_BITCAST) 308 // Don't worry about the size constraint. 309 .legalIf(all(isRegisterType(0), isRegisterType(1))) 310 // FIXME: Testing hack 311 .legalForCartesianProduct({S16, LLT::vector(2, 8), }); 312 313 getActionDefinitionsBuilder(G_FCONSTANT) 314 .legalFor({S32, S64, S16}) 315 .clampScalar(0, S16, S64); 316 317 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 318 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 319 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 320 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 321 .clampScalarOrElt(0, S32, S1024) 322 .legalIf(isMultiple32(0)) 323 .widenScalarToNextPow2(0, 32) 324 .clampMaxNumElements(0, S32, 16); 325 326 327 // FIXME: i1 operands to intrinsics should always be legal, but other i1 328 // values may not be legal. We need to figure out how to distinguish 329 // between these two scenarios. 330 getActionDefinitionsBuilder(G_CONSTANT) 331 .legalFor({S1, S32, S64, S16, GlobalPtr, 332 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 333 .clampScalar(0, S32, S64) 334 .widenScalarToNextPow2(0) 335 .legalIf(isPointer(0)); 336 337 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 338 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 339 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); 340 341 342 auto &FPOpActions = getActionDefinitionsBuilder( 343 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 344 .legalFor({S32, S64}); 345 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 346 .customFor({S32, S64}); 347 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 348 .customFor({S32, S64}); 349 350 if (ST.has16BitInsts()) { 351 if (ST.hasVOP3PInsts()) 352 FPOpActions.legalFor({S16, V2S16}); 353 else 354 FPOpActions.legalFor({S16}); 355 356 TrigActions.customFor({S16}); 357 FDIVActions.customFor({S16}); 358 } 359 360 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 361 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 362 363 if (ST.hasVOP3PInsts()) { 364 MinNumMaxNum.customFor(FPTypesPK16) 365 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 366 .clampMaxNumElements(0, S16, 2) 367 .clampScalar(0, S16, S64) 368 .scalarize(0); 369 } else if (ST.has16BitInsts()) { 370 MinNumMaxNum.customFor(FPTypes16) 371 .clampScalar(0, S16, S64) 372 .scalarize(0); 373 } else { 374 MinNumMaxNum.customFor(FPTypesBase) 375 .clampScalar(0, S32, S64) 376 .scalarize(0); 377 } 378 379 if (ST.hasVOP3PInsts()) 380 FPOpActions.clampMaxNumElements(0, S16, 2); 381 382 FPOpActions 383 .scalarize(0) 384 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 385 386 TrigActions 387 .scalarize(0) 388 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 389 390 FDIVActions 391 .scalarize(0) 392 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 393 394 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 395 .legalFor(FPTypesPK16) 396 .clampMaxNumElements(0, S16, 2) 397 .scalarize(0) 398 .clampScalar(0, S16, S64); 399 400 // TODO: Implement 401 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); 402 403 if (ST.has16BitInsts()) { 404 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 405 .legalFor({S32, S64, S16}) 406 .scalarize(0) 407 .clampScalar(0, S16, S64); 408 } else { 409 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 410 .legalFor({S32, S64}) 411 .scalarize(0) 412 .clampScalar(0, S32, S64); 413 } 414 415 getActionDefinitionsBuilder(G_FPTRUNC) 416 .legalFor({{S32, S64}, {S16, S32}}) 417 .scalarize(0); 418 419 getActionDefinitionsBuilder(G_FPEXT) 420 .legalFor({{S64, S32}, {S32, S16}}) 421 .lowerFor({{S64, S16}}) // FIXME: Implement 422 .scalarize(0); 423 424 // TODO: Verify V_BFI_B32 is generated from expanded bit ops. 425 getActionDefinitionsBuilder(G_FCOPYSIGN).lower(); 426 427 getActionDefinitionsBuilder(G_FSUB) 428 // Use actual fsub instruction 429 .legalFor({S32}) 430 // Must use fadd + fneg 431 .lowerFor({S64, S16, V2S16}) 432 .scalarize(0) 433 .clampScalar(0, S32, S64); 434 435 // Whether this is legal depends on the floating point mode for the function. 436 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 437 if (ST.hasMadF16()) 438 FMad.customFor({S32, S16}); 439 else 440 FMad.customFor({S32}); 441 FMad.scalarize(0) 442 .lower(); 443 444 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 445 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 446 {S32, S1}, {S64, S1}, {S16, S1}, 447 {S96, S32}, 448 // FIXME: Hack 449 {S64, LLT::scalar(33)}, 450 {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}}) 451 .scalarize(0); 452 453 // TODO: Split s1->s64 during regbankselect for VALU. 
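  // The {s64, s64} case (f64 result from a 64-bit integer source) is custom
  // lowered in legalizeITOFP below: the source is split into two 32-bit halves,
  // the high half is converted and scaled by 2^32 via amdgcn_ldexp, and the
  // unsigned conversion of the low half is added in.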
454 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 455 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 456 .lowerFor({{S32, S64}}) 457 .lowerIf(typeIs(1, S1)) 458 .customFor({{S64, S64}}); 459 if (ST.has16BitInsts()) 460 IToFP.legalFor({{S16, S16}}); 461 IToFP.clampScalar(1, S32, S64) 462 .scalarize(0); 463 464 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 465 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}); 466 if (ST.has16BitInsts()) 467 FPToI.legalFor({{S16, S16}}); 468 else 469 FPToI.minScalar(1, S32); 470 471 FPToI.minScalar(0, S32) 472 .scalarize(0); 473 474 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 475 .scalarize(0) 476 .lower(); 477 478 if (ST.has16BitInsts()) { 479 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 480 .legalFor({S16, S32, S64}) 481 .clampScalar(0, S16, S64) 482 .scalarize(0); 483 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 484 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 485 .legalFor({S32, S64}) 486 .clampScalar(0, S32, S64) 487 .scalarize(0); 488 } else { 489 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 490 .legalFor({S32}) 491 .customFor({S64}) 492 .clampScalar(0, S32, S64) 493 .scalarize(0); 494 } 495 496 getActionDefinitionsBuilder(G_PTR_ADD) 497 .legalForCartesianProduct(AddrSpaces64, {S64}) 498 .legalForCartesianProduct(AddrSpaces32, {S32}) 499 .scalarize(0); 500 501 getActionDefinitionsBuilder(G_PTR_MASK) 502 .scalarize(0) 503 .alwaysLegal(); 504 505 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 506 507 auto &CmpBuilder = 508 getActionDefinitionsBuilder(G_ICMP) 509 // The compare output type differs based on the register bank of the output, 510 // so make both s1 and s32 legal. 511 // 512 // Scalar compares producing output in scc will be promoted to s32, as that 513 // is the allocatable register type that will be needed for the copy from 514 // scc. This will be promoted during RegBankSelect, and we assume something 515 // before that won't try to use s32 result types. 516 // 517 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 518 // bank. 519 .legalForCartesianProduct( 520 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 521 .legalForCartesianProduct( 522 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 523 if (ST.has16BitInsts()) { 524 CmpBuilder.legalFor({{S1, S16}}); 525 } 526 527 CmpBuilder 528 .widenScalarToNextPow2(1) 529 .clampScalar(1, S32, S64) 530 .scalarize(0) 531 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 532 533 getActionDefinitionsBuilder(G_FCMP) 534 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 535 .widenScalarToNextPow2(1) 536 .clampScalar(1, S32, S64) 537 .scalarize(0); 538 539 // FIXME: fexp, flog2, flog10 needs to be custom lowered. 540 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2, 541 G_FLOG, G_FLOG2, G_FLOG10}) 542 .legalFor({S32}) 543 .scalarize(0); 544 545 // The 64-bit versions produce 32-bit results, but only on the SALU. 
546 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, 547 G_CTTZ, G_CTTZ_ZERO_UNDEF, 548 G_CTPOP}) 549 .legalFor({{S32, S32}, {S32, S64}}) 550 .clampScalar(0, S32, S32) 551 .clampScalar(1, S32, S64) 552 .scalarize(0) 553 .widenScalarToNextPow2(0, 32) 554 .widenScalarToNextPow2(1, 32); 555 556 // TODO: Expand for > s32 557 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) 558 .legalFor({S32}) 559 .clampScalar(0, S32, S32) 560 .scalarize(0); 561 562 if (ST.has16BitInsts()) { 563 if (ST.hasVOP3PInsts()) { 564 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 565 .legalFor({S32, S16, V2S16}) 566 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 567 .clampMaxNumElements(0, S16, 2) 568 .clampScalar(0, S16, S32) 569 .widenScalarToNextPow2(0) 570 .scalarize(0); 571 } else { 572 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 573 .legalFor({S32, S16}) 574 .widenScalarToNextPow2(0) 575 .clampScalar(0, S16, S32) 576 .scalarize(0); 577 } 578 } else { 579 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 580 .legalFor({S32}) 581 .clampScalar(0, S32, S32) 582 .widenScalarToNextPow2(0) 583 .scalarize(0); 584 } 585 586 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 587 return [=](const LegalityQuery &Query) { 588 return Query.Types[TypeIdx0].getSizeInBits() < 589 Query.Types[TypeIdx1].getSizeInBits(); 590 }; 591 }; 592 593 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 594 return [=](const LegalityQuery &Query) { 595 return Query.Types[TypeIdx0].getSizeInBits() > 596 Query.Types[TypeIdx1].getSizeInBits(); 597 }; 598 }; 599 600 getActionDefinitionsBuilder(G_INTTOPTR) 601 // List the common cases 602 .legalForCartesianProduct(AddrSpaces64, {S64}) 603 .legalForCartesianProduct(AddrSpaces32, {S32}) 604 .scalarize(0) 605 // Accept any address space as long as the size matches 606 .legalIf(sameSize(0, 1)) 607 .widenScalarIf(smallerThan(1, 0), 608 [](const LegalityQuery &Query) { 609 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 610 }) 611 .narrowScalarIf(greaterThan(1, 0), 612 [](const LegalityQuery &Query) { 613 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 614 }); 615 616 getActionDefinitionsBuilder(G_PTRTOINT) 617 // List the common cases 618 .legalForCartesianProduct(AddrSpaces64, {S64}) 619 .legalForCartesianProduct(AddrSpaces32, {S32}) 620 .scalarize(0) 621 // Accept any address space as long as the size matches 622 .legalIf(sameSize(0, 1)) 623 .widenScalarIf(smallerThan(0, 1), 624 [](const LegalityQuery &Query) { 625 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 626 }) 627 .narrowScalarIf( 628 greaterThan(0, 1), 629 [](const LegalityQuery &Query) { 630 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 631 }); 632 633 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 634 .scalarize(0) 635 .custom(); 636 637 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 638 // handle some operations by just promoting the register during 639 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 640 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { 641 switch (AS) { 642 // FIXME: Private element size. 643 case AMDGPUAS::PRIVATE_ADDRESS: 644 return 32; 645 // FIXME: Check subtarget 646 case AMDGPUAS::LOCAL_ADDRESS: 647 return ST.useDS128() ? 128 : 64; 648 649 // Treat constant and global as identical. 
SMRD loads are sometimes usable 650 // for global loads (ideally constant address space should be eliminated) 651 // depending on the context. Legality cannot be context dependent, but 652 // RegBankSelect can split the load as necessary depending on the pointer 653 // register bank/uniformity and if the memory is invariant or not written in 654 // a kernel. 655 case AMDGPUAS::CONSTANT_ADDRESS: 656 case AMDGPUAS::GLOBAL_ADDRESS: 657 return 512; 658 default: 659 return 128; 660 } 661 }; 662 663 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { 664 const LLT DstTy = Query.Types[0]; 665 666 // Split vector extloads. 667 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 668 unsigned Align = Query.MMODescrs[0].AlignInBits; 669 670 if (MemSize < DstTy.getSizeInBits()) 671 MemSize = std::max(MemSize, Align); 672 673 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 674 return true; 675 676 const LLT PtrTy = Query.Types[1]; 677 unsigned AS = PtrTy.getAddressSpace(); 678 if (MemSize > maxSizeForAddrSpace(AS)) 679 return true; 680 681 // Catch weird sized loads that don't evenly divide into the access sizes 682 // TODO: May be able to widen depending on alignment etc. 683 unsigned NumRegs = MemSize / 32; 684 if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) 685 return true; 686 687 if (Align < MemSize) { 688 const SITargetLowering *TLI = ST.getTargetLowering(); 689 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 690 } 691 692 return false; 693 }; 694 695 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 696 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 697 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 698 699 // TODO: Refine based on subtargets which support unaligned access or 128-bit 700 // LDS 701 // TODO: Unsupported flat for SI. 702 703 for (unsigned Op : {G_LOAD, G_STORE}) { 704 const bool IsStore = Op == G_STORE; 705 706 auto &Actions = getActionDefinitionsBuilder(Op); 707 // Whitelist the common cases. 
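    // Each entry below is {value type, pointer type, memory size in bits,
    // minimum alignment in bits}; the Global* alignments become 0 (no
    // restriction) when the subtarget allows unaligned buffer access.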
708 // TODO: Pointer loads 709 // TODO: Wide constant loads 710 // TODO: Only CI+ has 3x loads 711 // TODO: Loads to s16 on gfx9 712 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 713 {V2S32, GlobalPtr, 64, GlobalAlign32}, 714 {V3S32, GlobalPtr, 96, GlobalAlign32}, 715 {S96, GlobalPtr, 96, GlobalAlign32}, 716 {V4S32, GlobalPtr, 128, GlobalAlign32}, 717 {S128, GlobalPtr, 128, GlobalAlign32}, 718 {S64, GlobalPtr, 64, GlobalAlign32}, 719 {V2S64, GlobalPtr, 128, GlobalAlign32}, 720 {V2S16, GlobalPtr, 32, GlobalAlign32}, 721 {S32, GlobalPtr, 8, GlobalAlign8}, 722 {S32, GlobalPtr, 16, GlobalAlign16}, 723 724 {S32, LocalPtr, 32, 32}, 725 {S64, LocalPtr, 64, 32}, 726 {V2S32, LocalPtr, 64, 32}, 727 {S32, LocalPtr, 8, 8}, 728 {S32, LocalPtr, 16, 16}, 729 {V2S16, LocalPtr, 32, 32}, 730 731 {S32, PrivatePtr, 32, 32}, 732 {S32, PrivatePtr, 8, 8}, 733 {S32, PrivatePtr, 16, 16}, 734 {V2S16, PrivatePtr, 32, 32}, 735 736 {S32, FlatPtr, 32, GlobalAlign32}, 737 {S32, FlatPtr, 16, GlobalAlign16}, 738 {S32, FlatPtr, 8, GlobalAlign8}, 739 {V2S16, FlatPtr, 32, GlobalAlign32}, 740 741 {S32, ConstantPtr, 32, GlobalAlign32}, 742 {V2S32, ConstantPtr, 64, GlobalAlign32}, 743 {V3S32, ConstantPtr, 96, GlobalAlign32}, 744 {V4S32, ConstantPtr, 128, GlobalAlign32}, 745 {S64, ConstantPtr, 64, GlobalAlign32}, 746 {S128, ConstantPtr, 128, GlobalAlign32}, 747 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 748 Actions 749 .customIf(typeIs(1, Constant32Ptr)) 750 .narrowScalarIf( 751 [=](const LegalityQuery &Query) -> bool { 752 return !Query.Types[0].isVector() && needToSplitLoad(Query); 753 }, 754 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 755 const LLT DstTy = Query.Types[0]; 756 const LLT PtrTy = Query.Types[1]; 757 758 const unsigned DstSize = DstTy.getSizeInBits(); 759 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 760 761 // Split extloads. 762 if (DstSize > MemSize) 763 return std::make_pair(0, LLT::scalar(MemSize)); 764 765 if (DstSize > 32 && (DstSize % 32 != 0)) { 766 // FIXME: Need a way to specify non-extload of larger size if 767 // suitably aligned. 768 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 769 } 770 771 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 772 if (MemSize > MaxSize) 773 return std::make_pair(0, LLT::scalar(MaxSize)); 774 775 unsigned Align = Query.MMODescrs[0].AlignInBits; 776 return std::make_pair(0, LLT::scalar(Align)); 777 }) 778 .fewerElementsIf( 779 [=](const LegalityQuery &Query) -> bool { 780 return Query.Types[0].isVector() && needToSplitLoad(Query); 781 }, 782 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 783 const LLT DstTy = Query.Types[0]; 784 const LLT PtrTy = Query.Types[1]; 785 786 LLT EltTy = DstTy.getElementType(); 787 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 788 789 // Split if it's too large for the address space. 790 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 791 unsigned NumElts = DstTy.getNumElements(); 792 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 793 794 // FIXME: Refine when odd breakdowns handled 795 // The scalars will need to be re-legalized. 796 if (NumPieces == 1 || NumPieces >= NumElts || 797 NumElts % NumPieces != 0) 798 return std::make_pair(0, EltTy); 799 800 return std::make_pair(0, 801 LLT::vector(NumElts / NumPieces, EltTy)); 802 } 803 804 // Need to split because of alignment. 
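          // When the element size exceeds the access alignment, split into
          // sub-vectors of EltSize / Align elements; otherwise fall back to
          // scalarizing.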
805 unsigned Align = Query.MMODescrs[0].AlignInBits; 806 unsigned EltSize = EltTy.getSizeInBits(); 807 if (EltSize > Align && 808 (EltSize / Align < DstTy.getNumElements())) { 809 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 810 } 811 812 // May need relegalization for the scalars. 813 return std::make_pair(0, EltTy); 814 }) 815 .minScalar(0, S32); 816 817 if (IsStore) 818 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 819 820 // TODO: Need a bitcast lower option? 821 Actions 822 .legalIf([=](const LegalityQuery &Query) { 823 const LLT Ty0 = Query.Types[0]; 824 unsigned Size = Ty0.getSizeInBits(); 825 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 826 unsigned Align = Query.MMODescrs[0].AlignInBits; 827 828 // FIXME: Widening store from alignment not valid. 829 if (MemSize < Size) 830 MemSize = std::max(MemSize, Align); 831 832 // No extending vector loads. 833 if (Size > MemSize && Ty0.isVector()) 834 return false; 835 836 switch (MemSize) { 837 case 8: 838 case 16: 839 return Size == 32; 840 case 32: 841 case 64: 842 case 128: 843 return true; 844 case 96: 845 return ST.hasDwordx3LoadStores(); 846 case 256: 847 case 512: 848 return true; 849 default: 850 return false; 851 } 852 }) 853 .widenScalarToNextPow2(0) 854 // TODO: v3s32->v4s32 with alignment 855 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 856 } 857 858 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 859 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 860 {S32, GlobalPtr, 16, 2 * 8}, 861 {S32, LocalPtr, 8, 8}, 862 {S32, LocalPtr, 16, 16}, 863 {S32, PrivatePtr, 8, 8}, 864 {S32, PrivatePtr, 16, 16}, 865 {S32, ConstantPtr, 8, 8}, 866 {S32, ConstantPtr, 16, 2 * 8}}); 867 if (ST.hasFlatAddressSpace()) { 868 ExtLoads.legalForTypesWithMemDesc( 869 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 870 } 871 872 ExtLoads.clampScalar(0, S32, S32) 873 .widenScalarToNextPow2(0) 874 .unsupportedIfMemSizeNotPow2() 875 .lower(); 876 877 auto &Atomics = getActionDefinitionsBuilder( 878 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 879 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 880 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 881 G_ATOMICRMW_UMIN}) 882 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 883 {S64, GlobalPtr}, {S64, LocalPtr}}); 884 if (ST.hasFlatAddressSpace()) { 885 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 886 } 887 888 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 889 .legalFor({{S32, LocalPtr}}); 890 891 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 892 // demarshalling 893 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 894 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 895 {S32, FlatPtr}, {S64, FlatPtr}}) 896 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 897 {S32, RegionPtr}, {S64, RegionPtr}}); 898 899 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) 900 .lower(); 901 902 // TODO: Pointer types, any 32-bit or 64-bit vector 903 904 // Condition should be s32 for scalar, s1 for vector. 
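  // This mirrors the G_ICMP handling above: uniform (SCC) selects use an s32
  // condition, while divergent (VCC) selects use s1.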
905 getActionDefinitionsBuilder(G_SELECT) 906 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 907 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 908 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 909 .clampScalar(0, S16, S64) 910 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 911 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 912 .scalarize(1) 913 .clampMaxNumElements(0, S32, 2) 914 .clampMaxNumElements(0, LocalPtr, 2) 915 .clampMaxNumElements(0, PrivatePtr, 2) 916 .scalarize(0) 917 .widenScalarToNextPow2(0) 918 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 919 920 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 921 // be more flexible with the shift amount type. 922 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 923 .legalFor({{S32, S32}, {S64, S32}}); 924 if (ST.has16BitInsts()) { 925 if (ST.hasVOP3PInsts()) { 926 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 927 .clampMaxNumElements(0, S16, 2); 928 } else 929 Shifts.legalFor({{S16, S32}, {S16, S16}}); 930 931 Shifts.clampScalar(1, S16, S32); 932 Shifts.clampScalar(0, S16, S64); 933 Shifts.widenScalarToNextPow2(0, 16); 934 } else { 935 // Make sure we legalize the shift amount type first, as the general 936 // expansion for the shifted type will produce much worse code if it hasn't 937 // been truncated already. 938 Shifts.clampScalar(1, S32, S32); 939 Shifts.clampScalar(0, S32, S64); 940 Shifts.widenScalarToNextPow2(0, 32); 941 } 942 Shifts.scalarize(0); 943 944 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 945 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 946 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 947 unsigned IdxTypeIdx = 2; 948 949 getActionDefinitionsBuilder(Op) 950 .customIf([=](const LegalityQuery &Query) { 951 const LLT EltTy = Query.Types[EltTypeIdx]; 952 const LLT VecTy = Query.Types[VecTypeIdx]; 953 const LLT IdxTy = Query.Types[IdxTypeIdx]; 954 return (EltTy.getSizeInBits() == 16 || 955 EltTy.getSizeInBits() % 32 == 0) && 956 VecTy.getSizeInBits() % 32 == 0 && 957 VecTy.getSizeInBits() <= 1024 && 958 IdxTy.getSizeInBits() == 32; 959 }) 960 .clampScalar(EltTypeIdx, S32, S64) 961 .clampScalar(VecTypeIdx, S32, S64) 962 .clampScalar(IdxTypeIdx, S32, S32); 963 } 964 965 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 966 .unsupportedIf([=](const LegalityQuery &Query) { 967 const LLT &EltTy = Query.Types[1].getElementType(); 968 return Query.Types[0] != EltTy; 969 }); 970 971 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 972 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 973 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 974 975 // FIXME: Doesn't handle extract of illegal sizes. 976 getActionDefinitionsBuilder(Op) 977 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 978 // FIXME: Multiples of 16 should not be legal. 
979 .legalIf([=](const LegalityQuery &Query) { 980 const LLT BigTy = Query.Types[BigTyIdx]; 981 const LLT LitTy = Query.Types[LitTyIdx]; 982 return (BigTy.getSizeInBits() % 32 == 0) && 983 (LitTy.getSizeInBits() % 16 == 0); 984 }) 985 .widenScalarIf( 986 [=](const LegalityQuery &Query) { 987 const LLT BigTy = Query.Types[BigTyIdx]; 988 return (BigTy.getScalarSizeInBits() < 16); 989 }, 990 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 991 .widenScalarIf( 992 [=](const LegalityQuery &Query) { 993 const LLT LitTy = Query.Types[LitTyIdx]; 994 return (LitTy.getScalarSizeInBits() < 16); 995 }, 996 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 997 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 998 .widenScalarToNextPow2(BigTyIdx, 32); 999 1000 } 1001 1002 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1003 .legalForCartesianProduct(AllS32Vectors, {S32}) 1004 .legalForCartesianProduct(AllS64Vectors, {S64}) 1005 .clampNumElements(0, V16S32, V32S32) 1006 .clampNumElements(0, V2S64, V16S64) 1007 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1008 1009 if (ST.hasScalarPackInsts()) 1010 BuildVector.legalFor({V2S16, S32}); 1011 1012 BuildVector 1013 .minScalarSameAs(1, 0) 1014 .legalIf(isRegisterType(0)) 1015 .minScalarOrElt(0, S32); 1016 1017 if (ST.hasScalarPackInsts()) { 1018 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1019 .legalFor({V2S16, S32}) 1020 .lower(); 1021 } else { 1022 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1023 .lower(); 1024 } 1025 1026 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1027 .legalIf(isRegisterType(0)); 1028 1029 // TODO: Don't fully scalarize v2s16 pieces 1030 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1031 1032 // Merge/Unmerge 1033 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1034 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1035 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1036 1037 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1038 const LLT &Ty = Query.Types[TypeIdx]; 1039 if (Ty.isVector()) { 1040 const LLT &EltTy = Ty.getElementType(); 1041 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 1042 return true; 1043 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1044 return true; 1045 } 1046 return false; 1047 }; 1048 1049 auto &Builder = getActionDefinitionsBuilder(Op) 1050 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1051 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1052 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1053 // valid. 1054 .clampScalar(LitTyIdx, S16, S256) 1055 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1056 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1057 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1058 elementTypeIs(1, S16)), 1059 changeTo(1, V2S16)) 1060 // Break up vectors with weird elements into scalars 1061 .fewerElementsIf( 1062 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 1063 scalarize(0)) 1064 .fewerElementsIf( 1065 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 1066 scalarize(1)) 1067 .clampScalar(BigTyIdx, S32, S1024) 1068 .lowerFor({{S16, V2S16}}); 1069 1070 if (Op == G_MERGE_VALUES) { 1071 Builder.widenScalarIf( 1072 // TODO: Use 16-bit shifts if legal for 8-bit values? 
1073 [=](const LegalityQuery &Query) { 1074 const LLT Ty = Query.Types[LitTyIdx]; 1075 return Ty.getSizeInBits() < 32; 1076 }, 1077 changeTo(LitTyIdx, S32)); 1078 } 1079 1080 Builder.widenScalarIf( 1081 [=](const LegalityQuery &Query) { 1082 const LLT Ty = Query.Types[BigTyIdx]; 1083 return !isPowerOf2_32(Ty.getSizeInBits()) && 1084 Ty.getSizeInBits() % 16 != 0; 1085 }, 1086 [=](const LegalityQuery &Query) { 1087 // Pick the next power of 2, or a multiple of 64 over 128. 1088 // Whichever is smaller. 1089 const LLT &Ty = Query.Types[BigTyIdx]; 1090 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1091 if (NewSizeInBits >= 256) { 1092 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1093 if (RoundedTo < NewSizeInBits) 1094 NewSizeInBits = RoundedTo; 1095 } 1096 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1097 }) 1098 .legalIf([=](const LegalityQuery &Query) { 1099 const LLT &BigTy = Query.Types[BigTyIdx]; 1100 const LLT &LitTy = Query.Types[LitTyIdx]; 1101 1102 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1103 return false; 1104 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1105 return false; 1106 1107 return BigTy.getSizeInBits() % 16 == 0 && 1108 LitTy.getSizeInBits() % 16 == 0 && 1109 BigTy.getSizeInBits() <= 1024; 1110 }) 1111 // Any vectors left are the wrong size. Scalarize them. 1112 .scalarize(0) 1113 .scalarize(1); 1114 } 1115 1116 getActionDefinitionsBuilder(G_SEXT_INREG).lower(); 1117 1118 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1119 .legalFor({S64}); 1120 1121 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1122 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1123 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1124 .unsupported(); 1125 1126 computeTables(); 1127 verify(*ST.getInstrInfo()); 1128 } 1129 1130 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1131 MachineRegisterInfo &MRI, 1132 MachineIRBuilder &B, 1133 GISelChangeObserver &Observer) const { 1134 switch (MI.getOpcode()) { 1135 case TargetOpcode::G_ADDRSPACE_CAST: 1136 return legalizeAddrSpaceCast(MI, MRI, B); 1137 case TargetOpcode::G_FRINT: 1138 return legalizeFrint(MI, MRI, B); 1139 case TargetOpcode::G_FCEIL: 1140 return legalizeFceil(MI, MRI, B); 1141 case TargetOpcode::G_INTRINSIC_TRUNC: 1142 return legalizeIntrinsicTrunc(MI, MRI, B); 1143 case TargetOpcode::G_SITOFP: 1144 return legalizeITOFP(MI, MRI, B, true); 1145 case TargetOpcode::G_UITOFP: 1146 return legalizeITOFP(MI, MRI, B, false); 1147 case TargetOpcode::G_FMINNUM: 1148 case TargetOpcode::G_FMAXNUM: 1149 case TargetOpcode::G_FMINNUM_IEEE: 1150 case TargetOpcode::G_FMAXNUM_IEEE: 1151 return legalizeMinNumMaxNum(MI, MRI, B); 1152 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1153 return legalizeExtractVectorElt(MI, MRI, B); 1154 case TargetOpcode::G_INSERT_VECTOR_ELT: 1155 return legalizeInsertVectorElt(MI, MRI, B); 1156 case TargetOpcode::G_FSIN: 1157 case TargetOpcode::G_FCOS: 1158 return legalizeSinCos(MI, MRI, B); 1159 case TargetOpcode::G_GLOBAL_VALUE: 1160 return legalizeGlobalValue(MI, MRI, B); 1161 case TargetOpcode::G_LOAD: 1162 return legalizeLoad(MI, MRI, B, Observer); 1163 case TargetOpcode::G_FMAD: 1164 return legalizeFMad(MI, MRI, B); 1165 case TargetOpcode::G_FDIV: 1166 return legalizeFDIV(MI, MRI, B); 1167 case TargetOpcode::G_ATOMIC_CMPXCHG: 1168 return legalizeAtomicCmpXChg(MI, MRI, B); 1169 default: 1170 return false; 1171 } 1172 1173 llvm_unreachable("expected switch to return"); 1174 } 1175 1176 Register AMDGPULegalizerInfo::getSegmentAperture( 
1177 unsigned AS, 1178 MachineRegisterInfo &MRI, 1179 MachineIRBuilder &B) const { 1180 MachineFunction &MF = B.getMF(); 1181 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1182 const LLT S32 = LLT::scalar(32); 1183 1184 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1185 1186 if (ST.hasApertureRegs()) { 1187 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1188 // getreg. 1189 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1190 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1191 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1192 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1193 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1194 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1195 unsigned Encoding = 1196 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1197 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1198 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1199 1200 Register ApertureReg = MRI.createGenericVirtualRegister(S32); 1201 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1202 1203 B.buildInstr(AMDGPU::S_GETREG_B32) 1204 .addDef(GetReg) 1205 .addImm(Encoding); 1206 MRI.setType(GetReg, S32); 1207 1208 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1209 B.buildInstr(TargetOpcode::G_SHL) 1210 .addDef(ApertureReg) 1211 .addUse(GetReg) 1212 .addUse(ShiftAmt.getReg(0)); 1213 1214 return ApertureReg; 1215 } 1216 1217 Register QueuePtr = MRI.createGenericVirtualRegister( 1218 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1219 1220 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1221 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1222 return Register(); 1223 1224 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1225 // private_segment_aperture_base_hi. 1226 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1227 1228 // TODO: can we be smarter about machine pointer info? 1229 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1230 MachineMemOperand *MMO = MF.getMachineMemOperand( 1231 PtrInfo, 1232 MachineMemOperand::MOLoad | 1233 MachineMemOperand::MODereferenceable | 1234 MachineMemOperand::MOInvariant, 1235 4, 1236 MinAlign(64, StructOffset)); 1237 1238 Register LoadResult = MRI.createGenericVirtualRegister(S32); 1239 Register LoadAddr; 1240 1241 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1242 B.buildLoad(LoadResult, LoadAddr, *MMO); 1243 return LoadResult; 1244 } 1245 1246 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1247 MachineInstr &MI, MachineRegisterInfo &MRI, 1248 MachineIRBuilder &B) const { 1249 MachineFunction &MF = B.getMF(); 1250 1251 B.setInstr(MI); 1252 1253 const LLT S32 = LLT::scalar(32); 1254 Register Dst = MI.getOperand(0).getReg(); 1255 Register Src = MI.getOperand(1).getReg(); 1256 1257 LLT DstTy = MRI.getType(Dst); 1258 LLT SrcTy = MRI.getType(Src); 1259 unsigned DestAS = DstTy.getAddressSpace(); 1260 unsigned SrcAS = SrcTy.getAddressSpace(); 1261 1262 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1263 // vector element. 1264 assert(!DstTy.isVector()); 1265 1266 const AMDGPUTargetMachine &TM 1267 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1268 1269 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1270 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1271 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1272 return true; 1273 } 1274 1275 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1276 // Truncate. 
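    // Casting a 64-bit pointer to the 32-bit constant address space keeps only
    // the low 32 bits.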
1277 B.buildExtract(Dst, Src, 0); 1278 MI.eraseFromParent(); 1279 return true; 1280 } 1281 1282 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1283 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1284 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1285 1286 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1287 // another. Merge operands are required to be the same type, but creating an 1288 // extra ptrtoint would be kind of pointless. 1289 auto HighAddr = B.buildConstant( 1290 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1291 B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); 1292 MI.eraseFromParent(); 1293 return true; 1294 } 1295 1296 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1297 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1298 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1299 unsigned NullVal = TM.getNullPointerValue(DestAS); 1300 1301 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1302 auto FlatNull = B.buildConstant(SrcTy, 0); 1303 1304 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); 1305 1306 // Extract low 32-bits of the pointer. 1307 B.buildExtract(PtrLo32, Src, 0); 1308 1309 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1310 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); 1311 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1312 1313 MI.eraseFromParent(); 1314 return true; 1315 } 1316 1317 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1318 return false; 1319 1320 if (!ST.hasFlatAddressSpace()) 1321 return false; 1322 1323 auto SegmentNull = 1324 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1325 auto FlatNull = 1326 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1327 1328 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1329 if (!ApertureReg.isValid()) 1330 return false; 1331 1332 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1333 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); 1334 1335 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); 1336 1337 // Coerce the type of the low half of the result so we can use merge_values. 1338 Register SrcAsInt = MRI.createGenericVirtualRegister(S32); 1339 B.buildInstr(TargetOpcode::G_PTRTOINT) 1340 .addDef(SrcAsInt) 1341 .addUse(Src); 1342 1343 // TODO: Should we allow mismatched types but matching sizes in merges to 1344 // avoid the ptrtoint? 1345 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); 1346 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); 1347 1348 MI.eraseFromParent(); 1349 return true; 1350 } 1351 1352 bool AMDGPULegalizerInfo::legalizeFrint( 1353 MachineInstr &MI, MachineRegisterInfo &MRI, 1354 MachineIRBuilder &B) const { 1355 B.setInstr(MI); 1356 1357 Register Src = MI.getOperand(1).getReg(); 1358 LLT Ty = MRI.getType(Src); 1359 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1360 1361 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1362 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1363 1364 auto C1 = B.buildFConstant(Ty, C1Val); 1365 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1366 1367 // TODO: Should this propagate fast-math-flags? 
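  // Adding and then subtracting copysign(2^52, Src) rounds away the fractional
  // bits in the current rounding mode; any |Src| greater than C2 (the largest
  // double below 2^52) is already an integer and is selected through unchanged.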
1368 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1369 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1370 1371 auto C2 = B.buildFConstant(Ty, C2Val); 1372 auto Fabs = B.buildFAbs(Ty, Src); 1373 1374 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1375 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1376 return true; 1377 } 1378 1379 bool AMDGPULegalizerInfo::legalizeFceil( 1380 MachineInstr &MI, MachineRegisterInfo &MRI, 1381 MachineIRBuilder &B) const { 1382 B.setInstr(MI); 1383 1384 const LLT S1 = LLT::scalar(1); 1385 const LLT S64 = LLT::scalar(64); 1386 1387 Register Src = MI.getOperand(1).getReg(); 1388 assert(MRI.getType(Src) == S64); 1389 1390 // result = trunc(src) 1391 // if (src > 0.0 && src != result) 1392 // result += 1.0 1393 1394 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); 1395 1396 const auto Zero = B.buildFConstant(S64, 0.0); 1397 const auto One = B.buildFConstant(S64, 1.0); 1398 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1399 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1400 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1401 auto Add = B.buildSelect(S64, And, One, Zero); 1402 1403 // TODO: Should this propagate fast-math-flags? 1404 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1405 return true; 1406 } 1407 1408 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1409 MachineIRBuilder &B) { 1410 const unsigned FractBits = 52; 1411 const unsigned ExpBits = 11; 1412 LLT S32 = LLT::scalar(32); 1413 1414 auto Const0 = B.buildConstant(S32, FractBits - 32); 1415 auto Const1 = B.buildConstant(S32, ExpBits); 1416 1417 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1418 .addUse(Const0.getReg(0)) 1419 .addUse(Const1.getReg(0)); 1420 1421 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1422 } 1423 1424 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1425 MachineInstr &MI, MachineRegisterInfo &MRI, 1426 MachineIRBuilder &B) const { 1427 B.setInstr(MI); 1428 1429 const LLT S1 = LLT::scalar(1); 1430 const LLT S32 = LLT::scalar(32); 1431 const LLT S64 = LLT::scalar(64); 1432 1433 Register Src = MI.getOperand(1).getReg(); 1434 assert(MRI.getType(Src) == S64); 1435 1436 // TODO: Should this use extract since the low half is unused? 1437 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1438 Register Hi = Unmerge.getReg(1); 1439 1440 // Extract the upper half, since this is where we will find the sign and 1441 // exponent. 1442 auto Exp = extractF64Exponent(Hi, B); 1443 1444 const unsigned FractBits = 52; 1445 1446 // Extract the sign bit. 1447 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1448 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1449 1450 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1451 1452 const auto Zero32 = B.buildConstant(S32, 0); 1453 1454 // Extend back to 64-bits. 
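  // The merge places a zero low half under SignBit, putting the sign back at
  // bit 63.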
1455 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); 1456 1457 auto Shr = B.buildAShr(S64, FractMask, Exp); 1458 auto Not = B.buildNot(S64, Shr); 1459 auto Tmp0 = B.buildAnd(S64, Src, Not); 1460 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1461 1462 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1463 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1464 1465 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1466 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1467 return true; 1468 } 1469 1470 bool AMDGPULegalizerInfo::legalizeITOFP( 1471 MachineInstr &MI, MachineRegisterInfo &MRI, 1472 MachineIRBuilder &B, bool Signed) const { 1473 B.setInstr(MI); 1474 1475 Register Dst = MI.getOperand(0).getReg(); 1476 Register Src = MI.getOperand(1).getReg(); 1477 1478 const LLT S64 = LLT::scalar(64); 1479 const LLT S32 = LLT::scalar(32); 1480 1481 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1482 1483 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1484 1485 auto CvtHi = Signed ? 1486 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1487 B.buildUITOFP(S64, Unmerge.getReg(1)); 1488 1489 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1490 1491 auto ThirtyTwo = B.buildConstant(S32, 32); 1492 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1493 .addUse(CvtHi.getReg(0)) 1494 .addUse(ThirtyTwo.getReg(0)); 1495 1496 // TODO: Should this propagate fast-math-flags? 1497 B.buildFAdd(Dst, LdExp, CvtLo); 1498 MI.eraseFromParent(); 1499 return true; 1500 } 1501 1502 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1503 MachineInstr &MI, MachineRegisterInfo &MRI, 1504 MachineIRBuilder &B) const { 1505 MachineFunction &MF = B.getMF(); 1506 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1507 1508 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1509 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1510 1511 // With ieee_mode disabled, the instructions have the correct behavior 1512 // already for G_FMINNUM/G_FMAXNUM 1513 if (!MFI->getMode().IEEE) 1514 return !IsIEEEOp; 1515 1516 if (IsIEEEOp) 1517 return true; 1518 1519 MachineIRBuilder HelperBuilder(MI); 1520 GISelObserverWrapper DummyObserver; 1521 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1522 HelperBuilder.setInstr(MI); 1523 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1524 } 1525 1526 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1527 MachineInstr &MI, MachineRegisterInfo &MRI, 1528 MachineIRBuilder &B) const { 1529 // TODO: Should move some of this into LegalizerHelper. 1530 1531 // TODO: Promote dynamic indexing of s16 to s32 1532 // TODO: Dynamic s64 indexing is only legal for SGPR. 1533 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 1534 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1535 return true; 1536 1537 Register Dst = MI.getOperand(0).getReg(); 1538 Register Vec = MI.getOperand(1).getReg(); 1539 1540 LLT VecTy = MRI.getType(Vec); 1541 LLT EltTy = VecTy.getElementType(); 1542 assert(EltTy == MRI.getType(Dst)); 1543 1544 B.setInstr(MI); 1545 1546 if (IdxVal.getValue() < VecTy.getNumElements()) 1547 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); 1548 else 1549 B.buildUndef(Dst); 1550 1551 MI.eraseFromParent(); 1552 return true; 1553 } 1554 1555 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1556 MachineInstr &MI, MachineRegisterInfo &MRI, 1557 MachineIRBuilder &B) const { 1558 // TODO: Should move some of this into LegalizerHelper. 1559 1560 // TODO: Promote dynamic indexing of s16 to s32 1561 // TODO: Dynamic s64 indexing is only legal for SGPR. 1562 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); 1563 if (!IdxVal) // Dynamic case will be selected to register indexing. 1564 return true; 1565 1566 Register Dst = MI.getOperand(0).getReg(); 1567 Register Vec = MI.getOperand(1).getReg(); 1568 Register Ins = MI.getOperand(2).getReg(); 1569 1570 LLT VecTy = MRI.getType(Vec); 1571 LLT EltTy = VecTy.getElementType(); 1572 assert(EltTy == MRI.getType(Ins)); 1573 1574 B.setInstr(MI); 1575 1576 if (IdxVal.getValue() < VecTy.getNumElements()) 1577 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); 1578 else 1579 B.buildUndef(Dst); 1580 1581 MI.eraseFromParent(); 1582 return true; 1583 } 1584 1585 bool AMDGPULegalizerInfo::legalizeSinCos( 1586 MachineInstr &MI, MachineRegisterInfo &MRI, 1587 MachineIRBuilder &B) const { 1588 B.setInstr(MI); 1589 1590 Register DstReg = MI.getOperand(0).getReg(); 1591 Register SrcReg = MI.getOperand(1).getReg(); 1592 LLT Ty = MRI.getType(DstReg); 1593 unsigned Flags = MI.getFlags(); 1594 1595 Register TrigVal; 1596 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1597 if (ST.hasTrigReducedRange()) { 1598 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1599 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1600 .addUse(MulVal.getReg(0)) 1601 .setMIFlags(Flags).getReg(0); 1602 } else 1603 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1604 1605 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1606 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1607 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1608 .addUse(TrigVal) 1609 .setMIFlags(Flags); 1610 MI.eraseFromParent(); 1611 return true; 1612 } 1613 1614 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1615 Register DstReg, LLT PtrTy, 1616 MachineIRBuilder &B, const GlobalValue *GV, 1617 unsigned Offset, unsigned GAFlags) const { 1618 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1619 // to the following code sequence: 1620 // 1621 // For constant address space: 1622 // s_getpc_b64 s[0:1] 1623 // s_add_u32 s0, s0, $symbol 1624 // s_addc_u32 s1, s1, 0 1625 // 1626 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1627 // a fixup or relocation is emitted to replace $symbol with a literal 1628 // constant, which is a pc-relative offset from the encoding of the $symbol 1629 // operand to the global variable. 
1630 // 1631 // For global address space: 1632 // s_getpc_b64 s[0:1] 1633 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1634 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1635 // 1636 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1637 // fixups or relocations are emitted to replace $symbol@*@lo and 1638 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1639 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1640 // operand to the global variable. 1641 // 1642 // What we want here is an offset from the value returned by s_getpc 1643 // (which is the address of the s_add_u32 instruction) to the global 1644 // variable, but since the encoding of $symbol starts 4 bytes after the start 1645 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1646 // small. This requires us to add 4 to the global variable offset in order to 1647 // compute the correct address. 1648 1649 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1650 1651 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1652 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1653 1654 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1655 .addDef(PCReg); 1656 1657 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1658 if (GAFlags == SIInstrInfo::MO_NONE) 1659 MIB.addImm(0); 1660 else 1661 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1662 1663 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1664 1665 if (PtrTy.getSizeInBits() == 32) 1666 B.buildExtract(DstReg, PCReg, 0); 1667 return true; 1668 } 1669 1670 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1671 MachineInstr &MI, MachineRegisterInfo &MRI, 1672 MachineIRBuilder &B) const { 1673 Register DstReg = MI.getOperand(0).getReg(); 1674 LLT Ty = MRI.getType(DstReg); 1675 unsigned AS = Ty.getAddressSpace(); 1676 1677 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1678 MachineFunction &MF = B.getMF(); 1679 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1680 B.setInstr(MI); 1681 1682 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1683 if (!MFI->isEntryFunction()) { 1684 const Function &Fn = MF.getFunction(); 1685 DiagnosticInfoUnsupported BadLDSDecl( 1686 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1687 Fn.getContext().diagnose(BadLDSDecl); 1688 } 1689 1690 // TODO: We could emit code to handle the initialization somewhere. 
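    // An LDS global without a defined initializer is materialized as its
    // constant byte offset within the kernel's group-segment (LDS) allocation.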
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
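  // G_FMAD stays legal only when denormals are flushed for the result type;
  // otherwise it is expanded by LegalizerHelper::lowerFMad below, roughly
  //   %d = G_FMAD %a, %b, %c  -->  %t = G_FMUL %a, %b ; %d = G_FADD %t, %c
  // (illustrative expansion; register names are made up).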
  if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(SITargetLowering::isFlatGlobalAddrSpace(
           MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

  B.setInstr(MI);
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
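  // If no copy from the physical argument register exists yet, materialize it
  // once at the top of the entry block, then restore the original insertion
  // point.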
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT ResTy = MRI.getType(Res);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  const MachineFunction &MF = B.getMF();
  bool Unsafe =
    MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);

  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
    return false;

  if (!Unsafe && ResTy == S32 &&
      MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  if (Unsafe) {
    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
      .addUse(RHS)
      .setMIFlags(Flags);
    B.buildFMul(Res, LHS, RCP, Flags);

    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))
    .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode; when 'Enable' is false, disable it.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
                                   ? FP_DENORM_FLUSH_NONE
                                   : FP_DENORM_FLUSH_IN_FLUSH_OUT;

    unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);
  } else {
    // Select FP32 bit field in mode register.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}

bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(RHS)
      .addUse(LHS)
      .addImm(1)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
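  // Overview of the expansion below: both operands are scaled with
  // amdgcn.div.scale, an approximate reciprocal of the scaled denominator is
  // refined through a chain of FMAs (Newton-Raphson style), and
  // amdgcn.div.fmas / amdgcn.div.fixup assemble the final quotient. When FP32
  // denormals are off, denorm mode is temporarily enabled around the FMA
  // chain and restored afterwards.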
  if (!Mode.FP32Denormals)
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.FP32Denormals)
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.
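    // Recover the flag div_fmas expects by hand: compare the high 32-bit
    // halves of the original operands against the high halves of the two
    // div_scale results, and XOR the two compares.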
    Scale = MRI.createGenericVirtualRegister(S1);

    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    B.buildXor(Scale, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
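  // is.shared / is.private test which segment a flat pointer falls in by
  // comparing the pointer's high 32 bits against the segment's aperture base.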
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where it is unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
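  // For amdgcn.if/else/loop, verifyCFIntrinsic requires the intrinsic's i1
  // result to have a single use as the condition of a G_BRCOND in the same
  // block; the intrinsic and that branch are then replaced together by a
  // single SI_IF / SI_ELSE / SI_LOOP pseudo targeting the same successor.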
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrCond->getOperand(1).getMBB());
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrCond->getOperand(1).getMBB())
          .addImm(0);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
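    // The wavefront size is a fixed subtarget property, so the intrinsic
    // folds to an immediate.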
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
  default:
    return true;
  }

  return true;
}