//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), })
    .lower();


  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});


  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S32, LLT::scalar(24)}})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // TODO: Split s1->s64 during regbankselect for VALU.
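  // Type index 0 is the FP result and index 1 the integer source; the
  // s64 -> s64 case is custom lowered in legalizeITOFP below.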
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
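  // The result (type index 0) is therefore clamped to s32 while the source
  // (type index 1) may remain s64.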
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
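    // Each mem-desc entry below is {result type, pointer type, memory size in
    // bits, minimum alignment in bits}; an alignment value of 0 (unaligned
    // buffer access available) effectively places no restriction.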
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
    .lower();

  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
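  // Type index 0 is the selected value and type index 1 the condition; the
  // condition is scalarized independently of the result.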
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    // TODO: Support 16-bit shift amounts
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  default:
    return false;
  }
switch to return"); 1180 } 1181 1182 Register AMDGPULegalizerInfo::getSegmentAperture( 1183 unsigned AS, 1184 MachineRegisterInfo &MRI, 1185 MachineIRBuilder &B) const { 1186 MachineFunction &MF = B.getMF(); 1187 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1188 const LLT S32 = LLT::scalar(32); 1189 1190 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1191 1192 if (ST.hasApertureRegs()) { 1193 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1194 // getreg. 1195 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1196 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1197 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1198 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1199 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1200 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1201 unsigned Encoding = 1202 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1203 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1204 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1205 1206 Register ApertureReg = MRI.createGenericVirtualRegister(S32); 1207 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1208 1209 B.buildInstr(AMDGPU::S_GETREG_B32) 1210 .addDef(GetReg) 1211 .addImm(Encoding); 1212 MRI.setType(GetReg, S32); 1213 1214 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1215 B.buildInstr(TargetOpcode::G_SHL) 1216 .addDef(ApertureReg) 1217 .addUse(GetReg) 1218 .addUse(ShiftAmt.getReg(0)); 1219 1220 return ApertureReg; 1221 } 1222 1223 Register QueuePtr = MRI.createGenericVirtualRegister( 1224 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1225 1226 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1227 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1228 return Register(); 1229 1230 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1231 // private_segment_aperture_base_hi. 1232 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1233 1234 // TODO: can we be smarter about machine pointer info? 1235 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1236 MachineMemOperand *MMO = MF.getMachineMemOperand( 1237 PtrInfo, 1238 MachineMemOperand::MOLoad | 1239 MachineMemOperand::MODereferenceable | 1240 MachineMemOperand::MOInvariant, 1241 4, 1242 MinAlign(64, StructOffset)); 1243 1244 Register LoadResult = MRI.createGenericVirtualRegister(S32); 1245 Register LoadAddr; 1246 1247 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1248 B.buildLoad(LoadResult, LoadAddr, *MMO); 1249 return LoadResult; 1250 } 1251 1252 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1253 MachineInstr &MI, MachineRegisterInfo &MRI, 1254 MachineIRBuilder &B) const { 1255 MachineFunction &MF = B.getMF(); 1256 1257 B.setInstr(MI); 1258 1259 const LLT S32 = LLT::scalar(32); 1260 Register Dst = MI.getOperand(0).getReg(); 1261 Register Src = MI.getOperand(1).getReg(); 1262 1263 LLT DstTy = MRI.getType(Dst); 1264 LLT SrcTy = MRI.getType(Src); 1265 unsigned DestAS = DstTy.getAddressSpace(); 1266 unsigned SrcAS = SrcTy.getAddressSpace(); 1267 1268 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1269 // vector element. 
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
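  // (Bit 31 of the high 32-bit half of the f64 value.)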
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
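    // An LDS global without an initializer is lowered to the constant byte
    // offset of its allocation within the kernel's LDS.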
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
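  // Rationale (informal): G_FMAD corresponds to the hardware mad-style
  // instructions, which do not respect denormals, so it is only kept legal
  // when the matching denormal mode is already flushing. Roughly:
  //
  //   s32 G_FMAD legal  iff  FP32 denormals are disabled
  //   s16 G_FMAD legal  iff  FP64/FP16 denormals are disabled
  //
  // Otherwise it is expanded through LegalizerHelper::lowerFMad() below.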
  if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(SITargetLowering::isFlatGlobalAddrSpace(
           MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

  B.setInstr(MI);
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineInstr *&Br) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  if (UseMI.getParent() != MI.getParent() ||
      UseMI.getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR
  MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
  if (Next != MI.getParent()->end()) {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
  }

  return &UseMI;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
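  // If the live-in vreg has no def yet, a copy from the physical argument
  // register is materialized once at the top of the entry block, roughly
  // (register names illustrative only):
  //
  //   bb.0.entry:
  //     liveins: $sgpr12
  //     %livein:_(s32) = COPY $sgpr12
  //
  // so later uses in any block can simply read the virtual register.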
1873 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 1874 if (!MRI.getVRegDef(LiveIn)) { 1875 // FIXME: Should have scoped insert pt 1876 MachineBasicBlock &OrigInsBB = B.getMBB(); 1877 auto OrigInsPt = B.getInsertPt(); 1878 1879 MachineBasicBlock &EntryMBB = B.getMF().front(); 1880 EntryMBB.addLiveIn(Arg->getRegister()); 1881 B.setInsertPt(EntryMBB, EntryMBB.begin()); 1882 B.buildCopy(LiveIn, Arg->getRegister()); 1883 1884 B.setInsertPt(OrigInsBB, OrigInsPt); 1885 } 1886 1887 return true; 1888 } 1889 1890 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 1891 MachineInstr &MI, 1892 MachineRegisterInfo &MRI, 1893 MachineIRBuilder &B, 1894 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 1895 B.setInstr(MI); 1896 1897 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 1898 1899 const ArgDescriptor *Arg; 1900 const TargetRegisterClass *RC; 1901 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 1902 if (!Arg) { 1903 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 1904 return false; 1905 } 1906 1907 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 1908 MI.eraseFromParent(); 1909 return true; 1910 } 1911 1912 return false; 1913 } 1914 1915 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 1916 MachineRegisterInfo &MRI, 1917 MachineIRBuilder &B) const { 1918 B.setInstr(MI); 1919 Register Dst = MI.getOperand(0).getReg(); 1920 LLT DstTy = MRI.getType(Dst); 1921 LLT S16 = LLT::scalar(16); 1922 LLT S32 = LLT::scalar(32); 1923 LLT S64 = LLT::scalar(64); 1924 1925 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 1926 return true; 1927 1928 if (DstTy == S16) 1929 return legalizeFDIV16(MI, MRI, B); 1930 if (DstTy == S32) 1931 return legalizeFDIV32(MI, MRI, B); 1932 if (DstTy == S64) 1933 return legalizeFDIV64(MI, MRI, B); 1934 1935 return false; 1936 } 1937 1938 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 1939 MachineRegisterInfo &MRI, 1940 MachineIRBuilder &B) const { 1941 Register Res = MI.getOperand(0).getReg(); 1942 Register LHS = MI.getOperand(1).getReg(); 1943 Register RHS = MI.getOperand(2).getReg(); 1944 1945 uint16_t Flags = MI.getFlags(); 1946 1947 LLT ResTy = MRI.getType(Res); 1948 LLT S32 = LLT::scalar(32); 1949 LLT S64 = LLT::scalar(64); 1950 1951 const MachineFunction &MF = B.getMF(); 1952 bool Unsafe = 1953 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 1954 1955 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 1956 return false; 1957 1958 if (!Unsafe && ResTy == S32 && 1959 MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals) 1960 return false; 1961 1962 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 1963 // 1 / x -> RCP(x) 1964 if (CLHS->isExactlyValue(1.0)) { 1965 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 1966 .addUse(RHS) 1967 .setMIFlags(Flags); 1968 1969 MI.eraseFromParent(); 1970 return true; 1971 } 1972 1973 // -1 / x -> RCP( FNEG(x) ) 1974 if (CLHS->isExactlyValue(-1.0)) { 1975 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 1976 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 1977 .addUse(FNeg.getReg(0)) 1978 .setMIFlags(Flags); 1979 1980 MI.eraseFromParent(); 1981 return true; 1982 } 1983 } 1984 1985 // x / y -> x * (1.0 / y) 1986 if (Unsafe) { 1987 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 1988 .addUse(RHS) 1989 .setMIFlags(Flags); 1990 B.buildFMul(Res, LHS, RCP, Flags); 1991 1992 MI.eraseFromParent(); 1993 return true; 1994 } 1995 1996 return false; 1997 } 1998 1999 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2000 MachineRegisterInfo &MRI, 2001 MachineIRBuilder &B) const { 2002 B.setInstr(MI); 2003 Register Res = MI.getOperand(0).getReg(); 2004 Register LHS = MI.getOperand(1).getReg(); 2005 Register RHS = MI.getOperand(2).getReg(); 2006 2007 uint16_t Flags = MI.getFlags(); 2008 2009 LLT S16 = LLT::scalar(16); 2010 LLT S32 = LLT::scalar(32); 2011 2012 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2013 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2014 2015 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2016 .addUse(RHSExt.getReg(0)) 2017 .setMIFlags(Flags); 2018 2019 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2020 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2021 2022 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2023 .addUse(RDst.getReg(0)) 2024 .addUse(RHS) 2025 .addUse(LHS) 2026 .setMIFlags(Flags); 2027 2028 MI.eraseFromParent(); 2029 return true; 2030 } 2031 2032 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2033 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2034 static void toggleSPDenormMode(bool Enable, 2035 MachineIRBuilder &B, 2036 const GCNSubtarget &ST, 2037 AMDGPU::SIModeRegisterDefaults Mode) { 2038 // Set SP denorm mode to this value. 2039 unsigned SPDenormMode = 2040 Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; 2041 2042 if (ST.hasDenormModeInst()) { 2043 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2044 unsigned DPDenormModeDefault = Mode.FP64FP16Denormals 2045 ? FP_DENORM_FLUSH_NONE 2046 : FP_DENORM_FLUSH_IN_FLUSH_OUT; 2047 2048 unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2049 B.buildInstr(AMDGPU::S_DENORM_MODE) 2050 .addImm(NewDenormModeValue); 2051 2052 } else { 2053 // Select FP32 bit field in mode register. 2054 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2055 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2056 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2057 2058 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2059 .addImm(SPDenormMode) 2060 .addImm(SPDenormModeBitField); 2061 } 2062 } 2063 2064 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2065 MachineRegisterInfo &MRI, 2066 MachineIRBuilder &B) const { 2067 B.setInstr(MI); 2068 Register Res = MI.getOperand(0).getReg(); 2069 Register LHS = MI.getOperand(1).getReg(); 2070 Register RHS = MI.getOperand(2).getReg(); 2071 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2072 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2073 2074 uint16_t Flags = MI.getFlags(); 2075 2076 LLT S32 = LLT::scalar(32); 2077 LLT S1 = LLT::scalar(1); 2078 2079 auto One = B.buildFConstant(S32, 1.0f); 2080 2081 auto DenominatorScaled = 2082 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2083 .addUse(RHS) 2084 .addUse(LHS) 2085 .addImm(1) 2086 .setMIFlags(Flags); 2087 auto NumeratorScaled = 2088 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2089 .addUse(LHS) 2090 .addUse(RHS) 2091 .addImm(0) 2092 .setMIFlags(Flags); 2093 2094 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2095 .addUse(DenominatorScaled.getReg(0)) 2096 .setMIFlags(Flags); 2097 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2098 2099 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2100 // aren't modeled as reading it. 
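  // Sketch of the refinement sequence below, with n/d denoting the scaled
  // numerator/denominator (the usual Newton-Raphson style expansion, which
  // requires FP32 denormals to be enabled while it runs):
  //
  //   e   = 1 - d * rcp(d)        // Fma0
  //   r1  = rcp(d) + e * rcp(d)   // Fma1: refined 1/d
  //   q   = n * r1                // Mul
  //   err = n - d * q             // Fma2
  //   q'  = q + err * r1          // Fma3
  //
  // The remaining residual (Fma4) feeds div_fmas/div_fixup for the final
  // correction and special-case handling.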
  if (!Mode.FP32Denormals)
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.FP32Denormals)
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.
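    // The condition bit is reconstructed manually instead: div_scale scales
    // whichever operand would otherwise make the quotient over/underflow, so
    // comparing the high 32 bits of each source against the high halves of the
    // two div_scale results reveals which operand was scaled, roughly:
    //
    //   Scale = (hi(LHS) == hi(DivScale1)) ^ (hi(RHS) == hi(DivScale0))
    //
    // (an informal reading of the sequence below, not a hardware spec quote).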
2176 2177 Scale = MRI.createGenericVirtualRegister(S1); 2178 2179 LLT S32 = LLT::scalar(32); 2180 2181 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2182 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2183 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2184 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2185 2186 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2187 Scale1Unmerge.getReg(1)); 2188 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2189 Scale0Unmerge.getReg(1)); 2190 B.buildXor(Scale, CmpNum, CmpDen); 2191 } else { 2192 Scale = DivScale1.getReg(1); 2193 } 2194 2195 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2196 .addUse(Fma4.getReg(0)) 2197 .addUse(Fma3.getReg(0)) 2198 .addUse(Mul.getReg(0)) 2199 .addUse(Scale) 2200 .setMIFlags(Flags); 2201 2202 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2203 .addUse(Fmas.getReg(0)) 2204 .addUse(RHS) 2205 .addUse(LHS) 2206 .setMIFlags(Flags); 2207 2208 MI.eraseFromParent(); 2209 return true; 2210 } 2211 2212 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2213 MachineRegisterInfo &MRI, 2214 MachineIRBuilder &B) const { 2215 B.setInstr(MI); 2216 Register Res = MI.getOperand(0).getReg(); 2217 Register LHS = MI.getOperand(2).getReg(); 2218 Register RHS = MI.getOperand(3).getReg(); 2219 uint16_t Flags = MI.getFlags(); 2220 2221 LLT S32 = LLT::scalar(32); 2222 LLT S1 = LLT::scalar(1); 2223 2224 auto Abs = B.buildFAbs(S32, RHS, Flags); 2225 const APFloat C0Val(1.0f); 2226 2227 auto C0 = B.buildConstant(S32, 0x6f800000); 2228 auto C1 = B.buildConstant(S32, 0x2f800000); 2229 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2230 2231 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2232 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2233 2234 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2235 2236 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2237 .addUse(Mul0.getReg(0)) 2238 .setMIFlags(Flags); 2239 2240 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2241 2242 B.buildFMul(Res, Sel, Mul1, Flags); 2243 2244 MI.eraseFromParent(); 2245 return true; 2246 } 2247 2248 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2249 MachineRegisterInfo &MRI, 2250 MachineIRBuilder &B) const { 2251 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2252 if (!MFI->isEntryFunction()) { 2253 return legalizePreloadedArgIntrin(MI, MRI, B, 2254 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2255 } 2256 2257 B.setInstr(MI); 2258 2259 uint64_t Offset = 2260 ST.getTargetLowering()->getImplicitParameterOffset( 2261 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2262 Register DstReg = MI.getOperand(0).getReg(); 2263 LLT DstTy = MRI.getType(DstReg); 2264 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2265 2266 const ArgDescriptor *Arg; 2267 const TargetRegisterClass *RC; 2268 std::tie(Arg, RC) 2269 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2270 if (!Arg) 2271 return false; 2272 2273 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2274 if (!loadInputValue(KernargPtrReg, B, Arg)) 2275 return false; 2276 2277 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2278 MI.eraseFromParent(); 2279 return true; 2280 } 2281 2282 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2283 MachineRegisterInfo &MRI, 2284 MachineIRBuilder &B, 2285 unsigned AddrSpace) const { 2286 B.setInstr(MI); 2287 
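  // Conceptual form of the check built below (sketch only): a flat pointer
  // addresses the queried segment iff its high 32 bits equal that segment's
  // aperture base, i.e. roughly
  //
  //   bool InSegment = uint32_t(FlatPtr >> 32) == ApertureBase;
  //
  // getSegmentAperture() materializes ApertureBase for the requested address
  // space, and the extract/compare below express the same test in MIR.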
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 2288 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2289 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2290 MI.eraseFromParent(); 2291 return true; 2292 } 2293 2294 /// Handle register layout difference for f16 images for some subtargets. 2295 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2296 MachineRegisterInfo &MRI, 2297 Register Reg) const { 2298 if (!ST.hasUnpackedD16VMem()) 2299 return Reg; 2300 2301 const LLT S16 = LLT::scalar(16); 2302 const LLT S32 = LLT::scalar(32); 2303 LLT StoreVT = MRI.getType(Reg); 2304 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2305 2306 auto Unmerge = B.buildUnmerge(S16, Reg); 2307 2308 SmallVector<Register, 4> WideRegs; 2309 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2310 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2311 2312 int NumElts = StoreVT.getNumElements(); 2313 2314 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2315 } 2316 2317 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI, 2318 MachineRegisterInfo &MRI, 2319 MachineIRBuilder &B, 2320 bool IsFormat) const { 2321 // TODO: Reject f16 format on targets where unsupported. 2322 Register VData = MI.getOperand(1).getReg(); 2323 LLT Ty = MRI.getType(VData); 2324 2325 B.setInstr(MI); 2326 2327 const LLT S32 = LLT::scalar(32); 2328 const LLT S16 = LLT::scalar(16); 2329 2330 // Fixup illegal register types for i8 stores. 2331 if (Ty == LLT::scalar(8) || Ty == S16) { 2332 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2333 MI.getOperand(1).setReg(AnyExt); 2334 return true; 2335 } 2336 2337 if (Ty.isVector()) { 2338 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2339 if (IsFormat) 2340 MI.getOperand(1).setReg(handleD16VData(B, MRI, VData)); 2341 return true; 2342 } 2343 2344 return Ty.getElementType() == S32 && Ty.getNumElements() <= 4; 2345 } 2346 2347 return Ty == S32; 2348 } 2349 2350 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 2351 MachineRegisterInfo &MRI, 2352 MachineIRBuilder &B) const { 2353 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
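  // Rough shape of the rewrite for amdgcn.if/else/loop (illustrative): the i1
  // result of the intrinsic must feed a single G_BRCOND in the same block
  // (checked by verifyCFIntrinsic), and the intrinsic + branch pair is folded
  // into one SI_IF / SI_ELSE / SI_LOOP pseudo that takes the exec-mask
  // operands and the branch target directly, with the mask registers
  // constrained to the wave mask register class.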
2354 auto IntrID = MI.getIntrinsicID(); 2355 switch (IntrID) { 2356 case Intrinsic::amdgcn_if: 2357 case Intrinsic::amdgcn_else: { 2358 MachineInstr *Br = nullptr; 2359 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 2360 const SIRegisterInfo *TRI 2361 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2362 2363 B.setInstr(*BrCond); 2364 Register Def = MI.getOperand(1).getReg(); 2365 Register Use = MI.getOperand(3).getReg(); 2366 2367 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 2368 if (Br) 2369 BrTarget = Br->getOperand(0).getMBB(); 2370 2371 if (IntrID == Intrinsic::amdgcn_if) { 2372 B.buildInstr(AMDGPU::SI_IF) 2373 .addDef(Def) 2374 .addUse(Use) 2375 .addMBB(BrTarget); 2376 } else { 2377 B.buildInstr(AMDGPU::SI_ELSE) 2378 .addDef(Def) 2379 .addUse(Use) 2380 .addMBB(BrTarget) 2381 .addImm(0); 2382 } 2383 2384 if (Br) 2385 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 2386 2387 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 2388 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 2389 MI.eraseFromParent(); 2390 BrCond->eraseFromParent(); 2391 return true; 2392 } 2393 2394 return false; 2395 } 2396 case Intrinsic::amdgcn_loop: { 2397 MachineInstr *Br = nullptr; 2398 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 2399 const SIRegisterInfo *TRI 2400 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2401 2402 B.setInstr(*BrCond); 2403 2404 // FIXME: Need to adjust branch targets based on unconditional branch. 2405 Register Reg = MI.getOperand(2).getReg(); 2406 B.buildInstr(AMDGPU::SI_LOOP) 2407 .addUse(Reg) 2408 .addMBB(BrCond->getOperand(1).getMBB()); 2409 MI.eraseFromParent(); 2410 BrCond->eraseFromParent(); 2411 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 2412 return true; 2413 } 2414 2415 return false; 2416 } 2417 case Intrinsic::amdgcn_kernarg_segment_ptr: 2418 return legalizePreloadedArgIntrin( 2419 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2420 case Intrinsic::amdgcn_implicitarg_ptr: 2421 return legalizeImplicitArgPtr(MI, MRI, B); 2422 case Intrinsic::amdgcn_workitem_id_x: 2423 return legalizePreloadedArgIntrin(MI, MRI, B, 2424 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 2425 case Intrinsic::amdgcn_workitem_id_y: 2426 return legalizePreloadedArgIntrin(MI, MRI, B, 2427 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 2428 case Intrinsic::amdgcn_workitem_id_z: 2429 return legalizePreloadedArgIntrin(MI, MRI, B, 2430 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 2431 case Intrinsic::amdgcn_workgroup_id_x: 2432 return legalizePreloadedArgIntrin(MI, MRI, B, 2433 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 2434 case Intrinsic::amdgcn_workgroup_id_y: 2435 return legalizePreloadedArgIntrin(MI, MRI, B, 2436 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 2437 case Intrinsic::amdgcn_workgroup_id_z: 2438 return legalizePreloadedArgIntrin(MI, MRI, B, 2439 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 2440 case Intrinsic::amdgcn_dispatch_ptr: 2441 return legalizePreloadedArgIntrin(MI, MRI, B, 2442 AMDGPUFunctionArgInfo::DISPATCH_PTR); 2443 case Intrinsic::amdgcn_queue_ptr: 2444 return legalizePreloadedArgIntrin(MI, MRI, B, 2445 AMDGPUFunctionArgInfo::QUEUE_PTR); 2446 case Intrinsic::amdgcn_implicit_buffer_ptr: 2447 return legalizePreloadedArgIntrin( 2448 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 2449 case Intrinsic::amdgcn_dispatch_id: 2450 return legalizePreloadedArgIntrin(MI, MRI, B, 2451 AMDGPUFunctionArgInfo::DISPATCH_ID); 2452 case Intrinsic::amdgcn_fdiv_fast: 2453 return legalizeFDIVFastIntrin(MI, MRI, B); 
2454 case Intrinsic::amdgcn_is_shared: 2455 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 2456 case Intrinsic::amdgcn_is_private: 2457 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 2458 case Intrinsic::amdgcn_wavefrontsize: { 2459 B.setInstr(MI); 2460 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 2461 MI.eraseFromParent(); 2462 return true; 2463 } 2464 case Intrinsic::amdgcn_raw_buffer_store: 2465 return legalizeRawBufferStore(MI, MRI, B, false); 2466 case Intrinsic::amdgcn_raw_buffer_store_format: 2467 return legalizeRawBufferStore(MI, MRI, B, true); 2468 default: 2469 return true; 2470 } 2471 2472 return true; 2473 } 2474