1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #if defined(_MSC_VER) || defined(__MINGW32__) 15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI 16 // from the Visual C++ cmath / math.h headers: 17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 18 #define _USE_MATH_DEFINES 19 #endif 20 21 #include "AMDGPU.h" 22 #include "AMDGPULegalizerInfo.h" 23 #include "AMDGPUTargetMachine.h" 24 #include "SIMachineFunctionInfo.h" 25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/CodeGen/TargetOpcodes.h" 28 #include "llvm/CodeGen/ValueTypes.h" 29 #include "llvm/IR/DerivedTypes.h" 30 #include "llvm/IR/DiagnosticInfo.h" 31 #include "llvm/IR/Type.h" 32 #include "llvm/Support/Debug.h" 33 34 #define DEBUG_TYPE "amdgpu-legalinfo" 35 36 using namespace llvm; 37 using namespace LegalizeActions; 38 using namespace LegalizeMutations; 39 using namespace LegalityPredicates; 40 41 42 static LegalityPredicate isMultiple32(unsigned TypeIdx, 43 unsigned MaxSize = 1024) { 44 return [=](const LegalityQuery &Query) { 45 const LLT Ty = Query.Types[TypeIdx]; 46 const LLT EltTy = Ty.getScalarType(); 47 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; 48 }; 49 } 50 51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { 52 return [=](const LegalityQuery &Query) { 53 return Query.Types[TypeIdx].getSizeInBits() == Size; 54 }; 55 } 56 57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 58 return [=](const LegalityQuery &Query) { 59 const LLT Ty = Query.Types[TypeIdx]; 60 return Ty.isVector() && 61 Ty.getNumElements() % 2 != 0 && 62 Ty.getElementType().getSizeInBits() < 32 && 63 Ty.getSizeInBits() % 32 != 0; 64 }; 65 } 66 67 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 68 return [=](const LegalityQuery &Query) { 69 const LLT Ty = Query.Types[TypeIdx]; 70 const LLT EltTy = Ty.getScalarType(); 71 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 72 }; 73 } 74 75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 76 return [=](const LegalityQuery &Query) { 77 const LLT Ty = Query.Types[TypeIdx]; 78 const LLT EltTy = Ty.getElementType(); 79 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 80 }; 81 } 82 83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 84 return [=](const LegalityQuery &Query) { 85 const LLT Ty = Query.Types[TypeIdx]; 86 const LLT EltTy = Ty.getElementType(); 87 unsigned Size = Ty.getSizeInBits(); 88 unsigned Pieces = (Size + 63) / 64; 89 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 90 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 91 }; 92 } 93 94 // Increase the number of vector elements to reach the next multiple of 32-bit 95 // type. 
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 97 return [=](const LegalityQuery &Query) { 98 const LLT Ty = Query.Types[TypeIdx]; 99 100 const LLT EltTy = Ty.getElementType(); 101 const int Size = Ty.getSizeInBits(); 102 const int EltSize = EltTy.getSizeInBits(); 103 const int NextMul32 = (Size + 31) / 32; 104 105 assert(EltSize < 32); 106 107 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 108 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 109 }; 110 } 111 112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 113 return [=](const LegalityQuery &Query) { 114 const LLT QueryTy = Query.Types[TypeIdx]; 115 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 116 }; 117 } 118 119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 120 return [=](const LegalityQuery &Query) { 121 const LLT QueryTy = Query.Types[TypeIdx]; 122 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 123 }; 124 } 125 126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 127 return [=](const LegalityQuery &Query) { 128 const LLT QueryTy = Query.Types[TypeIdx]; 129 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 130 }; 131 } 132 133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 134 // v2s16. 135 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 136 return [=](const LegalityQuery &Query) { 137 const LLT Ty = Query.Types[TypeIdx]; 138 if (Ty.isVector()) { 139 const int EltSize = Ty.getElementType().getSizeInBits(); 140 return EltSize == 32 || EltSize == 64 || 141 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 142 EltSize == 128 || EltSize == 256; 143 } 144 145 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 146 }; 147 } 148 149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 150 return [=](const LegalityQuery &Query) { 151 return Query.Types[TypeIdx].getElementType() == Type; 152 }; 153 } 154 155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 156 return [=](const LegalityQuery &Query) { 157 const LLT Ty = Query.Types[TypeIdx]; 158 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 159 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 160 }; 161 } 162 163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 164 const GCNTargetMachine &TM) 165 : ST(ST_) { 166 using namespace TargetOpcode; 167 168 auto GetAddrSpacePtr = [&TM](unsigned AS) { 169 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 170 }; 171 172 const LLT S1 = LLT::scalar(1); 173 const LLT S8 = LLT::scalar(8); 174 const LLT S16 = LLT::scalar(16); 175 const LLT S32 = LLT::scalar(32); 176 const LLT S64 = LLT::scalar(64); 177 const LLT S96 = LLT::scalar(96); 178 const LLT S128 = LLT::scalar(128); 179 const LLT S256 = LLT::scalar(256); 180 const LLT S1024 = LLT::scalar(1024); 181 182 const LLT V2S16 = LLT::vector(2, 16); 183 const LLT V4S16 = LLT::vector(4, 16); 184 185 const LLT V2S32 = LLT::vector(2, 32); 186 const LLT V3S32 = LLT::vector(3, 32); 187 const LLT V4S32 = LLT::vector(4, 32); 188 const LLT V5S32 = LLT::vector(5, 32); 189 const LLT V6S32 = LLT::vector(6, 32); 190 const LLT V7S32 = LLT::vector(7, 32); 191 const LLT V8S32 = LLT::vector(8, 32); 192 const LLT V9S32 = LLT::vector(9, 32); 193 const LLT V10S32 = LLT::vector(10, 32); 194 const LLT V11S32 = LLT::vector(11, 32); 195 const LLT V12S32 = LLT::vector(12, 32); 196 const LLT V13S32 = LLT::vector(13, 32); 197 const LLT V14S32 = 
LLT::vector(14, 32); 198 const LLT V15S32 = LLT::vector(15, 32); 199 const LLT V16S32 = LLT::vector(16, 32); 200 const LLT V32S32 = LLT::vector(32, 32); 201 202 const LLT V2S64 = LLT::vector(2, 64); 203 const LLT V3S64 = LLT::vector(3, 64); 204 const LLT V4S64 = LLT::vector(4, 64); 205 const LLT V5S64 = LLT::vector(5, 64); 206 const LLT V6S64 = LLT::vector(6, 64); 207 const LLT V7S64 = LLT::vector(7, 64); 208 const LLT V8S64 = LLT::vector(8, 64); 209 const LLT V16S64 = LLT::vector(16, 64); 210 211 std::initializer_list<LLT> AllS32Vectors = 212 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 213 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 214 std::initializer_list<LLT> AllS64Vectors = 215 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 216 217 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 218 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 219 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 220 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 221 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 222 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 223 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 224 225 const LLT CodePtr = FlatPtr; 226 227 const std::initializer_list<LLT> AddrSpaces64 = { 228 GlobalPtr, ConstantPtr, FlatPtr 229 }; 230 231 const std::initializer_list<LLT> AddrSpaces32 = { 232 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 233 }; 234 235 const std::initializer_list<LLT> FPTypesBase = { 236 S32, S64 237 }; 238 239 const std::initializer_list<LLT> FPTypes16 = { 240 S32, S64, S16 241 }; 242 243 const std::initializer_list<LLT> FPTypesPK16 = { 244 S32, S64, S16, V2S16 245 }; 246 247 setAction({G_BRCOND, S1}, Legal); // VCC branches 248 setAction({G_BRCOND, S32}, Legal); // SCC branches 249 250 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 251 // elements for v3s16 252 getActionDefinitionsBuilder(G_PHI) 253 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 254 .legalFor(AllS32Vectors) 255 .legalFor(AllS64Vectors) 256 .legalFor(AddrSpaces64) 257 .legalFor(AddrSpaces32) 258 .clampScalar(0, S32, S256) 259 .widenScalarToNextPow2(0, 32) 260 .clampMaxNumElements(0, S32, 16) 261 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 262 .legalIf(isPointer(0)); 263 264 if (ST.has16BitInsts()) { 265 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 266 .legalFor({S32, S16}) 267 .clampScalar(0, S16, S32) 268 .scalarize(0); 269 } else { 270 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 271 .legalFor({S32}) 272 .clampScalar(0, S32, S32) 273 .scalarize(0); 274 } 275 276 // FIXME: Not really legal. Placeholder for custom lowering. 277 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 278 .legalFor({S32, S64}) 279 .clampScalar(0, S32, S64) 280 .widenScalarToNextPow2(0, 32) 281 .scalarize(0); 282 283 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 284 .legalFor({S32}) 285 .clampScalar(0, S32, S32) 286 .scalarize(0); 287 288 // Report legal for any types we can handle anywhere. For the cases only legal 289 // on the SALU, RegBankSelect will be able to re-legalize. 
290 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 291 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 292 .clampScalar(0, S32, S64) 293 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 294 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 295 .widenScalarToNextPow2(0) 296 .scalarize(0); 297 298 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 299 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 300 .legalFor({{S32, S1}, {S32, S32}}) 301 .clampScalar(0, S32, S32) 302 .scalarize(0); // TODO: Implement. 303 304 getActionDefinitionsBuilder({G_SADDO, G_SSUBO}) 305 .lower(); 306 307 getActionDefinitionsBuilder(G_BITCAST) 308 // Don't worry about the size constraint. 309 .legalIf(all(isRegisterType(0), isRegisterType(1))) 310 // FIXME: Testing hack 311 .legalForCartesianProduct({S16, LLT::vector(2, 8), }); 312 313 getActionDefinitionsBuilder(G_FCONSTANT) 314 .legalFor({S32, S64, S16}) 315 .clampScalar(0, S16, S64); 316 317 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 318 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 319 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 320 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 321 .clampScalarOrElt(0, S32, S1024) 322 .legalIf(isMultiple32(0)) 323 .widenScalarToNextPow2(0, 32) 324 .clampMaxNumElements(0, S32, 16); 325 326 327 // FIXME: i1 operands to intrinsics should always be legal, but other i1 328 // values may not be legal. We need to figure out how to distinguish 329 // between these two scenarios. 330 getActionDefinitionsBuilder(G_CONSTANT) 331 .legalFor({S1, S32, S64, S16, GlobalPtr, 332 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 333 .clampScalar(0, S32, S64) 334 .widenScalarToNextPow2(0) 335 .legalIf(isPointer(0)); 336 337 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 338 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 339 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); 340 341 342 auto &FPOpActions = getActionDefinitionsBuilder( 343 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 344 .legalFor({S32, S64}); 345 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 346 .customFor({S32, S64}); 347 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 348 .customFor({S32, S64}); 349 350 if (ST.has16BitInsts()) { 351 if (ST.hasVOP3PInsts()) 352 FPOpActions.legalFor({S16, V2S16}); 353 else 354 FPOpActions.legalFor({S16}); 355 356 TrigActions.customFor({S16}); 357 FDIVActions.customFor({S16}); 358 } 359 360 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 361 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 362 363 if (ST.hasVOP3PInsts()) { 364 MinNumMaxNum.customFor(FPTypesPK16) 365 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 366 .clampMaxNumElements(0, S16, 2) 367 .clampScalar(0, S16, S64) 368 .scalarize(0); 369 } else if (ST.has16BitInsts()) { 370 MinNumMaxNum.customFor(FPTypes16) 371 .clampScalar(0, S16, S64) 372 .scalarize(0); 373 } else { 374 MinNumMaxNum.customFor(FPTypesBase) 375 .clampScalar(0, S32, S64) 376 .scalarize(0); 377 } 378 379 if (ST.hasVOP3PInsts()) 380 FPOpActions.clampMaxNumElements(0, S16, 2); 381 382 FPOpActions 383 .scalarize(0) 384 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 385 386 TrigActions 387 .scalarize(0) 388 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 389 390 FDIVActions 391 .scalarize(0) 392 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 393 394 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 395 .legalFor(FPTypesPK16) 396 .clampMaxNumElements(0, S16, 2) 397 .scalarize(0) 398 .clampScalar(0, S16, S64); 399 400 // TODO: Implement 401 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); 402 403 if (ST.has16BitInsts()) { 404 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 405 .legalFor({S32, S64, S16}) 406 .scalarize(0) 407 .clampScalar(0, S16, S64); 408 } else { 409 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 410 .legalFor({S32, S64}) 411 .scalarize(0) 412 .clampScalar(0, S32, S64); 413 } 414 415 getActionDefinitionsBuilder(G_FPTRUNC) 416 .legalFor({{S32, S64}, {S16, S32}}) 417 .scalarize(0); 418 419 getActionDefinitionsBuilder(G_FPEXT) 420 .legalFor({{S64, S32}, {S32, S16}}) 421 .lowerFor({{S64, S16}}) // FIXME: Implement 422 .scalarize(0); 423 424 // TODO: Verify V_BFI_B32 is generated from expanded bit ops. 425 getActionDefinitionsBuilder(G_FCOPYSIGN).lower(); 426 427 getActionDefinitionsBuilder(G_FSUB) 428 // Use actual fsub instruction 429 .legalFor({S32}) 430 // Must use fadd + fneg 431 .lowerFor({S64, S16, V2S16}) 432 .scalarize(0) 433 .clampScalar(0, S32, S64); 434 435 // Whether this is legal depends on the floating point mode for the function. 436 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 437 if (ST.hasMadF16()) 438 FMad.customFor({S32, S16}); 439 else 440 FMad.customFor({S32}); 441 FMad.scalarize(0) 442 .lower(); 443 444 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 445 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 446 {S32, S1}, {S64, S1}, {S16, S1}, 447 {S96, S32}, 448 // FIXME: Hack 449 {S64, LLT::scalar(33)}, 450 {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}}) 451 .scalarize(0); 452 453 // TODO: Split s1->s64 during regbankselect for VALU. 
454 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 455 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 456 .lowerFor({{S32, S64}}) 457 .lowerIf(typeIs(1, S1)) 458 .customFor({{S64, S64}}); 459 if (ST.has16BitInsts()) 460 IToFP.legalFor({{S16, S16}}); 461 IToFP.clampScalar(1, S32, S64) 462 .scalarize(0); 463 464 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 465 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}); 466 if (ST.has16BitInsts()) 467 FPToI.legalFor({{S16, S16}}); 468 else 469 FPToI.minScalar(1, S32); 470 471 FPToI.minScalar(0, S32) 472 .scalarize(0); 473 474 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 475 .scalarize(0) 476 .lower(); 477 478 if (ST.has16BitInsts()) { 479 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 480 .legalFor({S16, S32, S64}) 481 .clampScalar(0, S16, S64) 482 .scalarize(0); 483 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 484 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 485 .legalFor({S32, S64}) 486 .clampScalar(0, S32, S64) 487 .scalarize(0); 488 } else { 489 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 490 .legalFor({S32}) 491 .customFor({S64}) 492 .clampScalar(0, S32, S64) 493 .scalarize(0); 494 } 495 496 getActionDefinitionsBuilder(G_PTR_ADD) 497 .legalForCartesianProduct(AddrSpaces64, {S64}) 498 .legalForCartesianProduct(AddrSpaces32, {S32}) 499 .scalarize(0); 500 501 getActionDefinitionsBuilder(G_PTR_MASK) 502 .scalarize(0) 503 .alwaysLegal(); 504 505 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 506 507 auto &CmpBuilder = 508 getActionDefinitionsBuilder(G_ICMP) 509 // The compare output type differs based on the register bank of the output, 510 // so make both s1 and s32 legal. 511 // 512 // Scalar compares producing output in scc will be promoted to s32, as that 513 // is the allocatable register type that will be needed for the copy from 514 // scc. This will be promoted during RegBankSelect, and we assume something 515 // before that won't try to use s32 result types. 516 // 517 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 518 // bank. 519 .legalForCartesianProduct( 520 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 521 .legalForCartesianProduct( 522 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 523 if (ST.has16BitInsts()) { 524 CmpBuilder.legalFor({{S1, S16}}); 525 } 526 527 CmpBuilder 528 .widenScalarToNextPow2(1) 529 .clampScalar(1, S32, S64) 530 .scalarize(0) 531 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 532 533 getActionDefinitionsBuilder(G_FCMP) 534 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 535 .widenScalarToNextPow2(1) 536 .clampScalar(1, S32, S64) 537 .scalarize(0); 538 539 // FIXME: fexp, flog2, flog10 needs to be custom lowered. 540 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2, 541 G_FLOG, G_FLOG2, G_FLOG10}) 542 .legalFor({S32}) 543 .scalarize(0); 544 545 // The 64-bit versions produce 32-bit results, but only on the SALU. 
546 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, 547 G_CTTZ, G_CTTZ_ZERO_UNDEF, 548 G_CTPOP}) 549 .legalFor({{S32, S32}, {S32, S64}}) 550 .clampScalar(0, S32, S32) 551 .clampScalar(1, S32, S64) 552 .scalarize(0) 553 .widenScalarToNextPow2(0, 32) 554 .widenScalarToNextPow2(1, 32); 555 556 // TODO: Expand for > s32 557 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) 558 .legalFor({S32}) 559 .clampScalar(0, S32, S32) 560 .scalarize(0); 561 562 if (ST.has16BitInsts()) { 563 if (ST.hasVOP3PInsts()) { 564 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 565 .legalFor({S32, S16, V2S16}) 566 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 567 .clampMaxNumElements(0, S16, 2) 568 .clampScalar(0, S16, S32) 569 .widenScalarToNextPow2(0) 570 .scalarize(0); 571 } else { 572 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 573 .legalFor({S32, S16}) 574 .widenScalarToNextPow2(0) 575 .clampScalar(0, S16, S32) 576 .scalarize(0); 577 } 578 } else { 579 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 580 .legalFor({S32}) 581 .clampScalar(0, S32, S32) 582 .widenScalarToNextPow2(0) 583 .scalarize(0); 584 } 585 586 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 587 return [=](const LegalityQuery &Query) { 588 return Query.Types[TypeIdx0].getSizeInBits() < 589 Query.Types[TypeIdx1].getSizeInBits(); 590 }; 591 }; 592 593 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 594 return [=](const LegalityQuery &Query) { 595 return Query.Types[TypeIdx0].getSizeInBits() > 596 Query.Types[TypeIdx1].getSizeInBits(); 597 }; 598 }; 599 600 getActionDefinitionsBuilder(G_INTTOPTR) 601 // List the common cases 602 .legalForCartesianProduct(AddrSpaces64, {S64}) 603 .legalForCartesianProduct(AddrSpaces32, {S32}) 604 .scalarize(0) 605 // Accept any address space as long as the size matches 606 .legalIf(sameSize(0, 1)) 607 .widenScalarIf(smallerThan(1, 0), 608 [](const LegalityQuery &Query) { 609 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 610 }) 611 .narrowScalarIf(greaterThan(1, 0), 612 [](const LegalityQuery &Query) { 613 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 614 }); 615 616 getActionDefinitionsBuilder(G_PTRTOINT) 617 // List the common cases 618 .legalForCartesianProduct(AddrSpaces64, {S64}) 619 .legalForCartesianProduct(AddrSpaces32, {S32}) 620 .scalarize(0) 621 // Accept any address space as long as the size matches 622 .legalIf(sameSize(0, 1)) 623 .widenScalarIf(smallerThan(0, 1), 624 [](const LegalityQuery &Query) { 625 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 626 }) 627 .narrowScalarIf( 628 greaterThan(0, 1), 629 [](const LegalityQuery &Query) { 630 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 631 }); 632 633 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 634 .scalarize(0) 635 .custom(); 636 637 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 638 // handle some operations by just promoting the register during 639 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 640 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { 641 switch (AS) { 642 // FIXME: Private element size. 643 case AMDGPUAS::PRIVATE_ADDRESS: 644 return 32; 645 // FIXME: Check subtarget 646 case AMDGPUAS::LOCAL_ADDRESS: 647 return ST.useDS128() ? 128 : 64; 648 649 // Treat constant and global as identical. 
SMRD loads are sometimes usable 650 // for global loads (ideally constant address space should be eliminated) 651 // depending on the context. Legality cannot be context dependent, but 652 // RegBankSelect can split the load as necessary depending on the pointer 653 // register bank/uniformity and if the memory is invariant or not written in 654 // a kernel. 655 case AMDGPUAS::CONSTANT_ADDRESS: 656 case AMDGPUAS::GLOBAL_ADDRESS: 657 return 512; 658 default: 659 return 128; 660 } 661 }; 662 663 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { 664 const LLT DstTy = Query.Types[0]; 665 666 // Split vector extloads. 667 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 668 unsigned Align = Query.MMODescrs[0].AlignInBits; 669 670 if (MemSize < DstTy.getSizeInBits()) 671 MemSize = std::max(MemSize, Align); 672 673 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 674 return true; 675 676 const LLT PtrTy = Query.Types[1]; 677 unsigned AS = PtrTy.getAddressSpace(); 678 if (MemSize > maxSizeForAddrSpace(AS)) 679 return true; 680 681 // Catch weird sized loads that don't evenly divide into the access sizes 682 // TODO: May be able to widen depending on alignment etc. 683 unsigned NumRegs = MemSize / 32; 684 if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) 685 return true; 686 687 if (Align < MemSize) { 688 const SITargetLowering *TLI = ST.getTargetLowering(); 689 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 690 } 691 692 return false; 693 }; 694 695 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 696 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 697 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 698 699 // TODO: Refine based on subtargets which support unaligned access or 128-bit 700 // LDS 701 // TODO: Unsupported flat for SI. 702 703 for (unsigned Op : {G_LOAD, G_STORE}) { 704 const bool IsStore = Op == G_STORE; 705 706 auto &Actions = getActionDefinitionsBuilder(Op); 707 // Whitelist the common cases. 
708 // TODO: Pointer loads 709 // TODO: Wide constant loads 710 // TODO: Only CI+ has 3x loads 711 // TODO: Loads to s16 on gfx9 712 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 713 {V2S32, GlobalPtr, 64, GlobalAlign32}, 714 {V3S32, GlobalPtr, 96, GlobalAlign32}, 715 {S96, GlobalPtr, 96, GlobalAlign32}, 716 {V4S32, GlobalPtr, 128, GlobalAlign32}, 717 {S128, GlobalPtr, 128, GlobalAlign32}, 718 {S64, GlobalPtr, 64, GlobalAlign32}, 719 {V2S64, GlobalPtr, 128, GlobalAlign32}, 720 {V2S16, GlobalPtr, 32, GlobalAlign32}, 721 {S32, GlobalPtr, 8, GlobalAlign8}, 722 {S32, GlobalPtr, 16, GlobalAlign16}, 723 724 {S32, LocalPtr, 32, 32}, 725 {S64, LocalPtr, 64, 32}, 726 {V2S32, LocalPtr, 64, 32}, 727 {S32, LocalPtr, 8, 8}, 728 {S32, LocalPtr, 16, 16}, 729 {V2S16, LocalPtr, 32, 32}, 730 731 {S32, PrivatePtr, 32, 32}, 732 {S32, PrivatePtr, 8, 8}, 733 {S32, PrivatePtr, 16, 16}, 734 {V2S16, PrivatePtr, 32, 32}, 735 736 {S32, FlatPtr, 32, GlobalAlign32}, 737 {S32, FlatPtr, 16, GlobalAlign16}, 738 {S32, FlatPtr, 8, GlobalAlign8}, 739 {V2S16, FlatPtr, 32, GlobalAlign32}, 740 741 {S32, ConstantPtr, 32, GlobalAlign32}, 742 {V2S32, ConstantPtr, 64, GlobalAlign32}, 743 {V3S32, ConstantPtr, 96, GlobalAlign32}, 744 {V4S32, ConstantPtr, 128, GlobalAlign32}, 745 {S64, ConstantPtr, 64, GlobalAlign32}, 746 {S128, ConstantPtr, 128, GlobalAlign32}, 747 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 748 Actions 749 .customIf(typeIs(1, Constant32Ptr)) 750 .narrowScalarIf( 751 [=](const LegalityQuery &Query) -> bool { 752 return !Query.Types[0].isVector() && needToSplitLoad(Query); 753 }, 754 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 755 const LLT DstTy = Query.Types[0]; 756 const LLT PtrTy = Query.Types[1]; 757 758 const unsigned DstSize = DstTy.getSizeInBits(); 759 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 760 761 // Split extloads. 762 if (DstSize > MemSize) 763 return std::make_pair(0, LLT::scalar(MemSize)); 764 765 if (DstSize > 32 && (DstSize % 32 != 0)) { 766 // FIXME: Need a way to specify non-extload of larger size if 767 // suitably aligned. 768 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 769 } 770 771 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 772 if (MemSize > MaxSize) 773 return std::make_pair(0, LLT::scalar(MaxSize)); 774 775 unsigned Align = Query.MMODescrs[0].AlignInBits; 776 return std::make_pair(0, LLT::scalar(Align)); 777 }) 778 .fewerElementsIf( 779 [=](const LegalityQuery &Query) -> bool { 780 return Query.Types[0].isVector() && needToSplitLoad(Query); 781 }, 782 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 783 const LLT DstTy = Query.Types[0]; 784 const LLT PtrTy = Query.Types[1]; 785 786 LLT EltTy = DstTy.getElementType(); 787 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 788 789 // Split if it's too large for the address space. 790 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 791 unsigned NumElts = DstTy.getNumElements(); 792 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 793 794 // FIXME: Refine when odd breakdowns handled 795 // The scalars will need to be re-legalized. 796 if (NumPieces == 1 || NumPieces >= NumElts || 797 NumElts % NumPieces != 0) 798 return std::make_pair(0, EltTy); 799 800 return std::make_pair(0, 801 LLT::vector(NumElts / NumPieces, EltTy)); 802 } 803 804 // Need to split because of alignment. 
805 unsigned Align = Query.MMODescrs[0].AlignInBits; 806 unsigned EltSize = EltTy.getSizeInBits(); 807 if (EltSize > Align && 808 (EltSize / Align < DstTy.getNumElements())) { 809 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 810 } 811 812 // May need relegalization for the scalars. 813 return std::make_pair(0, EltTy); 814 }) 815 .minScalar(0, S32); 816 817 if (IsStore) 818 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 819 820 // TODO: Need a bitcast lower option? 821 Actions 822 .legalIf([=](const LegalityQuery &Query) { 823 const LLT Ty0 = Query.Types[0]; 824 unsigned Size = Ty0.getSizeInBits(); 825 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 826 unsigned Align = Query.MMODescrs[0].AlignInBits; 827 828 // FIXME: Widening store from alignment not valid. 829 if (MemSize < Size) 830 MemSize = std::max(MemSize, Align); 831 832 // No extending vector loads. 833 if (Size > MemSize && Ty0.isVector()) 834 return false; 835 836 switch (MemSize) { 837 case 8: 838 case 16: 839 return Size == 32; 840 case 32: 841 case 64: 842 case 128: 843 return true; 844 case 96: 845 return ST.hasDwordx3LoadStores(); 846 case 256: 847 case 512: 848 return true; 849 default: 850 return false; 851 } 852 }) 853 .widenScalarToNextPow2(0) 854 // TODO: v3s32->v4s32 with alignment 855 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 856 } 857 858 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 859 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 860 {S32, GlobalPtr, 16, 2 * 8}, 861 {S32, LocalPtr, 8, 8}, 862 {S32, LocalPtr, 16, 16}, 863 {S32, PrivatePtr, 8, 8}, 864 {S32, PrivatePtr, 16, 16}, 865 {S32, ConstantPtr, 8, 8}, 866 {S32, ConstantPtr, 16, 2 * 8}}); 867 if (ST.hasFlatAddressSpace()) { 868 ExtLoads.legalForTypesWithMemDesc( 869 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 870 } 871 872 ExtLoads.clampScalar(0, S32, S32) 873 .widenScalarToNextPow2(0) 874 .unsupportedIfMemSizeNotPow2() 875 .lower(); 876 877 auto &Atomics = getActionDefinitionsBuilder( 878 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 879 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 880 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 881 G_ATOMICRMW_UMIN}) 882 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 883 {S64, GlobalPtr}, {S64, LocalPtr}}); 884 if (ST.hasFlatAddressSpace()) { 885 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 886 } 887 888 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 889 .legalFor({{S32, LocalPtr}}); 890 891 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 892 // demarshalling 893 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 894 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 895 {S32, FlatPtr}, {S64, FlatPtr}}) 896 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 897 {S32, RegionPtr}, {S64, RegionPtr}}); 898 899 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) 900 .lower(); 901 902 // TODO: Pointer types, any 32-bit or 64-bit vector 903 904 // Condition should be s32 for scalar, s1 for vector. 
905 getActionDefinitionsBuilder(G_SELECT) 906 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 907 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 908 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 909 .clampScalar(0, S16, S64) 910 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 911 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 912 .scalarize(1) 913 .clampMaxNumElements(0, S32, 2) 914 .clampMaxNumElements(0, LocalPtr, 2) 915 .clampMaxNumElements(0, PrivatePtr, 2) 916 .scalarize(0) 917 .widenScalarToNextPow2(0) 918 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 919 920 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 921 // be more flexible with the shift amount type. 922 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 923 .legalFor({{S32, S32}, {S64, S32}}); 924 if (ST.has16BitInsts()) { 925 if (ST.hasVOP3PInsts()) { 926 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 927 .clampMaxNumElements(0, S16, 2); 928 } else 929 Shifts.legalFor({{S16, S32}, {S16, S16}}); 930 931 // TODO: Support 16-bit shift amounts 932 Shifts.clampScalar(1, S32, S32); 933 Shifts.clampScalar(0, S16, S64); 934 Shifts.widenScalarToNextPow2(0, 16); 935 } else { 936 // Make sure we legalize the shift amount type first, as the general 937 // expansion for the shifted type will produce much worse code if it hasn't 938 // been truncated already. 939 Shifts.clampScalar(1, S32, S32); 940 Shifts.clampScalar(0, S32, S64); 941 Shifts.widenScalarToNextPow2(0, 32); 942 } 943 Shifts.scalarize(0); 944 945 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 946 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 947 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 948 unsigned IdxTypeIdx = 2; 949 950 getActionDefinitionsBuilder(Op) 951 .customIf([=](const LegalityQuery &Query) { 952 const LLT EltTy = Query.Types[EltTypeIdx]; 953 const LLT VecTy = Query.Types[VecTypeIdx]; 954 const LLT IdxTy = Query.Types[IdxTypeIdx]; 955 return (EltTy.getSizeInBits() == 16 || 956 EltTy.getSizeInBits() % 32 == 0) && 957 VecTy.getSizeInBits() % 32 == 0 && 958 VecTy.getSizeInBits() <= 1024 && 959 IdxTy.getSizeInBits() == 32; 960 }) 961 .clampScalar(EltTypeIdx, S32, S64) 962 .clampScalar(VecTypeIdx, S32, S64) 963 .clampScalar(IdxTypeIdx, S32, S32); 964 } 965 966 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 967 .unsupportedIf([=](const LegalityQuery &Query) { 968 const LLT &EltTy = Query.Types[1].getElementType(); 969 return Query.Types[0] != EltTy; 970 }); 971 972 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 973 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 974 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 975 976 // FIXME: Doesn't handle extract of illegal sizes. 977 getActionDefinitionsBuilder(Op) 978 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 979 // FIXME: Multiples of 16 should not be legal. 
980 .legalIf([=](const LegalityQuery &Query) { 981 const LLT BigTy = Query.Types[BigTyIdx]; 982 const LLT LitTy = Query.Types[LitTyIdx]; 983 return (BigTy.getSizeInBits() % 32 == 0) && 984 (LitTy.getSizeInBits() % 16 == 0); 985 }) 986 .widenScalarIf( 987 [=](const LegalityQuery &Query) { 988 const LLT BigTy = Query.Types[BigTyIdx]; 989 return (BigTy.getScalarSizeInBits() < 16); 990 }, 991 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 992 .widenScalarIf( 993 [=](const LegalityQuery &Query) { 994 const LLT LitTy = Query.Types[LitTyIdx]; 995 return (LitTy.getScalarSizeInBits() < 16); 996 }, 997 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 998 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 999 .widenScalarToNextPow2(BigTyIdx, 32); 1000 1001 } 1002 1003 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1004 .legalForCartesianProduct(AllS32Vectors, {S32}) 1005 .legalForCartesianProduct(AllS64Vectors, {S64}) 1006 .clampNumElements(0, V16S32, V32S32) 1007 .clampNumElements(0, V2S64, V16S64) 1008 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1009 1010 if (ST.hasScalarPackInsts()) 1011 BuildVector.legalFor({V2S16, S32}); 1012 1013 BuildVector 1014 .minScalarSameAs(1, 0) 1015 .legalIf(isRegisterType(0)) 1016 .minScalarOrElt(0, S32); 1017 1018 if (ST.hasScalarPackInsts()) { 1019 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1020 .legalFor({V2S16, S32}) 1021 .lower(); 1022 } else { 1023 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1024 .lower(); 1025 } 1026 1027 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1028 .legalIf(isRegisterType(0)); 1029 1030 // TODO: Don't fully scalarize v2s16 pieces 1031 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1032 1033 // Merge/Unmerge 1034 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1035 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1036 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1037 1038 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1039 const LLT &Ty = Query.Types[TypeIdx]; 1040 if (Ty.isVector()) { 1041 const LLT &EltTy = Ty.getElementType(); 1042 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 1043 return true; 1044 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1045 return true; 1046 } 1047 return false; 1048 }; 1049 1050 auto &Builder = getActionDefinitionsBuilder(Op) 1051 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1052 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1053 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1054 // valid. 1055 .clampScalar(LitTyIdx, S16, S256) 1056 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1057 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1058 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1059 elementTypeIs(1, S16)), 1060 changeTo(1, V2S16)) 1061 // Break up vectors with weird elements into scalars 1062 .fewerElementsIf( 1063 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 1064 scalarize(0)) 1065 .fewerElementsIf( 1066 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 1067 scalarize(1)) 1068 .clampScalar(BigTyIdx, S32, S1024) 1069 .lowerFor({{S16, V2S16}}); 1070 1071 if (Op == G_MERGE_VALUES) { 1072 Builder.widenScalarIf( 1073 // TODO: Use 16-bit shifts if legal for 8-bit values? 
1074 [=](const LegalityQuery &Query) { 1075 const LLT Ty = Query.Types[LitTyIdx]; 1076 return Ty.getSizeInBits() < 32; 1077 }, 1078 changeTo(LitTyIdx, S32)); 1079 } 1080 1081 Builder.widenScalarIf( 1082 [=](const LegalityQuery &Query) { 1083 const LLT Ty = Query.Types[BigTyIdx]; 1084 return !isPowerOf2_32(Ty.getSizeInBits()) && 1085 Ty.getSizeInBits() % 16 != 0; 1086 }, 1087 [=](const LegalityQuery &Query) { 1088 // Pick the next power of 2, or a multiple of 64 over 128. 1089 // Whichever is smaller. 1090 const LLT &Ty = Query.Types[BigTyIdx]; 1091 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1092 if (NewSizeInBits >= 256) { 1093 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1094 if (RoundedTo < NewSizeInBits) 1095 NewSizeInBits = RoundedTo; 1096 } 1097 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1098 }) 1099 .legalIf([=](const LegalityQuery &Query) { 1100 const LLT &BigTy = Query.Types[BigTyIdx]; 1101 const LLT &LitTy = Query.Types[LitTyIdx]; 1102 1103 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1104 return false; 1105 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1106 return false; 1107 1108 return BigTy.getSizeInBits() % 16 == 0 && 1109 LitTy.getSizeInBits() % 16 == 0 && 1110 BigTy.getSizeInBits() <= 1024; 1111 }) 1112 // Any vectors left are the wrong size. Scalarize them. 1113 .scalarize(0) 1114 .scalarize(1); 1115 } 1116 1117 getActionDefinitionsBuilder(G_SEXT_INREG).lower(); 1118 1119 getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower(); 1120 1121 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1122 .legalFor({S64}); 1123 1124 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1125 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1126 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1127 .unsupported(); 1128 1129 computeTables(); 1130 verify(*ST.getInstrInfo()); 1131 } 1132 1133 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1134 MachineRegisterInfo &MRI, 1135 MachineIRBuilder &B, 1136 GISelChangeObserver &Observer) const { 1137 switch (MI.getOpcode()) { 1138 case TargetOpcode::G_ADDRSPACE_CAST: 1139 return legalizeAddrSpaceCast(MI, MRI, B); 1140 case TargetOpcode::G_FRINT: 1141 return legalizeFrint(MI, MRI, B); 1142 case TargetOpcode::G_FCEIL: 1143 return legalizeFceil(MI, MRI, B); 1144 case TargetOpcode::G_INTRINSIC_TRUNC: 1145 return legalizeIntrinsicTrunc(MI, MRI, B); 1146 case TargetOpcode::G_SITOFP: 1147 return legalizeITOFP(MI, MRI, B, true); 1148 case TargetOpcode::G_UITOFP: 1149 return legalizeITOFP(MI, MRI, B, false); 1150 case TargetOpcode::G_FMINNUM: 1151 case TargetOpcode::G_FMAXNUM: 1152 case TargetOpcode::G_FMINNUM_IEEE: 1153 case TargetOpcode::G_FMAXNUM_IEEE: 1154 return legalizeMinNumMaxNum(MI, MRI, B); 1155 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1156 return legalizeExtractVectorElt(MI, MRI, B); 1157 case TargetOpcode::G_INSERT_VECTOR_ELT: 1158 return legalizeInsertVectorElt(MI, MRI, B); 1159 case TargetOpcode::G_FSIN: 1160 case TargetOpcode::G_FCOS: 1161 return legalizeSinCos(MI, MRI, B); 1162 case TargetOpcode::G_GLOBAL_VALUE: 1163 return legalizeGlobalValue(MI, MRI, B); 1164 case TargetOpcode::G_LOAD: 1165 return legalizeLoad(MI, MRI, B, Observer); 1166 case TargetOpcode::G_FMAD: 1167 return legalizeFMad(MI, MRI, B); 1168 case TargetOpcode::G_FDIV: 1169 return legalizeFDIV(MI, MRI, B); 1170 case TargetOpcode::G_ATOMIC_CMPXCHG: 1171 return legalizeAtomicCmpXChg(MI, MRI, B); 1172 default: 1173 return false; 1174 } 1175 1176 llvm_unreachable("expected 
switch to return"); 1177 } 1178 1179 Register AMDGPULegalizerInfo::getSegmentAperture( 1180 unsigned AS, 1181 MachineRegisterInfo &MRI, 1182 MachineIRBuilder &B) const { 1183 MachineFunction &MF = B.getMF(); 1184 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1185 const LLT S32 = LLT::scalar(32); 1186 1187 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1188 1189 if (ST.hasApertureRegs()) { 1190 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1191 // getreg. 1192 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1193 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1194 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1195 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1196 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1197 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1198 unsigned Encoding = 1199 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1200 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1201 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1202 1203 Register ApertureReg = MRI.createGenericVirtualRegister(S32); 1204 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1205 1206 B.buildInstr(AMDGPU::S_GETREG_B32) 1207 .addDef(GetReg) 1208 .addImm(Encoding); 1209 MRI.setType(GetReg, S32); 1210 1211 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1212 B.buildInstr(TargetOpcode::G_SHL) 1213 .addDef(ApertureReg) 1214 .addUse(GetReg) 1215 .addUse(ShiftAmt.getReg(0)); 1216 1217 return ApertureReg; 1218 } 1219 1220 Register QueuePtr = MRI.createGenericVirtualRegister( 1221 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1222 1223 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1224 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1225 return Register(); 1226 1227 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1228 // private_segment_aperture_base_hi. 1229 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1230 1231 // TODO: can we be smarter about machine pointer info? 1232 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1233 MachineMemOperand *MMO = MF.getMachineMemOperand( 1234 PtrInfo, 1235 MachineMemOperand::MOLoad | 1236 MachineMemOperand::MODereferenceable | 1237 MachineMemOperand::MOInvariant, 1238 4, 1239 MinAlign(64, StructOffset)); 1240 1241 Register LoadResult = MRI.createGenericVirtualRegister(S32); 1242 Register LoadAddr; 1243 1244 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1245 B.buildLoad(LoadResult, LoadAddr, *MMO); 1246 return LoadResult; 1247 } 1248 1249 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1250 MachineInstr &MI, MachineRegisterInfo &MRI, 1251 MachineIRBuilder &B) const { 1252 MachineFunction &MF = B.getMF(); 1253 1254 B.setInstr(MI); 1255 1256 const LLT S32 = LLT::scalar(32); 1257 Register Dst = MI.getOperand(0).getReg(); 1258 Register Src = MI.getOperand(1).getReg(); 1259 1260 LLT DstTy = MRI.getType(Dst); 1261 LLT SrcTy = MRI.getType(Src); 1262 unsigned DestAS = DstTy.getAddressSpace(); 1263 unsigned SrcAS = SrcTy.getAddressSpace(); 1264 1265 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1266 // vector element. 
1267 assert(!DstTy.isVector()); 1268 1269 const AMDGPUTargetMachine &TM 1270 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1271 1272 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1273 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1274 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1275 return true; 1276 } 1277 1278 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1279 // Truncate. 1280 B.buildExtract(Dst, Src, 0); 1281 MI.eraseFromParent(); 1282 return true; 1283 } 1284 1285 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1286 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1287 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1288 1289 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1290 // another. Merge operands are required to be the same type, but creating an 1291 // extra ptrtoint would be kind of pointless. 1292 auto HighAddr = B.buildConstant( 1293 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1294 B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); 1295 MI.eraseFromParent(); 1296 return true; 1297 } 1298 1299 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1300 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1301 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1302 unsigned NullVal = TM.getNullPointerValue(DestAS); 1303 1304 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1305 auto FlatNull = B.buildConstant(SrcTy, 0); 1306 1307 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); 1308 1309 // Extract low 32-bits of the pointer. 1310 B.buildExtract(PtrLo32, Src, 0); 1311 1312 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1313 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); 1314 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1315 1316 MI.eraseFromParent(); 1317 return true; 1318 } 1319 1320 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1321 return false; 1322 1323 if (!ST.hasFlatAddressSpace()) 1324 return false; 1325 1326 auto SegmentNull = 1327 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1328 auto FlatNull = 1329 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1330 1331 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1332 if (!ApertureReg.isValid()) 1333 return false; 1334 1335 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1336 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); 1337 1338 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); 1339 1340 // Coerce the type of the low half of the result so we can use merge_values. 1341 Register SrcAsInt = MRI.createGenericVirtualRegister(S32); 1342 B.buildInstr(TargetOpcode::G_PTRTOINT) 1343 .addDef(SrcAsInt) 1344 .addUse(Src); 1345 1346 // TODO: Should we allow mismatched types but matching sizes in merges to 1347 // avoid the ptrtoint? 
1348 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); 1349 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); 1350 1351 MI.eraseFromParent(); 1352 return true; 1353 } 1354 1355 bool AMDGPULegalizerInfo::legalizeFrint( 1356 MachineInstr &MI, MachineRegisterInfo &MRI, 1357 MachineIRBuilder &B) const { 1358 B.setInstr(MI); 1359 1360 Register Src = MI.getOperand(1).getReg(); 1361 LLT Ty = MRI.getType(Src); 1362 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1363 1364 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1365 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1366 1367 auto C1 = B.buildFConstant(Ty, C1Val); 1368 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1369 1370 // TODO: Should this propagate fast-math-flags? 1371 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1372 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1373 1374 auto C2 = B.buildFConstant(Ty, C2Val); 1375 auto Fabs = B.buildFAbs(Ty, Src); 1376 1377 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1378 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1379 return true; 1380 } 1381 1382 bool AMDGPULegalizerInfo::legalizeFceil( 1383 MachineInstr &MI, MachineRegisterInfo &MRI, 1384 MachineIRBuilder &B) const { 1385 B.setInstr(MI); 1386 1387 const LLT S1 = LLT::scalar(1); 1388 const LLT S64 = LLT::scalar(64); 1389 1390 Register Src = MI.getOperand(1).getReg(); 1391 assert(MRI.getType(Src) == S64); 1392 1393 // result = trunc(src) 1394 // if (src > 0.0 && src != result) 1395 // result += 1.0 1396 1397 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); 1398 1399 const auto Zero = B.buildFConstant(S64, 0.0); 1400 const auto One = B.buildFConstant(S64, 1.0); 1401 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1402 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1403 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1404 auto Add = B.buildSelect(S64, And, One, Zero); 1405 1406 // TODO: Should this propagate fast-math-flags? 1407 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1408 return true; 1409 } 1410 1411 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1412 MachineIRBuilder &B) { 1413 const unsigned FractBits = 52; 1414 const unsigned ExpBits = 11; 1415 LLT S32 = LLT::scalar(32); 1416 1417 auto Const0 = B.buildConstant(S32, FractBits - 32); 1418 auto Const1 = B.buildConstant(S32, ExpBits); 1419 1420 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1421 .addUse(Const0.getReg(0)) 1422 .addUse(Const1.getReg(0)); 1423 1424 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1425 } 1426 1427 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1428 MachineInstr &MI, MachineRegisterInfo &MRI, 1429 MachineIRBuilder &B) const { 1430 B.setInstr(MI); 1431 1432 const LLT S1 = LLT::scalar(1); 1433 const LLT S32 = LLT::scalar(32); 1434 const LLT S64 = LLT::scalar(64); 1435 1436 Register Src = MI.getOperand(1).getReg(); 1437 assert(MRI.getType(Src) == S64); 1438 1439 // TODO: Should this use extract since the low half is unused? 1440 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1441 Register Hi = Unmerge.getReg(1); 1442 1443 // Extract the upper half, since this is where we will find the sign and 1444 // exponent. 1445 auto Exp = extractF64Exponent(Hi, B); 1446 1447 const unsigned FractBits = 52; 1448 1449 // Extract the sign bit. 
1450 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1451 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1452 1453 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1454 1455 const auto Zero32 = B.buildConstant(S32, 0); 1456 1457 // Extend back to 64-bits. 1458 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); 1459 1460 auto Shr = B.buildAShr(S64, FractMask, Exp); 1461 auto Not = B.buildNot(S64, Shr); 1462 auto Tmp0 = B.buildAnd(S64, Src, Not); 1463 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1464 1465 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1466 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1467 1468 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1469 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1470 return true; 1471 } 1472 1473 bool AMDGPULegalizerInfo::legalizeITOFP( 1474 MachineInstr &MI, MachineRegisterInfo &MRI, 1475 MachineIRBuilder &B, bool Signed) const { 1476 B.setInstr(MI); 1477 1478 Register Dst = MI.getOperand(0).getReg(); 1479 Register Src = MI.getOperand(1).getReg(); 1480 1481 const LLT S64 = LLT::scalar(64); 1482 const LLT S32 = LLT::scalar(32); 1483 1484 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1485 1486 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1487 1488 auto CvtHi = Signed ? 1489 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1490 B.buildUITOFP(S64, Unmerge.getReg(1)); 1491 1492 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1493 1494 auto ThirtyTwo = B.buildConstant(S32, 32); 1495 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1496 .addUse(CvtHi.getReg(0)) 1497 .addUse(ThirtyTwo.getReg(0)); 1498 1499 // TODO: Should this propagate fast-math-flags? 1500 B.buildFAdd(Dst, LdExp, CvtLo); 1501 MI.eraseFromParent(); 1502 return true; 1503 } 1504 1505 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1506 MachineInstr &MI, MachineRegisterInfo &MRI, 1507 MachineIRBuilder &B) const { 1508 MachineFunction &MF = B.getMF(); 1509 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1510 1511 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1512 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1513 1514 // With ieee_mode disabled, the instructions have the correct behavior 1515 // already for G_FMINNUM/G_FMAXNUM 1516 if (!MFI->getMode().IEEE) 1517 return !IsIEEEOp; 1518 1519 if (IsIEEEOp) 1520 return true; 1521 1522 MachineIRBuilder HelperBuilder(MI); 1523 GISelObserverWrapper DummyObserver; 1524 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1525 HelperBuilder.setInstr(MI); 1526 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1527 } 1528 1529 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1530 MachineInstr &MI, MachineRegisterInfo &MRI, 1531 MachineIRBuilder &B) const { 1532 // TODO: Should move some of this into LegalizerHelper. 1533 1534 // TODO: Promote dynamic indexing of s16 to s32 1535 // TODO: Dynamic s64 indexing is only legal for SGPR. 1536 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 1537 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1538 return true; 1539 1540 Register Dst = MI.getOperand(0).getReg(); 1541 Register Vec = MI.getOperand(1).getReg(); 1542 1543 LLT VecTy = MRI.getType(Vec); 1544 LLT EltTy = VecTy.getElementType(); 1545 assert(EltTy == MRI.getType(Dst)); 1546 1547 B.setInstr(MI); 1548 1549 if (IdxVal.getValue() < VecTy.getNumElements()) 1550 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); 1551 else 1552 B.buildUndef(Dst); 1553 1554 MI.eraseFromParent(); 1555 return true; 1556 } 1557 1558 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1559 MachineInstr &MI, MachineRegisterInfo &MRI, 1560 MachineIRBuilder &B) const { 1561 // TODO: Should move some of this into LegalizerHelper. 1562 1563 // TODO: Promote dynamic indexing of s16 to s32 1564 // TODO: Dynamic s64 indexing is only legal for SGPR. 1565 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); 1566 if (!IdxVal) // Dynamic case will be selected to register indexing. 1567 return true; 1568 1569 Register Dst = MI.getOperand(0).getReg(); 1570 Register Vec = MI.getOperand(1).getReg(); 1571 Register Ins = MI.getOperand(2).getReg(); 1572 1573 LLT VecTy = MRI.getType(Vec); 1574 LLT EltTy = VecTy.getElementType(); 1575 assert(EltTy == MRI.getType(Ins)); 1576 1577 B.setInstr(MI); 1578 1579 if (IdxVal.getValue() < VecTy.getNumElements()) 1580 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); 1581 else 1582 B.buildUndef(Dst); 1583 1584 MI.eraseFromParent(); 1585 return true; 1586 } 1587 1588 bool AMDGPULegalizerInfo::legalizeSinCos( 1589 MachineInstr &MI, MachineRegisterInfo &MRI, 1590 MachineIRBuilder &B) const { 1591 B.setInstr(MI); 1592 1593 Register DstReg = MI.getOperand(0).getReg(); 1594 Register SrcReg = MI.getOperand(1).getReg(); 1595 LLT Ty = MRI.getType(DstReg); 1596 unsigned Flags = MI.getFlags(); 1597 1598 Register TrigVal; 1599 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1600 if (ST.hasTrigReducedRange()) { 1601 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1602 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1603 .addUse(MulVal.getReg(0)) 1604 .setMIFlags(Flags).getReg(0); 1605 } else 1606 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1607 1608 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1609 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1610 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1611 .addUse(TrigVal) 1612 .setMIFlags(Flags); 1613 MI.eraseFromParent(); 1614 return true; 1615 } 1616 1617 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1618 Register DstReg, LLT PtrTy, 1619 MachineIRBuilder &B, const GlobalValue *GV, 1620 unsigned Offset, unsigned GAFlags) const { 1621 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1622 // to the following code sequence: 1623 // 1624 // For constant address space: 1625 // s_getpc_b64 s[0:1] 1626 // s_add_u32 s0, s0, $symbol 1627 // s_addc_u32 s1, s1, 0 1628 // 1629 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1630 // a fixup or relocation is emitted to replace $symbol with a literal 1631 // constant, which is a pc-relative offset from the encoding of the $symbol 1632 // operand to the global variable. 
1633 // 1634 // For global address space: 1635 // s_getpc_b64 s[0:1] 1636 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1637 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1638 // 1639 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1640 // fixups or relocations are emitted to replace $symbol@*@lo and 1641 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1642 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1643 // operand to the global variable. 1644 // 1645 // What we want here is an offset from the value returned by s_getpc 1646 // (which is the address of the s_add_u32 instruction) to the global 1647 // variable, but since the encoding of $symbol starts 4 bytes after the start 1648 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1649 // small. This requires us to add 4 to the global variable offset in order to 1650 // compute the correct address. 1651 1652 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1653 1654 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1655 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1656 1657 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1658 .addDef(PCReg); 1659 1660 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1661 if (GAFlags == SIInstrInfo::MO_NONE) 1662 MIB.addImm(0); 1663 else 1664 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1665 1666 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1667 1668 if (PtrTy.getSizeInBits() == 32) 1669 B.buildExtract(DstReg, PCReg, 0); 1670 return true; 1671 } 1672 1673 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1674 MachineInstr &MI, MachineRegisterInfo &MRI, 1675 MachineIRBuilder &B) const { 1676 Register DstReg = MI.getOperand(0).getReg(); 1677 LLT Ty = MRI.getType(DstReg); 1678 unsigned AS = Ty.getAddressSpace(); 1679 1680 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1681 MachineFunction &MF = B.getMF(); 1682 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1683 B.setInstr(MI); 1684 1685 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1686 if (!MFI->isEntryFunction()) { 1687 const Function &Fn = MF.getFunction(); 1688 DiagnosticInfoUnsupported BadLDSDecl( 1689 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1690 Fn.getContext().diagnose(BadLDSDecl); 1691 } 1692 1693 // TODO: We could emit code to handle the initialization somewhere. 
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
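  // The hardware mad instructions flush denormals, so G_FMAD is kept legal
  // only when the function's mode flushes them for this type anyway;
  // otherwise fall through and lower it via LegalizerHelper::lowerFMad below.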
  if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(SITargetLowering::isFlatGlobalAddrSpace(
           MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

  B.setInstr(MI);
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Return the branch instruction that uses the condition, or null if the usage
// is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
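  // If the live-in virtual register has no definition yet, emit the copy from
  // the physical argument register at the top of the entry block so every
  // later use sees a dominating def.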
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT ResTy = MRI.getType(Res);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  const MachineFunction &MF = B.getMF();
  bool Unsafe =
    MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);

  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
    return false;

  if (!Unsafe && ResTy == S32 &&
      MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  if (Unsafe) {
    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
      .addUse(RHS)
      .setMIFlags(Flags);
    B.buildFMul(Res, LHS, RCP, Flags);

    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))
    .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Enable or disable FP32 denormal handling: emit the mode-register update that
// turns single-precision denormals on when 'Enable' is true and flushes them
// when it is false.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
                                   ? FP_DENORM_FLUSH_NONE
                                   : FP_DENORM_FLUSH_IN_FLUSH_OUT;

    unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}

bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(RHS)
      .addUse(LHS)
      .addImm(1)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
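  // The FMA chain below iteratively refines the approximate reciprocal
  // (Newton-Raphson style). If the function's default mode flushes FP32
  // denormals, temporarily enable them around the refinement so intermediate
  // values are not flushed, then restore the original mode.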
  if (!Mode.FP32Denormals)
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.FP32Denormals)
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
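    // Recompute the missing condition bit manually: compare the high halves
    // of the operands against the corresponding div_scale results to see
    // which input was rescaled, then combine the two checks with an xor.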

    Scale = MRI.createGenericVirtualRegister(S1);

    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    B.buildXor(Scale, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
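  // A flat pointer lies in the queried segment exactly when its high 32 bits
  // equal that segment's aperture base, so compare the pointer's high half
  // against the aperture register.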
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

/// Handle the register layout difference for f16 images on some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where it is unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fix up illegal register types for i8 and i16 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // For the control-flow intrinsics below, replace the use of G_BRCOND with
  // the exec-manipulating branch pseudos.
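  // Everything else handled here is either expanded from a preloaded argument
  // register or rewritten to a target pseudo; intrinsics not listed in the
  // switch are left untouched by the default case.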
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrCond->getOperand(1).getMBB());
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrCond->getOperand(1).getMBB())
          .addImm(0);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
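    // The wavefront size is a compile-time subtarget property (32 or 64), so
    // the intrinsic folds to an immediate constant.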
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
  default:
    return true;
  }

  return true;
}