//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
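// For example, <3 x s8> (24 bits) is widened to <4 x s8> (32 bits), and
// <5 x s8> (40 bits) to <8 x s8> (64 bits).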
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
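  // Wide vectors are broken down into at most 64-bit pieces, and small odd
  // vectors are padded with an extra element so the pieces map onto whole
  // 32-bit registers.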
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8)});

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});


  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S32, LLT::scalar(24)}})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // TODO: Split s1->s64 during regbankselect for VALU.
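  // The s64 -> s64 case is custom lowered: the two 32-bit halves of the source
  // are converted separately and recombined (see legalizeITOFP below).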
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
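  // The result (type index 0) is therefore clamped to s32, while s64 sources
  // (type index 1) stay legal.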
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical.
    // SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
    .lower();

  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
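  // This mirrors the G_ICMP rules above: an s32 condition corresponds to a
  // result carried on SCC, an s1 condition to VCC.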
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    // TODO: Support 16-bit shift amounts
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128,
        // whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
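// Returns a 32-bit register containing the aperture base (the high half of the
// 64-bit segment address) for the local or private address space, read either
// from the aperture hardware registers or from the queue pointer.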
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
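  // Vector casts are scalarized by the legalization rules before reaching
  // this custom lowering.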
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
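    // An LDS global without an initializer is simply lowered to the offset it
    // was assigned within the kernel's LDS allocation.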
1695 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1696 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1697 MI.eraseFromParent();
1698 return true;
1699 }
1700
1701 const Function &Fn = MF.getFunction();
1702 DiagnosticInfoUnsupported BadInit(
1703 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1704 Fn.getContext().diagnose(BadInit);
1705 return true;
1706 }
1707
1708 const SITargetLowering *TLI = ST.getTargetLowering();
1709
1710 if (TLI->shouldEmitFixup(GV)) {
1711 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1712 MI.eraseFromParent();
1713 return true;
1714 }
1715
1716 if (TLI->shouldEmitPCReloc(GV)) {
1717 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1718 MI.eraseFromParent();
1719 return true;
1720 }
1721
1722 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1723 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1724
1725 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1726 MachinePointerInfo::getGOT(MF),
1727 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1728 MachineMemOperand::MOInvariant,
1729 8 /*Size*/, 8 /*Align*/);
1730
1731 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1732
1733 if (Ty.getSizeInBits() == 32) {
1734 // Truncate if this is a 32-bit constant address.
1735 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1736 B.buildExtract(DstReg, Load, 0);
1737 } else
1738 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1739
1740 MI.eraseFromParent();
1741 return true;
1742 }
1743
1744 bool AMDGPULegalizerInfo::legalizeLoad(
1745 MachineInstr &MI, MachineRegisterInfo &MRI,
1746 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1747 B.setInstr(MI);
1748 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1749 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1750 Observer.changingInstr(MI);
1751 MI.getOperand(1).setReg(Cast.getReg(0));
1752 Observer.changedInstr(MI);
1753 return true;
1754 }
1755
1756 bool AMDGPULegalizerInfo::legalizeFMad(
1757 MachineInstr &MI, MachineRegisterInfo &MRI,
1758 MachineIRBuilder &B) const {
1759 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1760 assert(Ty.isScalar());
1761
1762 MachineFunction &MF = B.getMF();
1763 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1764
1765 // TODO: Always legal with future ftz flag.
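  // G_FMAD is kept legal only when denormals are flushed for the result type;
  // otherwise it is expanded to fmul + fadd through
  // LegalizerHelper::lowerFMad below.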
1766 if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1767 return true;
1768 if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
1769 return true;
1770
1771
1772 MachineIRBuilder HelperBuilder(MI);
1773 GISelObserverWrapper DummyObserver;
1774 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1775 HelperBuilder.setMBB(*MI.getParent());
1776 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1777 }
1778
1779 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1780 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1781 Register DstReg = MI.getOperand(0).getReg();
1782 Register PtrReg = MI.getOperand(1).getReg();
1783 Register CmpVal = MI.getOperand(2).getReg();
1784 Register NewVal = MI.getOperand(3).getReg();
1785
1786 assert(SITargetLowering::isFlatGlobalAddrSpace(
1787 MRI.getType(PtrReg).getAddressSpace()) &&
1788 "this should not have been custom lowered");
1789
1790 LLT ValTy = MRI.getType(CmpVal);
1791 LLT VecTy = LLT::vector(2, ValTy);
1792
1793 B.setInstr(MI);
1794 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1795
1796 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1797 .addDef(DstReg)
1798 .addUse(PtrReg)
1799 .addUse(PackedVal)
1800 .setMemRefs(MI.memoperands());
1801
1802 MI.eraseFromParent();
1803 return true;
1804 }
1805
1806 // Return the use branch instruction, otherwise null if the usage is invalid.
1807 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1808 MachineRegisterInfo &MRI) {
1809 Register CondDef = MI.getOperand(0).getReg();
1810 if (!MRI.hasOneNonDBGUse(CondDef))
1811 return nullptr;
1812
1813 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1814 return UseMI.getParent() == MI.getParent() &&
1815 UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1816 }
1817
1818 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1819 Register Reg, LLT Ty) const {
1820 Register LiveIn = MRI.getLiveInVirtReg(Reg);
1821 if (LiveIn)
1822 return LiveIn;
1823
1824 Register NewReg = MRI.createGenericVirtualRegister(Ty);
1825 MRI.addLiveIn(Reg, NewReg);
1826 return NewReg;
1827 }
1828
1829 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1830 const ArgDescriptor *Arg) const {
1831 if (!Arg->isRegister() || !Arg->getRegister().isValid())
1832 return false; // TODO: Handle these
1833
1834 assert(Arg->getRegister().isPhysical());
1835
1836 MachineRegisterInfo &MRI = *B.getMRI();
1837
1838 LLT Ty = MRI.getType(DstReg);
1839 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1840
1841 if (Arg->isMasked()) {
1842 // TODO: Should we try to emit this once in the entry block?
1843 const LLT S32 = LLT::scalar(32);
1844 const unsigned Mask = Arg->getMask();
1845 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1846
1847 Register AndMaskSrc = LiveIn;
1848
1849 if (Shift != 0) {
1850 auto ShiftAmt = B.buildConstant(S32, Shift);
1851 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1852 }
1853
1854 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1855 } else
1856 B.buildCopy(DstReg, LiveIn);
1857
1858 // Insert the argument copy if it doesn't already exist.
1859 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
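  // If the live-in virtual register has no def yet, materialize the copy from
  // the physical argument register at the top of the entry block, then
  // restore the original insertion point.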
1860 if (!MRI.getVRegDef(LiveIn)) { 1861 // FIXME: Should have scoped insert pt 1862 MachineBasicBlock &OrigInsBB = B.getMBB(); 1863 auto OrigInsPt = B.getInsertPt(); 1864 1865 MachineBasicBlock &EntryMBB = B.getMF().front(); 1866 EntryMBB.addLiveIn(Arg->getRegister()); 1867 B.setInsertPt(EntryMBB, EntryMBB.begin()); 1868 B.buildCopy(LiveIn, Arg->getRegister()); 1869 1870 B.setInsertPt(OrigInsBB, OrigInsPt); 1871 } 1872 1873 return true; 1874 } 1875 1876 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 1877 MachineInstr &MI, 1878 MachineRegisterInfo &MRI, 1879 MachineIRBuilder &B, 1880 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 1881 B.setInstr(MI); 1882 1883 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 1884 1885 const ArgDescriptor *Arg; 1886 const TargetRegisterClass *RC; 1887 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 1888 if (!Arg) { 1889 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 1890 return false; 1891 } 1892 1893 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 1894 MI.eraseFromParent(); 1895 return true; 1896 } 1897 1898 return false; 1899 } 1900 1901 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 1902 MachineRegisterInfo &MRI, 1903 MachineIRBuilder &B) const { 1904 B.setInstr(MI); 1905 Register Dst = MI.getOperand(0).getReg(); 1906 LLT DstTy = MRI.getType(Dst); 1907 LLT S16 = LLT::scalar(16); 1908 LLT S32 = LLT::scalar(32); 1909 LLT S64 = LLT::scalar(64); 1910 1911 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 1912 return true; 1913 1914 if (DstTy == S16) 1915 return legalizeFDIV16(MI, MRI, B); 1916 if (DstTy == S32) 1917 return legalizeFDIV32(MI, MRI, B); 1918 if (DstTy == S64) 1919 return legalizeFDIV64(MI, MRI, B); 1920 1921 return false; 1922 } 1923 1924 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 1925 MachineRegisterInfo &MRI, 1926 MachineIRBuilder &B) const { 1927 Register Res = MI.getOperand(0).getReg(); 1928 Register LHS = MI.getOperand(1).getReg(); 1929 Register RHS = MI.getOperand(2).getReg(); 1930 1931 uint16_t Flags = MI.getFlags(); 1932 1933 LLT ResTy = MRI.getType(Res); 1934 LLT S32 = LLT::scalar(32); 1935 LLT S64 = LLT::scalar(64); 1936 1937 const MachineFunction &MF = B.getMF(); 1938 bool Unsafe = 1939 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 1940 1941 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 1942 return false; 1943 1944 if (!Unsafe && ResTy == S32 && 1945 MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals) 1946 return false; 1947 1948 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 1949 // 1 / x -> RCP(x) 1950 if (CLHS->isExactlyValue(1.0)) { 1951 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 1952 .addUse(RHS) 1953 .setMIFlags(Flags); 1954 1955 MI.eraseFromParent(); 1956 return true; 1957 } 1958 1959 // -1 / x -> RCP( FNEG(x) ) 1960 if (CLHS->isExactlyValue(-1.0)) { 1961 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 1962 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 1963 .addUse(FNeg.getReg(0)) 1964 .setMIFlags(Flags); 1965 1966 MI.eraseFromParent(); 1967 return true; 1968 } 1969 } 1970 1971 // x / y -> x * (1.0 / y) 1972 if (Unsafe) { 1973 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 1974 .addUse(RHS) 1975 .setMIFlags(Flags); 1976 B.buildFMul(Res, LHS, RCP, Flags); 1977 1978 MI.eraseFromParent(); 1979 return true; 1980 } 1981 1982 return false; 1983 } 1984 1985 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 1986 MachineRegisterInfo &MRI, 1987 
MachineIRBuilder &B) const {
1988 B.setInstr(MI);
1989 Register Res = MI.getOperand(0).getReg();
1990 Register LHS = MI.getOperand(1).getReg();
1991 Register RHS = MI.getOperand(2).getReg();
1992
1993 uint16_t Flags = MI.getFlags();
1994
1995 LLT S16 = LLT::scalar(16);
1996 LLT S32 = LLT::scalar(32);
1997
1998 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
1999 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2000
2001 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2002 .addUse(RHSExt.getReg(0))
2003 .setMIFlags(Flags);
2004
2005 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2006 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2007
2008 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2009 .addUse(RDst.getReg(0))
2010 .addUse(RHS)
2011 .addUse(LHS)
2012 .setMIFlags(Flags);
2013
2014 MI.eraseFromParent();
2015 return true;
2016 }
2017
2018 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2019 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2020 static void toggleSPDenormMode(bool Enable,
2021 MachineIRBuilder &B,
2022 const GCNSubtarget &ST,
2023 AMDGPU::SIModeRegisterDefaults Mode) {
2024 // Set SP denorm mode to this value.
2025 unsigned SPDenormMode =
2026 Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2027
2028 if (ST.hasDenormModeInst()) {
2029 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2030 unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2031 ? FP_DENORM_FLUSH_NONE
2032 : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2033
2034 unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2035 B.buildInstr(AMDGPU::S_DENORM_MODE)
2036 .addImm(NewDenormModeValue);
2037
2038 } else {
2039 // Select FP32 bit field in mode register.
2040 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2041 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2042 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2043
2044 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2045 .addImm(SPDenormMode)
2046 .addImm(SPDenormModeBitField);
2047 }
2048 }
2049
2050 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2051 MachineRegisterInfo &MRI,
2052 MachineIRBuilder &B) const {
2053 B.setInstr(MI);
2054 Register Res = MI.getOperand(0).getReg();
2055 Register LHS = MI.getOperand(1).getReg();
2056 Register RHS = MI.getOperand(2).getReg();
2057 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2058 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2059
2060 uint16_t Flags = MI.getFlags();
2061
2062 LLT S32 = LLT::scalar(32);
2063 LLT S1 = LLT::scalar(1);
2064
2065 auto One = B.buildFConstant(S32, 1.0f);
2066
2067 auto DenominatorScaled =
2068 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2069 .addUse(RHS)
2070 .addUse(LHS)
2071 .addImm(1)
2072 .setMIFlags(Flags);
2073 auto NumeratorScaled =
2074 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2075 .addUse(LHS)
2076 .addUse(RHS)
2077 .addImm(0)
2078 .setMIFlags(Flags);
2079
2080 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2081 .addUse(DenominatorScaled.getReg(0))
2082 .setMIFlags(Flags);
2083 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2084
2085 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2086 // aren't modeled as reading it.
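  // If the function's default mode flushes FP32 denormals, temporarily enable
  // them around the FMA refinement sequence below; toggleSPDenormMode emits
  // either S_DENORM_MODE or an S_SETREG of the MODE register depending on the
  // subtarget.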
2087 if (!Mode.FP32Denormals)
2088 toggleSPDenormMode(true, B, ST, Mode);
2089
2090 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2091 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2092 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2093 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2094 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2095 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2096
2097 if (!Mode.FP32Denormals)
2098 toggleSPDenormMode(false, B, ST, Mode);
2099
2100 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2101 .addUse(Fma4.getReg(0))
2102 .addUse(Fma1.getReg(0))
2103 .addUse(Fma3.getReg(0))
2104 .addUse(NumeratorScaled.getReg(1))
2105 .setMIFlags(Flags);
2106
2107 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2108 .addUse(Fmas.getReg(0))
2109 .addUse(RHS)
2110 .addUse(LHS)
2111 .setMIFlags(Flags);
2112
2113 MI.eraseFromParent();
2114 return true;
2115 }
2116
2117 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2118 MachineRegisterInfo &MRI,
2119 MachineIRBuilder &B) const {
2120 B.setInstr(MI);
2121 Register Res = MI.getOperand(0).getReg();
2122 Register LHS = MI.getOperand(1).getReg();
2123 Register RHS = MI.getOperand(2).getReg();
2124
2125 uint16_t Flags = MI.getFlags();
2126
2127 LLT S64 = LLT::scalar(64);
2128 LLT S1 = LLT::scalar(1);
2129
2130 auto One = B.buildFConstant(S64, 1.0);
2131
2132 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2133 .addUse(LHS)
2134 .addUse(RHS)
2135 .addImm(1)
2136 .setMIFlags(Flags);
2137
2138 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2139
2140 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2141 .addUse(DivScale0.getReg(0))
2142 .setMIFlags(Flags);
2143
2144 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2145 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2146 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2147
2148 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2149 .addUse(LHS)
2150 .addUse(RHS)
2151 .addImm(0)
2152 .setMIFlags(Flags);
2153
2154 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2155 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2156 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2157
2158 Register Scale;
2159 if (!ST.hasUsableDivScaleConditionOutput()) {
2160 // Workaround a hardware bug on SI where the condition output from div_scale
2161 // is not usable.
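    // Recompute the condition manually: compare the high 32 bits of the
    // numerator and denominator against the high halves of the div_scale
    // results, then XOR the two comparisons to recover which operand was
    // scaled.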
2162 2163 Scale = MRI.createGenericVirtualRegister(S1); 2164 2165 LLT S32 = LLT::scalar(32); 2166 2167 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2168 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2169 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2170 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2171 2172 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2173 Scale1Unmerge.getReg(1)); 2174 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2175 Scale0Unmerge.getReg(1)); 2176 B.buildXor(Scale, CmpNum, CmpDen); 2177 } else { 2178 Scale = DivScale1.getReg(1); 2179 } 2180 2181 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2182 .addUse(Fma4.getReg(0)) 2183 .addUse(Fma3.getReg(0)) 2184 .addUse(Mul.getReg(0)) 2185 .addUse(Scale) 2186 .setMIFlags(Flags); 2187 2188 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2189 .addUse(Fmas.getReg(0)) 2190 .addUse(RHS) 2191 .addUse(LHS) 2192 .setMIFlags(Flags); 2193 2194 MI.eraseFromParent(); 2195 return true; 2196 } 2197 2198 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2199 MachineRegisterInfo &MRI, 2200 MachineIRBuilder &B) const { 2201 B.setInstr(MI); 2202 Register Res = MI.getOperand(0).getReg(); 2203 Register LHS = MI.getOperand(2).getReg(); 2204 Register RHS = MI.getOperand(3).getReg(); 2205 uint16_t Flags = MI.getFlags(); 2206 2207 LLT S32 = LLT::scalar(32); 2208 LLT S1 = LLT::scalar(1); 2209 2210 auto Abs = B.buildFAbs(S32, RHS, Flags); 2211 const APFloat C0Val(1.0f); 2212 2213 auto C0 = B.buildConstant(S32, 0x6f800000); 2214 auto C1 = B.buildConstant(S32, 0x2f800000); 2215 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2216 2217 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2218 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2219 2220 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2221 2222 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2223 .addUse(Mul0.getReg(0)) 2224 .setMIFlags(Flags); 2225 2226 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2227 2228 B.buildFMul(Res, Sel, Mul1, Flags); 2229 2230 MI.eraseFromParent(); 2231 return true; 2232 } 2233 2234 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2235 MachineRegisterInfo &MRI, 2236 MachineIRBuilder &B) const { 2237 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2238 if (!MFI->isEntryFunction()) { 2239 return legalizePreloadedArgIntrin(MI, MRI, B, 2240 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2241 } 2242 2243 B.setInstr(MI); 2244 2245 uint64_t Offset = 2246 ST.getTargetLowering()->getImplicitParameterOffset( 2247 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2248 Register DstReg = MI.getOperand(0).getReg(); 2249 LLT DstTy = MRI.getType(DstReg); 2250 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2251 2252 const ArgDescriptor *Arg; 2253 const TargetRegisterClass *RC; 2254 std::tie(Arg, RC) 2255 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2256 if (!Arg) 2257 return false; 2258 2259 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2260 if (!loadInputValue(KernargPtrReg, B, Arg)) 2261 return false; 2262 2263 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2264 MI.eraseFromParent(); 2265 return true; 2266 } 2267 2268 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2269 MachineRegisterInfo &MRI, 2270 MachineIRBuilder &B, 2271 unsigned AddrSpace) const { 2272 B.setInstr(MI); 2273 
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 2274 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2275 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2276 MI.eraseFromParent(); 2277 return true; 2278 } 2279 2280 /// Handle register layout difference for f16 images for some subtargets. 2281 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2282 MachineRegisterInfo &MRI, 2283 Register Reg) const { 2284 if (!ST.hasUnpackedD16VMem()) 2285 return Reg; 2286 2287 const LLT S16 = LLT::scalar(16); 2288 const LLT S32 = LLT::scalar(32); 2289 LLT StoreVT = MRI.getType(Reg); 2290 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2291 2292 auto Unmerge = B.buildUnmerge(S16, Reg); 2293 2294 SmallVector<Register, 4> WideRegs; 2295 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2296 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2297 2298 int NumElts = StoreVT.getNumElements(); 2299 2300 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2301 } 2302 2303 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI, 2304 MachineRegisterInfo &MRI, 2305 MachineIRBuilder &B, 2306 bool IsFormat) const { 2307 // TODO: Reject f16 format on targets where unsupported. 2308 Register VData = MI.getOperand(1).getReg(); 2309 LLT Ty = MRI.getType(VData); 2310 2311 B.setInstr(MI); 2312 2313 const LLT S32 = LLT::scalar(32); 2314 const LLT S16 = LLT::scalar(16); 2315 2316 // Fixup illegal register types for i8 stores. 2317 if (Ty == LLT::scalar(8) || Ty == S16) { 2318 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2319 MI.getOperand(1).setReg(AnyExt); 2320 return true; 2321 } 2322 2323 if (Ty.isVector()) { 2324 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2325 if (IsFormat) 2326 MI.getOperand(1).setReg(handleD16VData(B, MRI, VData)); 2327 return true; 2328 } 2329 2330 return Ty.getElementType() == S32 && Ty.getNumElements() <= 4; 2331 } 2332 2333 return Ty == S32; 2334 } 2335 2336 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 2337 MachineRegisterInfo &MRI, 2338 MachineIRBuilder &B) const { 2339 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
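  // The amdgcn.if/else/loop intrinsics are only rewritten when their boolean
  // result feeds a single G_BRCOND in the same block (checked by
  // verifyCFIntrinsic); the pair is then replaced with SI_IF / SI_ELSE /
  // SI_LOOP operating on the wave mask register class.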
2340 auto IntrID = MI.getIntrinsicID(); 2341 switch (IntrID) { 2342 case Intrinsic::amdgcn_if: 2343 case Intrinsic::amdgcn_else: { 2344 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { 2345 const SIRegisterInfo *TRI 2346 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2347 2348 B.setInstr(*BrCond); 2349 Register Def = MI.getOperand(1).getReg(); 2350 Register Use = MI.getOperand(3).getReg(); 2351 2352 if (IntrID == Intrinsic::amdgcn_if) { 2353 B.buildInstr(AMDGPU::SI_IF) 2354 .addDef(Def) 2355 .addUse(Use) 2356 .addMBB(BrCond->getOperand(1).getMBB()); 2357 } else { 2358 B.buildInstr(AMDGPU::SI_ELSE) 2359 .addDef(Def) 2360 .addUse(Use) 2361 .addMBB(BrCond->getOperand(1).getMBB()) 2362 .addImm(0); 2363 } 2364 2365 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 2366 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 2367 MI.eraseFromParent(); 2368 BrCond->eraseFromParent(); 2369 return true; 2370 } 2371 2372 return false; 2373 } 2374 case Intrinsic::amdgcn_loop: { 2375 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { 2376 const SIRegisterInfo *TRI 2377 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2378 2379 B.setInstr(*BrCond); 2380 Register Reg = MI.getOperand(2).getReg(); 2381 B.buildInstr(AMDGPU::SI_LOOP) 2382 .addUse(Reg) 2383 .addMBB(BrCond->getOperand(1).getMBB()); 2384 MI.eraseFromParent(); 2385 BrCond->eraseFromParent(); 2386 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 2387 return true; 2388 } 2389 2390 return false; 2391 } 2392 case Intrinsic::amdgcn_kernarg_segment_ptr: 2393 return legalizePreloadedArgIntrin( 2394 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2395 case Intrinsic::amdgcn_implicitarg_ptr: 2396 return legalizeImplicitArgPtr(MI, MRI, B); 2397 case Intrinsic::amdgcn_workitem_id_x: 2398 return legalizePreloadedArgIntrin(MI, MRI, B, 2399 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 2400 case Intrinsic::amdgcn_workitem_id_y: 2401 return legalizePreloadedArgIntrin(MI, MRI, B, 2402 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 2403 case Intrinsic::amdgcn_workitem_id_z: 2404 return legalizePreloadedArgIntrin(MI, MRI, B, 2405 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 2406 case Intrinsic::amdgcn_workgroup_id_x: 2407 return legalizePreloadedArgIntrin(MI, MRI, B, 2408 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 2409 case Intrinsic::amdgcn_workgroup_id_y: 2410 return legalizePreloadedArgIntrin(MI, MRI, B, 2411 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 2412 case Intrinsic::amdgcn_workgroup_id_z: 2413 return legalizePreloadedArgIntrin(MI, MRI, B, 2414 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 2415 case Intrinsic::amdgcn_dispatch_ptr: 2416 return legalizePreloadedArgIntrin(MI, MRI, B, 2417 AMDGPUFunctionArgInfo::DISPATCH_PTR); 2418 case Intrinsic::amdgcn_queue_ptr: 2419 return legalizePreloadedArgIntrin(MI, MRI, B, 2420 AMDGPUFunctionArgInfo::QUEUE_PTR); 2421 case Intrinsic::amdgcn_implicit_buffer_ptr: 2422 return legalizePreloadedArgIntrin( 2423 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 2424 case Intrinsic::amdgcn_dispatch_id: 2425 return legalizePreloadedArgIntrin(MI, MRI, B, 2426 AMDGPUFunctionArgInfo::DISPATCH_ID); 2427 case Intrinsic::amdgcn_fdiv_fast: 2428 return legalizeFDIVFastIntrin(MI, MRI, B); 2429 case Intrinsic::amdgcn_is_shared: 2430 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 2431 case Intrinsic::amdgcn_is_private: 2432 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 2433 case Intrinsic::amdgcn_wavefrontsize: { 2434 B.setInstr(MI); 2435 
B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 2436 MI.eraseFromParent(); 2437 return true; 2438 } 2439 case Intrinsic::amdgcn_raw_buffer_store: 2440 return legalizeRawBufferStore(MI, MRI, B, false); 2441 case Intrinsic::amdgcn_raw_buffer_store_format: 2442 return legalizeRawBufferStore(MI, MRI, B, true); 2443 default: 2444 return true; 2445 } 2446 2447 return true; 2448 } 2449