//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 =
      LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinLegalScalarShiftTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), })
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ?
                    S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S32, LLT::scalar(24)}})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
    .scalarize(0)
    .alwaysLegal();

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0,
                              LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
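    // Each entry in the list below is {value type, pointer type, memory size
    // in bits, minimum alignment in bits}.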
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
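  // Pointer-typed results are also accepted with either condition type via the
  // trailing legalIf below.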
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    // TODO: Support 16-bit shift amounts
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // TODO: Make legal for s32, s64. s64 case needs break down in regbankselect.
  getActionDefinitionsBuilder(G_SEXT_INREG)
    .clampScalar(0, MinLegalScalarShiftTy, S64)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case
      TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
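  // Emit a 4-byte dereferenceable, invariant load of the aperture base from
  // the queue pointer at the offset computed above.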
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
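    // A null flat pointer must map to the segment's null value, so compare the
    // source against flat null and select the result below.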
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
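  // Add the selected adjustment (1.0 or 0.0) to the truncated value.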
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
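  // result = convert(hi) * 2^32 + convert(lo); the 2^32 scaling of the high
  // half is done with ldexp, and the low half is always converted unsigned.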
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ?
DstReg :
1651     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1652 
1653   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1654     .addDef(PCReg);
1655 
1656   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1657   if (GAFlags == SIInstrInfo::MO_NONE)
1658     MIB.addImm(0);
1659   else
1660     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1661 
1662   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1663 
1664   if (PtrTy.getSizeInBits() == 32)
1665     B.buildExtract(DstReg, PCReg, 0);
1666   return true;
1667 }
1668 
1669 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1670   MachineInstr &MI, MachineRegisterInfo &MRI,
1671   MachineIRBuilder &B) const {
1672   Register DstReg = MI.getOperand(0).getReg();
1673   LLT Ty = MRI.getType(DstReg);
1674   unsigned AS = Ty.getAddressSpace();
1675 
1676   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1677   MachineFunction &MF = B.getMF();
1678   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1679   B.setInstr(MI);
1680 
1681   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1682     if (!MFI->isEntryFunction()) {
1683       const Function &Fn = MF.getFunction();
1684       DiagnosticInfoUnsupported BadLDSDecl(
1685         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1686       Fn.getContext().diagnose(BadLDSDecl);
1687     }
1688 
1689     // TODO: We could emit code to handle the initialization somewhere.
1690     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1691       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1692       MI.eraseFromParent();
1693       return true;
1694     }
1695 
1696     const Function &Fn = MF.getFunction();
1697     DiagnosticInfoUnsupported BadInit(
1698       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1699     Fn.getContext().diagnose(BadInit);
1700     return true;
1701   }
1702 
1703   const SITargetLowering *TLI = ST.getTargetLowering();
1704 
1705   if (TLI->shouldEmitFixup(GV)) {
1706     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1707     MI.eraseFromParent();
1708     return true;
1709   }
1710 
1711   if (TLI->shouldEmitPCReloc(GV)) {
1712     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1713     MI.eraseFromParent();
1714     return true;
1715   }
1716 
1717   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1718   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1719 
1720   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1721     MachinePointerInfo::getGOT(MF),
1722     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1723     MachineMemOperand::MOInvariant,
1724     8 /*Size*/, 8 /*Align*/);
1725 
1726   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1727 
1728   if (Ty.getSizeInBits() == 32) {
1729     // Truncate if this is a 32-bit constant address.
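    // The GOT entry is always a 64-bit pointer in the constant address space,
    // so load the full entry and take the low 32 bits of the result.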
1730 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1731 B.buildExtract(DstReg, Load, 0); 1732 } else 1733 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1734 1735 MI.eraseFromParent(); 1736 return true; 1737 } 1738 1739 bool AMDGPULegalizerInfo::legalizeLoad( 1740 MachineInstr &MI, MachineRegisterInfo &MRI, 1741 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 1742 B.setInstr(MI); 1743 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1744 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 1745 Observer.changingInstr(MI); 1746 MI.getOperand(1).setReg(Cast.getReg(0)); 1747 Observer.changedInstr(MI); 1748 return true; 1749 } 1750 1751 bool AMDGPULegalizerInfo::legalizeFMad( 1752 MachineInstr &MI, MachineRegisterInfo &MRI, 1753 MachineIRBuilder &B) const { 1754 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 1755 assert(Ty.isScalar()); 1756 1757 MachineFunction &MF = B.getMF(); 1758 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1759 1760 // TODO: Always legal with future ftz flag. 1761 if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals) 1762 return true; 1763 if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals) 1764 return true; 1765 1766 1767 MachineIRBuilder HelperBuilder(MI); 1768 GISelObserverWrapper DummyObserver; 1769 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1770 HelperBuilder.setMBB(*MI.getParent()); 1771 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 1772 } 1773 1774 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 1775 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 1776 Register DstReg = MI.getOperand(0).getReg(); 1777 Register PtrReg = MI.getOperand(1).getReg(); 1778 Register CmpVal = MI.getOperand(2).getReg(); 1779 Register NewVal = MI.getOperand(3).getReg(); 1780 1781 assert(SITargetLowering::isFlatGlobalAddrSpace( 1782 MRI.getType(PtrReg).getAddressSpace()) && 1783 "this should not have been custom lowered"); 1784 1785 LLT ValTy = MRI.getType(CmpVal); 1786 LLT VecTy = LLT::vector(2, ValTy); 1787 1788 B.setInstr(MI); 1789 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 1790 1791 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 1792 .addDef(DstReg) 1793 .addUse(PtrReg) 1794 .addUse(PackedVal) 1795 .setMemRefs(MI.memoperands()); 1796 1797 MI.eraseFromParent(); 1798 return true; 1799 } 1800 1801 // Return the use branch instruction, otherwise null if the usage is invalid. 
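// The intrinsic's condition def must have a single non-debug use, which must
// be a G_BRCOND in the same block; if that G_BRCOND is followed by an
// unconditional G_BR, Br is set to point at it.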
1802 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1803                                        MachineRegisterInfo &MRI,
1804                                        MachineInstr *&Br) {
1805   Register CondDef = MI.getOperand(0).getReg();
1806   if (!MRI.hasOneNonDBGUse(CondDef))
1807     return nullptr;
1808 
1809   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1810   if (UseMI.getParent() != MI.getParent() ||
1811       UseMI.getOpcode() != AMDGPU::G_BRCOND)
1812     return nullptr;
1813 
1814   // Make sure the cond br is followed by a G_BR
1815   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
1816   if (Next != MI.getParent()->end()) {
1817     if (Next->getOpcode() != AMDGPU::G_BR)
1818       return nullptr;
1819     Br = &*Next;
1820   }
1821 
1822   return &UseMI;
1823 }
1824 
1825 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1826                                                 Register Reg, LLT Ty) const {
1827   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1828   if (LiveIn)
1829     return LiveIn;
1830 
1831   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1832   MRI.addLiveIn(Reg, NewReg);
1833   return NewReg;
1834 }
1835 
1836 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1837                                          const ArgDescriptor *Arg) const {
1838   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1839     return false; // TODO: Handle these
1840 
1841   assert(Arg->getRegister().isPhysical());
1842 
1843   MachineRegisterInfo &MRI = *B.getMRI();
1844 
1845   LLT Ty = MRI.getType(DstReg);
1846   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1847 
1848   if (Arg->isMasked()) {
1849     // TODO: Should we try to emit this once in the entry block?
1850     const LLT S32 = LLT::scalar(32);
1851     const unsigned Mask = Arg->getMask();
1852     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1853 
1854     Register AndMaskSrc = LiveIn;
1855 
1856     if (Shift != 0) {
1857       auto ShiftAmt = B.buildConstant(S32, Shift);
1858       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1859     }
1860 
1861     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1862   } else
1863     B.buildCopy(DstReg, LiveIn);
1864 
1865   // Insert the argument copy if it doesn't already exist.
1866   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
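  // If nothing has emitted the copy from the physical argument register yet,
  // materialize it at the start of the entry block so the live-in virtual
  // register has a def, then restore the original insertion point.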
1867 if (!MRI.getVRegDef(LiveIn)) { 1868 // FIXME: Should have scoped insert pt 1869 MachineBasicBlock &OrigInsBB = B.getMBB(); 1870 auto OrigInsPt = B.getInsertPt(); 1871 1872 MachineBasicBlock &EntryMBB = B.getMF().front(); 1873 EntryMBB.addLiveIn(Arg->getRegister()); 1874 B.setInsertPt(EntryMBB, EntryMBB.begin()); 1875 B.buildCopy(LiveIn, Arg->getRegister()); 1876 1877 B.setInsertPt(OrigInsBB, OrigInsPt); 1878 } 1879 1880 return true; 1881 } 1882 1883 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 1884 MachineInstr &MI, 1885 MachineRegisterInfo &MRI, 1886 MachineIRBuilder &B, 1887 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 1888 B.setInstr(MI); 1889 1890 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 1891 1892 const ArgDescriptor *Arg; 1893 const TargetRegisterClass *RC; 1894 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 1895 if (!Arg) { 1896 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 1897 return false; 1898 } 1899 1900 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 1901 MI.eraseFromParent(); 1902 return true; 1903 } 1904 1905 return false; 1906 } 1907 1908 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 1909 MachineRegisterInfo &MRI, 1910 MachineIRBuilder &B) const { 1911 B.setInstr(MI); 1912 Register Dst = MI.getOperand(0).getReg(); 1913 LLT DstTy = MRI.getType(Dst); 1914 LLT S16 = LLT::scalar(16); 1915 LLT S32 = LLT::scalar(32); 1916 LLT S64 = LLT::scalar(64); 1917 1918 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 1919 return true; 1920 1921 if (DstTy == S16) 1922 return legalizeFDIV16(MI, MRI, B); 1923 if (DstTy == S32) 1924 return legalizeFDIV32(MI, MRI, B); 1925 if (DstTy == S64) 1926 return legalizeFDIV64(MI, MRI, B); 1927 1928 return false; 1929 } 1930 1931 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 1932 MachineRegisterInfo &MRI, 1933 MachineIRBuilder &B) const { 1934 Register Res = MI.getOperand(0).getReg(); 1935 Register LHS = MI.getOperand(1).getReg(); 1936 Register RHS = MI.getOperand(2).getReg(); 1937 1938 uint16_t Flags = MI.getFlags(); 1939 1940 LLT ResTy = MRI.getType(Res); 1941 LLT S32 = LLT::scalar(32); 1942 LLT S64 = LLT::scalar(64); 1943 1944 const MachineFunction &MF = B.getMF(); 1945 bool Unsafe = 1946 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 1947 1948 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 1949 return false; 1950 1951 if (!Unsafe && ResTy == S32 && 1952 MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals) 1953 return false; 1954 1955 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 1956 // 1 / x -> RCP(x) 1957 if (CLHS->isExactlyValue(1.0)) { 1958 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 1959 .addUse(RHS) 1960 .setMIFlags(Flags); 1961 1962 MI.eraseFromParent(); 1963 return true; 1964 } 1965 1966 // -1 / x -> RCP( FNEG(x) ) 1967 if (CLHS->isExactlyValue(-1.0)) { 1968 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 1969 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 1970 .addUse(FNeg.getReg(0)) 1971 .setMIFlags(Flags); 1972 1973 MI.eraseFromParent(); 1974 return true; 1975 } 1976 } 1977 1978 // x / y -> x * (1.0 / y) 1979 if (Unsafe) { 1980 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 1981 .addUse(RHS) 1982 .setMIFlags(Flags); 1983 B.buildFMul(Res, LHS, RCP, Flags); 1984 1985 MI.eraseFromParent(); 1986 return true; 1987 } 1988 1989 return false; 1990 } 1991 1992 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 1993 MachineRegisterInfo &MRI, 1994 
MachineIRBuilder &B) const { 1995 B.setInstr(MI); 1996 Register Res = MI.getOperand(0).getReg(); 1997 Register LHS = MI.getOperand(1).getReg(); 1998 Register RHS = MI.getOperand(2).getReg(); 1999 2000 uint16_t Flags = MI.getFlags(); 2001 2002 LLT S16 = LLT::scalar(16); 2003 LLT S32 = LLT::scalar(32); 2004 2005 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2006 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2007 2008 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2009 .addUse(RHSExt.getReg(0)) 2010 .setMIFlags(Flags); 2011 2012 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2013 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2014 2015 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2016 .addUse(RDst.getReg(0)) 2017 .addUse(RHS) 2018 .addUse(LHS) 2019 .setMIFlags(Flags); 2020 2021 MI.eraseFromParent(); 2022 return true; 2023 } 2024 2025 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2026 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2027 static void toggleSPDenormMode(bool Enable, 2028 MachineIRBuilder &B, 2029 const GCNSubtarget &ST, 2030 AMDGPU::SIModeRegisterDefaults Mode) { 2031 // Set SP denorm mode to this value. 2032 unsigned SPDenormMode = 2033 Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; 2034 2035 if (ST.hasDenormModeInst()) { 2036 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2037 unsigned DPDenormModeDefault = Mode.FP64FP16Denormals 2038 ? FP_DENORM_FLUSH_NONE 2039 : FP_DENORM_FLUSH_IN_FLUSH_OUT; 2040 2041 unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2042 B.buildInstr(AMDGPU::S_DENORM_MODE) 2043 .addImm(NewDenormModeValue); 2044 2045 } else { 2046 // Select FP32 bit field in mode register. 2047 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2048 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2049 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2050 2051 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2052 .addImm(SPDenormMode) 2053 .addImm(SPDenormModeBitField); 2054 } 2055 } 2056 2057 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2058 MachineRegisterInfo &MRI, 2059 MachineIRBuilder &B) const { 2060 B.setInstr(MI); 2061 Register Res = MI.getOperand(0).getReg(); 2062 Register LHS = MI.getOperand(1).getReg(); 2063 Register RHS = MI.getOperand(2).getReg(); 2064 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2065 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2066 2067 uint16_t Flags = MI.getFlags(); 2068 2069 LLT S32 = LLT::scalar(32); 2070 LLT S1 = LLT::scalar(1); 2071 2072 auto One = B.buildFConstant(S32, 1.0f); 2073 2074 auto DenominatorScaled = 2075 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2076 .addUse(RHS) 2077 .addUse(LHS) 2078 .addImm(1) 2079 .setMIFlags(Flags); 2080 auto NumeratorScaled = 2081 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2082 .addUse(LHS) 2083 .addUse(RHS) 2084 .addImm(0) 2085 .setMIFlags(Flags); 2086 2087 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2088 .addUse(DenominatorScaled.getReg(0)) 2089 .setMIFlags(Flags); 2090 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2091 2092 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2093 // aren't modeled as reading it. 
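  // With n and d pre-scaled by div_scale and r = rcp(d), the FMA chain below
  // performs the usual reciprocal refinement (the f64 path later in this file
  // follows the same scheme in double precision):
  //   e    = fma(-d, r, 1.0)    ; Fma0
  //   r1   = fma(e, r, r)       ; Fma1: refined reciprocal
  //   q    = n * r1             ; Mul
  //   err  = fma(-d, q, n)      ; Fma2
  //   q1   = fma(err, r1, q)    ; Fma3
  //   err1 = fma(-d, q1, n)     ; Fma4
  // div_fmas then computes fma(err1, r1, q1), applying the scale correction
  // indicated by the div_scale condition bit, and div_fixup produces the
  // final quotient with the correct sign and special-case handling from the
  // original operands.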
2094   if (!Mode.FP32Denormals)
2095     toggleSPDenormMode(true, B, ST, Mode);
2096 
2097   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2098   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2099   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2100   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2101   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2102   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2103 
2104   if (!Mode.FP32Denormals)
2105     toggleSPDenormMode(false, B, ST, Mode);
2106 
2107   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2108     .addUse(Fma4.getReg(0))
2109     .addUse(Fma1.getReg(0))
2110     .addUse(Fma3.getReg(0))
2111     .addUse(NumeratorScaled.getReg(1))
2112     .setMIFlags(Flags);
2113 
2114   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2115     .addUse(Fmas.getReg(0))
2116     .addUse(RHS)
2117     .addUse(LHS)
2118     .setMIFlags(Flags);
2119 
2120   MI.eraseFromParent();
2121   return true;
2122 }
2123 
2124 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2125                                          MachineRegisterInfo &MRI,
2126                                          MachineIRBuilder &B) const {
2127   B.setInstr(MI);
2128   Register Res = MI.getOperand(0).getReg();
2129   Register LHS = MI.getOperand(1).getReg();
2130   Register RHS = MI.getOperand(2).getReg();
2131 
2132   uint16_t Flags = MI.getFlags();
2133 
2134   LLT S64 = LLT::scalar(64);
2135   LLT S1 = LLT::scalar(1);
2136 
2137   auto One = B.buildFConstant(S64, 1.0);
2138 
2139   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2140     .addUse(LHS)
2141     .addUse(RHS)
2142     .addImm(1)
2143     .setMIFlags(Flags);
2144 
2145   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2146 
2147   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2148     .addUse(DivScale0.getReg(0))
2149     .setMIFlags(Flags);
2150 
2151   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2152   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2153   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2154 
2155   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2156     .addUse(LHS)
2157     .addUse(RHS)
2158     .addImm(0)
2159     .setMIFlags(Flags);
2160 
2161   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2162   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2163   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2164 
2165   Register Scale;
2166   if (!ST.hasUsableDivScaleConditionOutput()) {
2167     // Workaround a hardware bug on SI where the condition output from div_scale
2168     // is not usable.
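    // Recover it manually: compare the high 32 bits of the numerator with
    // those of the second div_scale result, and the high 32 bits of the
    // denominator with those of the first, then xor the two comparisons to
    // form the condition input for div_fmas.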
2169 2170 Scale = MRI.createGenericVirtualRegister(S1); 2171 2172 LLT S32 = LLT::scalar(32); 2173 2174 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2175 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2176 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2177 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2178 2179 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2180 Scale1Unmerge.getReg(1)); 2181 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2182 Scale0Unmerge.getReg(1)); 2183 B.buildXor(Scale, CmpNum, CmpDen); 2184 } else { 2185 Scale = DivScale1.getReg(1); 2186 } 2187 2188 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2189 .addUse(Fma4.getReg(0)) 2190 .addUse(Fma3.getReg(0)) 2191 .addUse(Mul.getReg(0)) 2192 .addUse(Scale) 2193 .setMIFlags(Flags); 2194 2195 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2196 .addUse(Fmas.getReg(0)) 2197 .addUse(RHS) 2198 .addUse(LHS) 2199 .setMIFlags(Flags); 2200 2201 MI.eraseFromParent(); 2202 return true; 2203 } 2204 2205 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2206 MachineRegisterInfo &MRI, 2207 MachineIRBuilder &B) const { 2208 B.setInstr(MI); 2209 Register Res = MI.getOperand(0).getReg(); 2210 Register LHS = MI.getOperand(2).getReg(); 2211 Register RHS = MI.getOperand(3).getReg(); 2212 uint16_t Flags = MI.getFlags(); 2213 2214 LLT S32 = LLT::scalar(32); 2215 LLT S1 = LLT::scalar(1); 2216 2217 auto Abs = B.buildFAbs(S32, RHS, Flags); 2218 const APFloat C0Val(1.0f); 2219 2220 auto C0 = B.buildConstant(S32, 0x6f800000); 2221 auto C1 = B.buildConstant(S32, 0x2f800000); 2222 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2223 2224 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2225 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2226 2227 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2228 2229 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2230 .addUse(Mul0.getReg(0)) 2231 .setMIFlags(Flags); 2232 2233 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2234 2235 B.buildFMul(Res, Sel, Mul1, Flags); 2236 2237 MI.eraseFromParent(); 2238 return true; 2239 } 2240 2241 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2242 MachineRegisterInfo &MRI, 2243 MachineIRBuilder &B) const { 2244 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2245 if (!MFI->isEntryFunction()) { 2246 return legalizePreloadedArgIntrin(MI, MRI, B, 2247 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2248 } 2249 2250 B.setInstr(MI); 2251 2252 uint64_t Offset = 2253 ST.getTargetLowering()->getImplicitParameterOffset( 2254 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2255 Register DstReg = MI.getOperand(0).getReg(); 2256 LLT DstTy = MRI.getType(DstReg); 2257 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2258 2259 const ArgDescriptor *Arg; 2260 const TargetRegisterClass *RC; 2261 std::tie(Arg, RC) 2262 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2263 if (!Arg) 2264 return false; 2265 2266 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2267 if (!loadInputValue(KernargPtrReg, B, Arg)) 2268 return false; 2269 2270 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2271 MI.eraseFromParent(); 2272 return true; 2273 } 2274 2275 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2276 MachineRegisterInfo &MRI, 2277 MachineIRBuilder &B, 2278 unsigned AddrSpace) const { 2279 B.setInstr(MI); 2280 
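  // A flat pointer points into the queried segment iff the high 32 bits of
  // the address match that segment's aperture base, so extract them and
  // compare against the aperture register.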
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2281   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2282   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2283   MI.eraseFromParent();
2284   return true;
2285 }
2286 
2287 /// Handle register layout difference for f16 images for some subtargets.
2288 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2289                                              MachineRegisterInfo &MRI,
2290                                              Register Reg) const {
2291   if (!ST.hasUnpackedD16VMem())
2292     return Reg;
2293 
2294   const LLT S16 = LLT::scalar(16);
2295   const LLT S32 = LLT::scalar(32);
2296   LLT StoreVT = MRI.getType(Reg);
2297   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2298 
2299   auto Unmerge = B.buildUnmerge(S16, Reg);
2300 
2301   SmallVector<Register, 4> WideRegs;
2302   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2303     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2304 
2305   int NumElts = StoreVT.getNumElements();
2306 
2307   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2308 }
2309 
2310 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2311                                                  MachineRegisterInfo &MRI,
2312                                                  MachineIRBuilder &B,
2313                                                  bool IsFormat) const {
2314   // TODO: Reject f16 format on targets where it is unsupported.
2315   Register VData = MI.getOperand(1).getReg();
2316   LLT Ty = MRI.getType(VData);
2317 
2318   B.setInstr(MI);
2319 
2320   const LLT S32 = LLT::scalar(32);
2321   const LLT S16 = LLT::scalar(16);
2322 
2323   // Fix up illegal register types for i8 stores.
2324   if (Ty == LLT::scalar(8) || Ty == S16) {
2325     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2326     MI.getOperand(1).setReg(AnyExt);
2327     return true;
2328   }
2329 
2330   if (Ty.isVector()) {
2331     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2332       if (IsFormat)
2333         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2334       return true;
2335     }
2336 
2337     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2338   }
2339 
2340   return Ty == S32;
2341 }
2342 
2343 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
2344                                                MachineIRBuilder &B,
2345                                                bool IsInc) const {
2346   B.setInstr(MI);
2347   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
2348                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
2349   B.buildInstr(Opc)
2350     .addDef(MI.getOperand(0).getReg())
2351     .addUse(MI.getOperand(2).getReg())
2352     .addUse(MI.getOperand(3).getReg())
2353     .cloneMemRefs(MI);
2354   MI.eraseFromParent();
2355   return true;
2356 }
2357 
2358 // FIXME: Needs an observer, like the custom legalization path.
2359 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2360                                             MachineRegisterInfo &MRI,
2361                                             MachineIRBuilder &B) const {
2362   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
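  // amdgcn_if, amdgcn_else and amdgcn_loop are rewritten to the SI_IF,
  // SI_ELSE and SI_LOOP pseudos on the wave mask register class, using the
  // G_BRCOND/G_BR pattern validated by verifyCFIntrinsic; most of the other
  // intrinsics simply forward to the preloaded-argument helpers.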
2363 auto IntrID = MI.getIntrinsicID(); 2364 switch (IntrID) { 2365 case Intrinsic::amdgcn_if: 2366 case Intrinsic::amdgcn_else: { 2367 MachineInstr *Br = nullptr; 2368 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 2369 const SIRegisterInfo *TRI 2370 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2371 2372 B.setInstr(*BrCond); 2373 Register Def = MI.getOperand(1).getReg(); 2374 Register Use = MI.getOperand(3).getReg(); 2375 2376 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 2377 if (Br) 2378 BrTarget = Br->getOperand(0).getMBB(); 2379 2380 if (IntrID == Intrinsic::amdgcn_if) { 2381 B.buildInstr(AMDGPU::SI_IF) 2382 .addDef(Def) 2383 .addUse(Use) 2384 .addMBB(BrTarget); 2385 } else { 2386 B.buildInstr(AMDGPU::SI_ELSE) 2387 .addDef(Def) 2388 .addUse(Use) 2389 .addMBB(BrTarget) 2390 .addImm(0); 2391 } 2392 2393 if (Br) 2394 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 2395 2396 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 2397 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 2398 MI.eraseFromParent(); 2399 BrCond->eraseFromParent(); 2400 return true; 2401 } 2402 2403 return false; 2404 } 2405 case Intrinsic::amdgcn_loop: { 2406 MachineInstr *Br = nullptr; 2407 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 2408 const SIRegisterInfo *TRI 2409 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2410 2411 B.setInstr(*BrCond); 2412 2413 // FIXME: Need to adjust branch targets based on unconditional branch. 2414 Register Reg = MI.getOperand(2).getReg(); 2415 B.buildInstr(AMDGPU::SI_LOOP) 2416 .addUse(Reg) 2417 .addMBB(BrCond->getOperand(1).getMBB()); 2418 MI.eraseFromParent(); 2419 BrCond->eraseFromParent(); 2420 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 2421 return true; 2422 } 2423 2424 return false; 2425 } 2426 case Intrinsic::amdgcn_kernarg_segment_ptr: 2427 return legalizePreloadedArgIntrin( 2428 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2429 case Intrinsic::amdgcn_implicitarg_ptr: 2430 return legalizeImplicitArgPtr(MI, MRI, B); 2431 case Intrinsic::amdgcn_workitem_id_x: 2432 return legalizePreloadedArgIntrin(MI, MRI, B, 2433 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 2434 case Intrinsic::amdgcn_workitem_id_y: 2435 return legalizePreloadedArgIntrin(MI, MRI, B, 2436 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 2437 case Intrinsic::amdgcn_workitem_id_z: 2438 return legalizePreloadedArgIntrin(MI, MRI, B, 2439 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 2440 case Intrinsic::amdgcn_workgroup_id_x: 2441 return legalizePreloadedArgIntrin(MI, MRI, B, 2442 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 2443 case Intrinsic::amdgcn_workgroup_id_y: 2444 return legalizePreloadedArgIntrin(MI, MRI, B, 2445 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 2446 case Intrinsic::amdgcn_workgroup_id_z: 2447 return legalizePreloadedArgIntrin(MI, MRI, B, 2448 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 2449 case Intrinsic::amdgcn_dispatch_ptr: 2450 return legalizePreloadedArgIntrin(MI, MRI, B, 2451 AMDGPUFunctionArgInfo::DISPATCH_PTR); 2452 case Intrinsic::amdgcn_queue_ptr: 2453 return legalizePreloadedArgIntrin(MI, MRI, B, 2454 AMDGPUFunctionArgInfo::QUEUE_PTR); 2455 case Intrinsic::amdgcn_implicit_buffer_ptr: 2456 return legalizePreloadedArgIntrin( 2457 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 2458 case Intrinsic::amdgcn_dispatch_id: 2459 return legalizePreloadedArgIntrin(MI, MRI, B, 2460 AMDGPUFunctionArgInfo::DISPATCH_ID); 2461 case Intrinsic::amdgcn_fdiv_fast: 2462 return legalizeFDIVFastIntrin(MI, MRI, B); 
2463 case Intrinsic::amdgcn_is_shared: 2464 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 2465 case Intrinsic::amdgcn_is_private: 2466 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 2467 case Intrinsic::amdgcn_wavefrontsize: { 2468 B.setInstr(MI); 2469 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 2470 MI.eraseFromParent(); 2471 return true; 2472 } 2473 case Intrinsic::amdgcn_raw_buffer_store: 2474 return legalizeRawBufferStore(MI, MRI, B, false); 2475 case Intrinsic::amdgcn_raw_buffer_store_format: 2476 return legalizeRawBufferStore(MI, MRI, B, true); 2477 case Intrinsic::amdgcn_atomic_inc: 2478 return legalizeAtomicIncDec(MI, B, true); 2479 case Intrinsic::amdgcn_atomic_dec: 2480 return legalizeAtomicIncDec(MI, B, false); 2481 default: 2482 return true; 2483 } 2484 2485 return true; 2486 } 2487