//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

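// Break a wide vector into pieces that are at most 64 bits each by (roughly)
// dividing the element count by the number of 64-bit pieces the total size
// occupies.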
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
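// For example, v2s32, v4s16 and s96 qualify, while v3s16 (an odd number of
// 16-bit elements) does not.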
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), })
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
    G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
    .scalarize(0)
    .alwaysLegal();

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

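  // As with G_INTTOPTR above, a mismatched integer operand is widened or
  // narrowed to the width of the pointer type.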
  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return IsLoad ? 512 : 128;
    default:
      return 128;
    }
  };

  const auto needToSplitMemOp = [=](const LegalityQuery &Query, bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
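    // Each entry below is {register type, pointer type, memory size in bits,
    // minimum alignment in bits}; an alignment of 0 effectively places no
    // constraint.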
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
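                // Fall back to the element type unless the element count
                // splits evenly into more than one piece.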
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
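  // As with G_ICMP above, the s32 form is for conditions produced in SCC and
  // the s1 form is for conditions in VCC.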
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr),
                               LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    // TODO: Support 16-bit shift amounts
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)

      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
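    // The field read by S_GETREG below is WidthM1 + 1 bits wide; shifting it
    // left by that amount places the aperture base in the upper bits of the
    // returned 32-bit value.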
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
    B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
    B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
    B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
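  // Adding and subtracting 2^52 (with the sign of the source) rounds the
  // source to the nearest integer while its magnitude is below 2^52; inputs
  // with |x| > 0x1.fffffffffffffp+51 are already integral and are selected
  // unchanged below.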
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));

  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);

  auto Hi = Signed ?
    B.buildFPTOSI(S32, FloorMul) :
    B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  B.buildMerge(Dst, { Lo.getReg(0), Hi.getReg(0) });
  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
  assert(Mask.size() == 2);

  // If one half is undef, the other is trivially in the same reg.
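  // Otherwise the two selected lanes must both come from the same source
  // vector (indices {0,1} or {2,3}).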
1695 if (Mask[0] == -1 || Mask[1] == -1) 1696 return true; 1697 return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) || 1698 ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3)); 1699 } 1700 1701 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1702 MachineInstr &MI, MachineRegisterInfo &MRI, 1703 MachineIRBuilder &B) const { 1704 const LLT V2S16 = LLT::vector(2, 16); 1705 1706 Register Dst = MI.getOperand(0).getReg(); 1707 Register Src0 = MI.getOperand(1).getReg(); 1708 LLT DstTy = MRI.getType(Dst); 1709 LLT SrcTy = MRI.getType(Src0); 1710 1711 if (SrcTy == V2S16 && DstTy == V2S16 && 1712 isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1713 return true; 1714 1715 MachineIRBuilder HelperBuilder(MI); 1716 GISelObserverWrapper DummyObserver; 1717 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1718 HelperBuilder.setInstr(MI); 1719 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1720 } 1721 1722 bool AMDGPULegalizerInfo::legalizeSinCos( 1723 MachineInstr &MI, MachineRegisterInfo &MRI, 1724 MachineIRBuilder &B) const { 1725 B.setInstr(MI); 1726 1727 Register DstReg = MI.getOperand(0).getReg(); 1728 Register SrcReg = MI.getOperand(1).getReg(); 1729 LLT Ty = MRI.getType(DstReg); 1730 unsigned Flags = MI.getFlags(); 1731 1732 Register TrigVal; 1733 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1734 if (ST.hasTrigReducedRange()) { 1735 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1736 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1737 .addUse(MulVal.getReg(0)) 1738 .setMIFlags(Flags).getReg(0); 1739 } else 1740 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1741 1742 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1743 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1744 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1745 .addUse(TrigVal) 1746 .setMIFlags(Flags); 1747 MI.eraseFromParent(); 1748 return true; 1749 } 1750 1751 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1752 Register DstReg, LLT PtrTy, 1753 MachineIRBuilder &B, const GlobalValue *GV, 1754 unsigned Offset, unsigned GAFlags) const { 1755 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1756 // to the following code sequence: 1757 // 1758 // For constant address space: 1759 // s_getpc_b64 s[0:1] 1760 // s_add_u32 s0, s0, $symbol 1761 // s_addc_u32 s1, s1, 0 1762 // 1763 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1764 // a fixup or relocation is emitted to replace $symbol with a literal 1765 // constant, which is a pc-relative offset from the encoding of the $symbol 1766 // operand to the global variable. 1767 // 1768 // For global address space: 1769 // s_getpc_b64 s[0:1] 1770 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1771 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1772 // 1773 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1774 // fixups or relocations are emitted to replace $symbol@*@lo and 1775 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1776 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1777 // operand to the global variable. 
1778 // 1779 // What we want here is an offset from the value returned by s_getpc 1780 // (which is the address of the s_add_u32 instruction) to the global 1781 // variable, but since the encoding of $symbol starts 4 bytes after the start 1782 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1783 // small. This requires us to add 4 to the global variable offset in order to 1784 // compute the correct address. 1785 1786 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1787 1788 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1789 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1790 1791 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1792 .addDef(PCReg); 1793 1794 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1795 if (GAFlags == SIInstrInfo::MO_NONE) 1796 MIB.addImm(0); 1797 else 1798 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1799 1800 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1801 1802 if (PtrTy.getSizeInBits() == 32) 1803 B.buildExtract(DstReg, PCReg, 0); 1804 return true; 1805 } 1806 1807 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1808 MachineInstr &MI, MachineRegisterInfo &MRI, 1809 MachineIRBuilder &B) const { 1810 Register DstReg = MI.getOperand(0).getReg(); 1811 LLT Ty = MRI.getType(DstReg); 1812 unsigned AS = Ty.getAddressSpace(); 1813 1814 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1815 MachineFunction &MF = B.getMF(); 1816 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1817 B.setInstr(MI); 1818 1819 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1820 if (!MFI->isEntryFunction()) { 1821 const Function &Fn = MF.getFunction(); 1822 DiagnosticInfoUnsupported BadLDSDecl( 1823 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1824 Fn.getContext().diagnose(BadLDSDecl); 1825 } 1826 1827 // TODO: We could emit code to handle the initialization somewhere. 1828 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1829 const SITargetLowering *TLI = ST.getTargetLowering(); 1830 if (!TLI->shouldUseLDSConstAddress(GV)) { 1831 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 1832 return true; // Leave in place. 1833 } 1834 1835 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1836 MI.eraseFromParent(); 1837 return true; 1838 } 1839 1840 const Function &Fn = MF.getFunction(); 1841 DiagnosticInfoUnsupported BadInit( 1842 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 1843 Fn.getContext().diagnose(BadInit); 1844 return true; 1845 } 1846 1847 const SITargetLowering *TLI = ST.getTargetLowering(); 1848 1849 if (TLI->shouldEmitFixup(GV)) { 1850 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 1851 MI.eraseFromParent(); 1852 return true; 1853 } 1854 1855 if (TLI->shouldEmitPCReloc(GV)) { 1856 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 1857 MI.eraseFromParent(); 1858 return true; 1859 } 1860 1861 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1862 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 1863 1864 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 1865 MachinePointerInfo::getGOT(MF), 1866 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1867 MachineMemOperand::MOInvariant, 1868 8 /*Size*/, 8 /*Align*/); 1869 1870 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 1871 1872 if (Ty.getSizeInBits() == 32) { 1873 // Truncate if this is a 32-bit constant address.
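// The GOT entry itself is always a 64-bit constant-address pointer, so a 32-bit result is formed by extracting the low half of the loaded value rather than by loading a narrower pointer.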
1874 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1875 B.buildExtract(DstReg, Load, 0); 1876 } else 1877 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1878 1879 MI.eraseFromParent(); 1880 return true; 1881 } 1882 1883 bool AMDGPULegalizerInfo::legalizeLoad( 1884 MachineInstr &MI, MachineRegisterInfo &MRI, 1885 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 1886 B.setInstr(MI); 1887 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1888 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 1889 Observer.changingInstr(MI); 1890 MI.getOperand(1).setReg(Cast.getReg(0)); 1891 Observer.changedInstr(MI); 1892 return true; 1893 } 1894 1895 bool AMDGPULegalizerInfo::legalizeFMad( 1896 MachineInstr &MI, MachineRegisterInfo &MRI, 1897 MachineIRBuilder &B) const { 1898 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 1899 assert(Ty.isScalar()); 1900 1901 MachineFunction &MF = B.getMF(); 1902 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1903 1904 // TODO: Always legal with future ftz flag. 1905 // FIXME: Do we need just output? 1906 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 1907 return true; 1908 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 1909 return true; 1910 1911 MachineIRBuilder HelperBuilder(MI); 1912 GISelObserverWrapper DummyObserver; 1913 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1914 HelperBuilder.setMBB(*MI.getParent()); 1915 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 1916 } 1917 1918 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 1919 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 1920 Register DstReg = MI.getOperand(0).getReg(); 1921 Register PtrReg = MI.getOperand(1).getReg(); 1922 Register CmpVal = MI.getOperand(2).getReg(); 1923 Register NewVal = MI.getOperand(3).getReg(); 1924 1925 assert(SITargetLowering::isFlatGlobalAddrSpace( 1926 MRI.getType(PtrReg).getAddressSpace()) && 1927 "this should not have been custom lowered"); 1928 1929 LLT ValTy = MRI.getType(CmpVal); 1930 LLT VecTy = LLT::vector(2, ValTy); 1931 1932 B.setInstr(MI); 1933 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 1934 1935 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 1936 .addDef(DstReg) 1937 .addUse(PtrReg) 1938 .addUse(PackedVal) 1939 .setMemRefs(MI.memoperands()); 1940 1941 MI.eraseFromParent(); 1942 return true; 1943 } 1944 1945 bool AMDGPULegalizerInfo::legalizeFlog( 1946 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 1947 Register Dst = MI.getOperand(0).getReg(); 1948 Register Src = MI.getOperand(1).getReg(); 1949 LLT Ty = B.getMRI()->getType(Dst); 1950 unsigned Flags = MI.getFlags(); 1951 B.setInstr(MI); 1952 1953 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 1954 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 1955 1956 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 1957 MI.eraseFromParent(); 1958 return true; 1959 } 1960 1961 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 1962 MachineIRBuilder &B) const { 1963 Register Dst = MI.getOperand(0).getReg(); 1964 Register Src = MI.getOperand(1).getReg(); 1965 unsigned Flags = MI.getFlags(); 1966 LLT Ty = B.getMRI()->getType(Dst); 1967 B.setInstr(MI); 1968 1969 auto K = B.buildFConstant(Ty, numbers::log2e); 1970 auto Mul = B.buildFMul(Ty, Src, K, Flags); 1971 B.buildFExp2(Dst, Mul, Flags); 1972 MI.eraseFromParent(); 1973 return true; 1974 } 1975 1976 // Turn an illegal packed v2s16 build 
vector into bit operations. 1977 // TODO: This should probably be a bitcast action in LegalizerHelper. 1978 bool AMDGPULegalizerInfo::legalizeBuildVector( 1979 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 1980 Register Dst = MI.getOperand(0).getReg(); 1981 LLT DstTy = MRI.getType(Dst); 1982 const LLT S32 = LLT::scalar(32); 1983 const LLT V2S16 = LLT::vector(2, 16); 1984 (void)DstTy; 1985 (void)V2S16; 1986 assert(DstTy == V2S16); 1987 1988 Register Src0 = MI.getOperand(1).getReg(); 1989 Register Src1 = MI.getOperand(2).getReg(); 1990 assert(MRI.getType(Src0) == LLT::scalar(16)); 1991 1992 B.setInstr(MI); 1993 auto Merge = B.buildMerge(S32, {Src0, Src1}); 1994 B.buildBitcast(Dst, Merge); 1995 1996 MI.eraseFromParent(); 1997 return true; 1998 } 1999 2000 // Return the use branch instruction, otherwise null if the usage is invalid. 2001 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2002 MachineRegisterInfo &MRI, 2003 MachineInstr *&Br) { 2004 Register CondDef = MI.getOperand(0).getReg(); 2005 if (!MRI.hasOneNonDBGUse(CondDef)) 2006 return nullptr; 2007 2008 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2009 if (UseMI.getParent() != MI.getParent() || 2010 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2011 return nullptr; 2012 2013 // Make sure the cond br is followed by a G_BR. 2014 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2015 if (Next != MI.getParent()->end()) { 2016 if (Next->getOpcode() != AMDGPU::G_BR) 2017 return nullptr; 2018 Br = &*Next; 2019 } 2020 2021 return &UseMI; 2022 } 2023 2024 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, 2025 Register Reg, LLT Ty) const { 2026 Register LiveIn = MRI.getLiveInVirtReg(Reg); 2027 if (LiveIn) 2028 return LiveIn; 2029 2030 Register NewReg = MRI.createGenericVirtualRegister(Ty); 2031 MRI.addLiveIn(Reg, NewReg); 2032 return NewReg; 2033 } 2034 2035 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2036 const ArgDescriptor *Arg) const { 2037 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2038 return false; // TODO: Handle these 2039 2040 assert(Arg->getRegister().isPhysical()); 2041 2042 MachineRegisterInfo &MRI = *B.getMRI(); 2043 2044 LLT Ty = MRI.getType(DstReg); 2045 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); 2046 2047 if (Arg->isMasked()) { 2048 // TODO: Should we try to emit this once in the entry block? 2049 const LLT S32 = LLT::scalar(32); 2050 const unsigned Mask = Arg->getMask(); 2051 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2052 2053 Register AndMaskSrc = LiveIn; 2054 2055 if (Shift != 0) { 2056 auto ShiftAmt = B.buildConstant(S32, Shift); 2057 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2058 } 2059 2060 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2061 } else 2062 B.buildCopy(DstReg, LiveIn); 2063 2064 // Insert the argument copy if it doesn't already exist. 2065 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
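// A missing def means the entry-block copy from the physical argument register has not been materialized yet; emit it at the start of the entry block so every use of the live-in vreg is dominated by the copy.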
2066 if (!MRI.getVRegDef(LiveIn)) { 2067 // FIXME: Should have scoped insert pt 2068 MachineBasicBlock &OrigInsBB = B.getMBB(); 2069 auto OrigInsPt = B.getInsertPt(); 2070 2071 MachineBasicBlock &EntryMBB = B.getMF().front(); 2072 EntryMBB.addLiveIn(Arg->getRegister()); 2073 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2074 B.buildCopy(LiveIn, Arg->getRegister()); 2075 2076 B.setInsertPt(OrigInsBB, OrigInsPt); 2077 } 2078 2079 return true; 2080 } 2081 2082 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2083 MachineInstr &MI, 2084 MachineRegisterInfo &MRI, 2085 MachineIRBuilder &B, 2086 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2087 B.setInstr(MI); 2088 2089 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2090 2091 const ArgDescriptor *Arg; 2092 const TargetRegisterClass *RC; 2093 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2094 if (!Arg) { 2095 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2096 return false; 2097 } 2098 2099 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 2100 MI.eraseFromParent(); 2101 return true; 2102 } 2103 2104 return false; 2105 } 2106 2107 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2108 MachineRegisterInfo &MRI, 2109 MachineIRBuilder &B) const { 2110 B.setInstr(MI); 2111 Register Dst = MI.getOperand(0).getReg(); 2112 LLT DstTy = MRI.getType(Dst); 2113 LLT S16 = LLT::scalar(16); 2114 LLT S32 = LLT::scalar(32); 2115 LLT S64 = LLT::scalar(64); 2116 2117 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2118 return true; 2119 2120 if (DstTy == S16) 2121 return legalizeFDIV16(MI, MRI, B); 2122 if (DstTy == S32) 2123 return legalizeFDIV32(MI, MRI, B); 2124 if (DstTy == S64) 2125 return legalizeFDIV64(MI, MRI, B); 2126 2127 return false; 2128 } 2129 2130 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2131 MachineRegisterInfo &MRI, 2132 MachineIRBuilder &B) const { 2133 Register Res = MI.getOperand(0).getReg(); 2134 Register LHS = MI.getOperand(1).getReg(); 2135 Register RHS = MI.getOperand(2).getReg(); 2136 2137 uint16_t Flags = MI.getFlags(); 2138 2139 LLT ResTy = MRI.getType(Res); 2140 LLT S32 = LLT::scalar(32); 2141 LLT S64 = LLT::scalar(64); 2142 2143 const MachineFunction &MF = B.getMF(); 2144 bool Unsafe = 2145 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2146 2147 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2148 return false; 2149 2150 if (!Unsafe && ResTy == S32 && 2151 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2152 return false; 2153 2154 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2155 // 1 / x -> RCP(x) 2156 if (CLHS->isExactlyValue(1.0)) { 2157 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2158 .addUse(RHS) 2159 .setMIFlags(Flags); 2160 2161 MI.eraseFromParent(); 2162 return true; 2163 } 2164 2165 // -1 / x -> RCP( FNEG(x) ) 2166 if (CLHS->isExactlyValue(-1.0)) { 2167 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2168 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2169 .addUse(FNeg.getReg(0)) 2170 .setMIFlags(Flags); 2171 2172 MI.eraseFromParent(); 2173 return true; 2174 } 2175 } 2176 2177 // x / y -> x * (1.0 / y) 2178 if (Unsafe) { 2179 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2180 .addUse(RHS) 2181 .setMIFlags(Flags); 2182 B.buildFMul(Res, LHS, RCP, Flags); 2183 2184 MI.eraseFromParent(); 2185 return true; 2186 } 2187 2188 return false; 2189 } 2190 2191 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2192 MachineRegisterInfo &MRI, 2193 
MachineIRBuilder &B) const { 2194 B.setInstr(MI); 2195 Register Res = MI.getOperand(0).getReg(); 2196 Register LHS = MI.getOperand(1).getReg(); 2197 Register RHS = MI.getOperand(2).getReg(); 2198 2199 uint16_t Flags = MI.getFlags(); 2200 2201 LLT S16 = LLT::scalar(16); 2202 LLT S32 = LLT::scalar(32); 2203 2204 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2205 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2206 2207 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2208 .addUse(RHSExt.getReg(0)) 2209 .setMIFlags(Flags); 2210 2211 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2212 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2213 2214 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2215 .addUse(RDst.getReg(0)) 2216 .addUse(RHS) 2217 .addUse(LHS) 2218 .setMIFlags(Flags); 2219 2220 MI.eraseFromParent(); 2221 return true; 2222 } 2223 2224 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2225 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2226 static void toggleSPDenormMode(bool Enable, 2227 MachineIRBuilder &B, 2228 const GCNSubtarget &ST, 2229 AMDGPU::SIModeRegisterDefaults Mode) { 2230 // Set SP denorm mode to this value. 2231 unsigned SPDenormMode = 2232 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2233 2234 if (ST.hasDenormModeInst()) { 2235 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2236 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2237 2238 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2239 B.buildInstr(AMDGPU::S_DENORM_MODE) 2240 .addImm(NewDenormModeValue); 2241 2242 } else { 2243 // Select FP32 bit field in mode register. 2244 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2245 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2246 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2247 2248 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2249 .addImm(SPDenormMode) 2250 .addImm(SPDenormModeBitField); 2251 } 2252 } 2253 2254 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2255 MachineRegisterInfo &MRI, 2256 MachineIRBuilder &B) const { 2257 B.setInstr(MI); 2258 Register Res = MI.getOperand(0).getReg(); 2259 Register LHS = MI.getOperand(1).getReg(); 2260 Register RHS = MI.getOperand(2).getReg(); 2261 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2262 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2263 2264 uint16_t Flags = MI.getFlags(); 2265 2266 LLT S32 = LLT::scalar(32); 2267 LLT S1 = LLT::scalar(1); 2268 2269 auto One = B.buildFConstant(S32, 1.0f); 2270 2271 auto DenominatorScaled = 2272 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2273 .addUse(RHS) 2274 .addUse(LHS) 2275 .addImm(1) 2276 .setMIFlags(Flags); 2277 auto NumeratorScaled = 2278 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2279 .addUse(LHS) 2280 .addUse(RHS) 2281 .addImm(0) 2282 .setMIFlags(Flags); 2283 2284 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2285 .addUse(DenominatorScaled.getReg(0)) 2286 .setMIFlags(Flags); 2287 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2288 2289 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2290 // aren't modeled as reading it. 
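// The sequence below follows the usual div_scale/div_fmas expansion: an initial rcp of the scaled denominator is refined with fused Newton-Raphson style steps (Fma0..Fma4), div_fmas applies the final correction using the scale predicate, and div_fixup handles the remaining special cases. FP32 denormals are temporarily enabled around the refinement when the current mode would otherwise flush them.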
2291 if (!Mode.allFP32Denormals()) 2292 toggleSPDenormMode(true, B, ST, Mode); 2293 2294 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2295 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2296 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2297 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2298 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2299 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2300 2301 if (!Mode.allFP32Denormals()) 2302 toggleSPDenormMode(false, B, ST, Mode); 2303 2304 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2305 .addUse(Fma4.getReg(0)) 2306 .addUse(Fma1.getReg(0)) 2307 .addUse(Fma3.getReg(0)) 2308 .addUse(NumeratorScaled.getReg(1)) 2309 .setMIFlags(Flags); 2310 2311 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2312 .addUse(Fmas.getReg(0)) 2313 .addUse(RHS) 2314 .addUse(LHS) 2315 .setMIFlags(Flags); 2316 2317 MI.eraseFromParent(); 2318 return true; 2319 } 2320 2321 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2322 MachineRegisterInfo &MRI, 2323 MachineIRBuilder &B) const { 2324 B.setInstr(MI); 2325 Register Res = MI.getOperand(0).getReg(); 2326 Register LHS = MI.getOperand(1).getReg(); 2327 Register RHS = MI.getOperand(2).getReg(); 2328 2329 uint16_t Flags = MI.getFlags(); 2330 2331 LLT S64 = LLT::scalar(64); 2332 LLT S1 = LLT::scalar(1); 2333 2334 auto One = B.buildFConstant(S64, 1.0); 2335 2336 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2337 .addUse(LHS) 2338 .addUse(RHS) 2339 .addImm(1) 2340 .setMIFlags(Flags); 2341 2342 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2343 2344 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2345 .addUse(DivScale0.getReg(0)) 2346 .setMIFlags(Flags); 2347 2348 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2349 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 2350 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 2351 2352 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2353 .addUse(LHS) 2354 .addUse(RHS) 2355 .addImm(0) 2356 .setMIFlags(Flags); 2357 2358 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 2359 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 2360 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 2361 2362 Register Scale; 2363 if (!ST.hasUsableDivScaleConditionOutput()) { 2364 // Workaround a hardware bug on SI where the condition output from div_scale 2365 // is not usable.
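// The flag is reconstructed by hand: compare the high dwords of the original operands against the two div_scale results and xor the equality checks, which appears to recover whether a rescale was actually applied.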
2366 2367 LLT S32 = LLT::scalar(32); 2368 2369 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2370 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2371 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2372 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2373 2374 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2375 Scale1Unmerge.getReg(1)); 2376 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2377 Scale0Unmerge.getReg(1)); 2378 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 2379 } else { 2380 Scale = DivScale1.getReg(1); 2381 } 2382 2383 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2384 .addUse(Fma4.getReg(0)) 2385 .addUse(Fma3.getReg(0)) 2386 .addUse(Mul.getReg(0)) 2387 .addUse(Scale) 2388 .setMIFlags(Flags); 2389 2390 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2391 .addUse(Fmas.getReg(0)) 2392 .addUse(RHS) 2393 .addUse(LHS) 2394 .setMIFlags(Flags); 2395 2396 MI.eraseFromParent(); 2397 return true; 2398 } 2399 2400 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2401 MachineRegisterInfo &MRI, 2402 MachineIRBuilder &B) const { 2403 B.setInstr(MI); 2404 Register Res = MI.getOperand(0).getReg(); 2405 Register LHS = MI.getOperand(2).getReg(); 2406 Register RHS = MI.getOperand(3).getReg(); 2407 uint16_t Flags = MI.getFlags(); 2408 2409 LLT S32 = LLT::scalar(32); 2410 LLT S1 = LLT::scalar(1); 2411 2412 auto Abs = B.buildFAbs(S32, RHS, Flags); 2413 const APFloat C0Val(1.0f); 2414 2415 auto C0 = B.buildConstant(S32, 0x6f800000); 2416 auto C1 = B.buildConstant(S32, 0x2f800000); 2417 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2418 2419 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2420 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2421 2422 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2423 2424 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2425 .addUse(Mul0.getReg(0)) 2426 .setMIFlags(Flags); 2427 2428 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2429 2430 B.buildFMul(Res, Sel, Mul1, Flags); 2431 2432 MI.eraseFromParent(); 2433 return true; 2434 } 2435 2436 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2437 MachineRegisterInfo &MRI, 2438 MachineIRBuilder &B) const { 2439 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2440 if (!MFI->isEntryFunction()) { 2441 return legalizePreloadedArgIntrin(MI, MRI, B, 2442 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2443 } 2444 2445 B.setInstr(MI); 2446 2447 uint64_t Offset = 2448 ST.getTargetLowering()->getImplicitParameterOffset( 2449 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2450 Register DstReg = MI.getOperand(0).getReg(); 2451 LLT DstTy = MRI.getType(DstReg); 2452 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2453 2454 const ArgDescriptor *Arg; 2455 const TargetRegisterClass *RC; 2456 std::tie(Arg, RC) 2457 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2458 if (!Arg) 2459 return false; 2460 2461 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2462 if (!loadInputValue(KernargPtrReg, B, Arg)) 2463 return false; 2464 2465 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2466 MI.eraseFromParent(); 2467 return true; 2468 } 2469 2470 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2471 MachineRegisterInfo &MRI, 2472 MachineIRBuilder &B, 2473 unsigned AddrSpace) const { 2474 B.setInstr(MI); 2475 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 2476 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2477 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2478 MI.eraseFromParent(); 2479 return true; 2480 } 2481 2482 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2483 // offset (the offset that is included in bounds checking and swizzling, to be 2484 // split between the instruction's voffset and immoffset fields) and soffset 2485 // (the offset that is excluded from bounds checking and swizzling, to go in 2486 // the instruction's soffset field). This function takes the first kind of 2487 // offset and figures out how to split it between voffset and immoffset. 2488 std::tuple<Register, unsigned, unsigned> 2489 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2490 Register OrigOffset) const { 2491 const unsigned MaxImm = 4095; 2492 Register BaseReg; 2493 unsigned TotalConstOffset; 2494 MachineInstr *OffsetDef; 2495 const LLT S32 = LLT::scalar(32); 2496 2497 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2498 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2499 2500 unsigned ImmOffset = TotalConstOffset; 2501 2502 // If the immediate value is too big for the immoffset field, put the value 2503 // and -4096 into the immoffset field so that the value that is copied/added 2504 // for the voffset field is a multiple of 4096, and it stands more chance 2505 // of being CSEd with the copy/add for another similar load/store. 2506 // However, do not do that rounding down to a multiple of 4096 if that is a 2507 // negative number, as it appears to be illegal to have a negative offset 2508 // in the vgpr, even if adding the immediate offset makes it positive. 2509 unsigned Overflow = ImmOffset & ~MaxImm; 2510 ImmOffset -= Overflow; 2511 if ((int32_t)Overflow < 0) { 2512 Overflow += ImmOffset; 2513 ImmOffset = 0; 2514 } 2515 2516 if (Overflow != 0) { 2517 if (!BaseReg) { 2518 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2519 } else { 2520 auto OverflowVal = B.buildConstant(S32, Overflow); 2521 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2522 } 2523 } 2524 2525 if (!BaseReg) 2526 BaseReg = B.buildConstant(S32, 0).getReg(0); 2527 2528 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2529 } 2530 2531 /// Handle register layout difference for f16 images for some subtargets. 2532 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2533 MachineRegisterInfo &MRI, 2534 Register Reg) const { 2535 if (!ST.hasUnpackedD16VMem()) 2536 return Reg; 2537 2538 const LLT S16 = LLT::scalar(16); 2539 const LLT S32 = LLT::scalar(32); 2540 LLT StoreVT = MRI.getType(Reg); 2541 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2542 2543 auto Unmerge = B.buildUnmerge(S16, Reg); 2544 2545 SmallVector<Register, 4> WideRegs; 2546 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2547 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2548 2549 int NumElts = StoreVT.getNumElements(); 2550 2551 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2552 } 2553 2554 Register AMDGPULegalizerInfo::fixStoreSourceType( 2555 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2556 MachineRegisterInfo *MRI = B.getMRI(); 2557 LLT Ty = MRI->getType(VData); 2558 2559 const LLT S16 = LLT::scalar(16); 2560 2561 // Fixup illegal register types for i8 stores. 
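// Sub-dword scalar stores still take their data in a full 32-bit register, so any-extend s8/s16 values here; the memory operand keeps the original width, which is what later selects the BYTE/SHORT store variants.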
2562 if (Ty == LLT::scalar(8) || Ty == S16) { 2563 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2564 return AnyExt; 2565 } 2566 2567 if (Ty.isVector()) { 2568 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2569 if (IsFormat) 2570 return handleD16VData(B, *MRI, VData); 2571 } 2572 } 2573 2574 return VData; 2575 } 2576 2577 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2578 MachineRegisterInfo &MRI, 2579 MachineIRBuilder &B, 2580 bool IsTyped, 2581 bool IsFormat) const { 2582 B.setInstr(MI); 2583 2584 Register VData = MI.getOperand(1).getReg(); 2585 LLT Ty = MRI.getType(VData); 2586 LLT EltTy = Ty.getScalarType(); 2587 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2588 const LLT S32 = LLT::scalar(32); 2589 2590 VData = fixStoreSourceType(B, VData, IsFormat); 2591 Register RSrc = MI.getOperand(2).getReg(); 2592 2593 MachineMemOperand *MMO = *MI.memoperands_begin(); 2594 const int MemSize = MMO->getSize(); 2595 2596 unsigned ImmOffset; 2597 unsigned TotalOffset; 2598 2599 // The typed intrinsics add an immediate after the registers. 2600 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2601 2602 // The struct intrinsic variants add one additional operand over raw. 2603 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2604 Register VIndex; 2605 int OpOffset = 0; 2606 if (HasVIndex) { 2607 VIndex = MI.getOperand(3).getReg(); 2608 OpOffset = 1; 2609 } 2610 2611 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2612 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2613 2614 unsigned Format = 0; 2615 if (IsTyped) { 2616 Format = MI.getOperand(5 + OpOffset).getImm(); 2617 ++OpOffset; 2618 } 2619 2620 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2621 2622 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2623 if (TotalOffset != 0) 2624 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2625 2626 unsigned Opc; 2627 if (IsTyped) { 2628 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 2629 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 2630 } else if (IsFormat) { 2631 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 2632 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 2633 } else { 2634 switch (MemSize) { 2635 case 1: 2636 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 2637 break; 2638 case 2: 2639 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 2640 break; 2641 default: 2642 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 2643 break; 2644 } 2645 } 2646 2647 if (!VIndex) 2648 VIndex = B.buildConstant(S32, 0).getReg(0); 2649 2650 auto MIB = B.buildInstr(Opc) 2651 .addUse(VData) // vdata 2652 .addUse(RSrc) // rsrc 2653 .addUse(VIndex) // vindex 2654 .addUse(VOffset) // voffset 2655 .addUse(SOffset) // soffset 2656 .addImm(ImmOffset); // offset(imm) 2657 2658 if (IsTyped) 2659 MIB.addImm(Format); 2660 2661 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2662 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2663 .addMemOperand(MMO); 2664 2665 MI.eraseFromParent(); 2666 return true; 2667 } 2668 2669 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 2670 MachineRegisterInfo &MRI, 2671 MachineIRBuilder &B, 2672 bool IsFormat, 2673 bool IsTyped) const { 2674 B.setInstr(MI); 2675 2676 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
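// As with stores: decode the raw/struct operand layout, split the offset into voffset and immediate pieces, pick the matching G_AMDGPU_BUFFER_LOAD_* pseudo, and widen the destination for extending or unpacked-d16 loads so it can be truncated or repacked afterwards.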
2677 MachineMemOperand *MMO = *MI.memoperands_begin(); 2678 const int MemSize = MMO->getSize(); 2679 const LLT S32 = LLT::scalar(32); 2680 2681 Register Dst = MI.getOperand(0).getReg(); 2682 Register RSrc = MI.getOperand(2).getReg(); 2683 2684 // The typed intrinsics add an immediate after the registers. 2685 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2686 2687 // The struct intrinsic variants add one additional operand over raw. 2688 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2689 Register VIndex; 2690 int OpOffset = 0; 2691 if (HasVIndex) { 2692 VIndex = MI.getOperand(3).getReg(); 2693 OpOffset = 1; 2694 } 2695 2696 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2697 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2698 2699 unsigned Format = 0; 2700 if (IsTyped) { 2701 Format = MI.getOperand(5 + OpOffset).getImm(); 2702 ++OpOffset; 2703 } 2704 2705 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2706 unsigned ImmOffset; 2707 unsigned TotalOffset; 2708 2709 LLT Ty = MRI.getType(Dst); 2710 LLT EltTy = Ty.getScalarType(); 2711 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2712 const bool Unpacked = ST.hasUnpackedD16VMem(); 2713 2714 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2715 if (TotalOffset != 0) 2716 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2717 2718 unsigned Opc; 2719 2720 if (IsTyped) { 2721 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 2722 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 2723 } else if (IsFormat) { 2724 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 2725 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 2726 } else { 2727 switch (MemSize) { 2728 case 1: 2729 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 2730 break; 2731 case 2: 2732 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 2733 break; 2734 default: 2735 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 2736 break; 2737 } 2738 } 2739 2740 Register LoadDstReg; 2741 2742 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 2743 LLT UnpackedTy = Ty.changeElementSize(32); 2744 2745 if (IsExtLoad) 2746 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 2747 else if (Unpacked && IsD16 && Ty.isVector()) 2748 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 2749 else 2750 LoadDstReg = Dst; 2751 2752 if (!VIndex) 2753 VIndex = B.buildConstant(S32, 0).getReg(0); 2754 2755 auto MIB = B.buildInstr(Opc) 2756 .addDef(LoadDstReg) // vdata 2757 .addUse(RSrc) // rsrc 2758 .addUse(VIndex) // vindex 2759 .addUse(VOffset) // voffset 2760 .addUse(SOffset) // soffset 2761 .addImm(ImmOffset); // offset(imm) 2762 2763 if (IsTyped) 2764 MIB.addImm(Format); 2765 2766 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2767 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2768 .addMemOperand(MMO); 2769 2770 if (LoadDstReg != Dst) { 2771 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2772 2773 // The load result was widened above; convert it back to the expected result type.
2774 if (IsExtLoad) 2775 B.buildTrunc(Dst, LoadDstReg); 2776 else { 2777 // Repack to original 16-bit vector result 2778 // FIXME: G_TRUNC should work, but legalization currently fails 2779 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 2780 SmallVector<Register, 4> Repack; 2781 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 2782 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 2783 B.buildMerge(Dst, Repack); 2784 } 2785 } 2786 2787 MI.eraseFromParent(); 2788 return true; 2789 } 2790 2791 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 2792 MachineIRBuilder &B, 2793 bool IsInc) const { 2794 B.setInstr(MI); 2795 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 2796 AMDGPU::G_AMDGPU_ATOMIC_DEC; 2797 B.buildInstr(Opc) 2798 .addDef(MI.getOperand(0).getReg()) 2799 .addUse(MI.getOperand(2).getReg()) 2800 .addUse(MI.getOperand(3).getReg()) 2801 .cloneMemRefs(MI); 2802 MI.eraseFromParent(); 2803 return true; 2804 } 2805 2806 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 2807 switch (IntrID) { 2808 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 2809 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 2810 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 2811 case Intrinsic::amdgcn_raw_buffer_atomic_add: 2812 case Intrinsic::amdgcn_struct_buffer_atomic_add: 2813 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 2814 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 2815 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 2816 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 2817 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 2818 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 2819 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 2820 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 2821 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 2822 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 2823 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 2824 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 2825 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 2826 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 2827 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 2828 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 2829 case Intrinsic::amdgcn_raw_buffer_atomic_and: 2830 case Intrinsic::amdgcn_struct_buffer_atomic_and: 2831 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 2832 case Intrinsic::amdgcn_raw_buffer_atomic_or: 2833 case Intrinsic::amdgcn_struct_buffer_atomic_or: 2834 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 2835 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 2836 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 2837 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 2838 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 2839 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 2840 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 2841 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 2842 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 2843 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 2844 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 2845 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 2846 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 2847 default: 2848 llvm_unreachable("unhandled atomic opcode"); 2849 } 2850 } 2851 2852 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 2853 MachineIRBuilder &B, 2854 Intrinsic::ID IID) const { 2855 B.setInstr(MI); 2856 2857 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 2858 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 2859 2860 Register Dst = MI.getOperand(0).getReg(); 2861 Register VData = 
MI.getOperand(2).getReg(); 2862 2863 Register CmpVal; 2864 int OpOffset = 0; 2865 2866 if (IsCmpSwap) { 2867 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 2868 ++OpOffset; 2869 } 2870 2871 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 2872 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 2873 2874 // The struct intrinsic variants add one additional operand over raw. 2875 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2876 Register VIndex; 2877 if (HasVIndex) { 2878 VIndex = MI.getOperand(4 + OpOffset).getReg(); 2879 ++OpOffset; 2880 } 2881 2882 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 2883 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 2884 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 2885 2886 MachineMemOperand *MMO = *MI.memoperands_begin(); 2887 2888 unsigned ImmOffset; 2889 unsigned TotalOffset; 2890 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2891 if (TotalOffset != 0) 2892 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 2893 2894 if (!VIndex) 2895 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 2896 2897 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 2898 .addDef(Dst) 2899 .addUse(VData); // vdata 2900 2901 if (IsCmpSwap) 2902 MIB.addReg(CmpVal); 2903 2904 MIB.addUse(RSrc) // rsrc 2905 .addUse(VIndex) // vindex 2906 .addUse(VOffset) // voffset 2907 .addUse(SOffset) // soffset 2908 .addImm(ImmOffset) // offset(imm) 2909 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2910 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2911 .addMemOperand(MMO); 2912 2913 MI.eraseFromParent(); 2914 return true; 2915 } 2916 2917 // Produce a vector of s16 elements from s32 pieces. 2918 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg, 2919 ArrayRef<Register> UnmergeParts) { 2920 const LLT S16 = LLT::scalar(16); 2921 2922 SmallVector<Register, 4> RemergeParts(UnmergeParts.size()); 2923 for (int I = 0, E = UnmergeParts.size(); I != E; ++I) 2924 RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0); 2925 2926 B.buildBuildVector(DstReg, RemergeParts); 2927 } 2928 2929 /// Convert a set of s32 registers to a result vector with s16 elements. 2930 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg, 2931 ArrayRef<Register> UnmergeParts) { 2932 MachineRegisterInfo &MRI = *B.getMRI(); 2933 const LLT V2S16 = LLT::vector(2, 16); 2934 LLT TargetTy = MRI.getType(DstReg); 2935 int NumElts = UnmergeParts.size(); 2936 2937 if (NumElts == 1) { 2938 assert(TargetTy == V2S16); 2939 B.buildBitcast(DstReg, UnmergeParts[0]); 2940 return; 2941 } 2942 2943 SmallVector<Register, 4> RemergeParts(NumElts); 2944 for (int I = 0; I != NumElts; ++I) 2945 RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0); 2946 2947 if (TargetTy.getSizeInBits() == 32u * NumElts) { 2948 B.buildConcatVectors(DstReg, RemergeParts); 2949 return; 2950 } 2951 2952 const LLT V3S16 = LLT::vector(3, 16); 2953 const LLT V6S16 = LLT::vector(6, 16); 2954 2955 // Widen to v6s16 and unpack v3 parts. 2956 assert(TargetTy == V3S16); 2957 2958 RemergeParts.push_back(B.buildUndef(V2S16).getReg(0)); 2959 auto Concat = B.buildConcatVectors(V6S16, RemergeParts); 2960 B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat); 2961 } 2962 2963 // FIXME: Just vector trunc should be sufficient, but legalization is currently 2964 // broken.
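// Truncate each 32-bit element of the widened load result back to s16 and rebuild the destination vector element by element.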
2965 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg, 2966 Register WideDstReg) { 2967 const LLT S32 = LLT::scalar(32); 2968 const LLT S16 = LLT::scalar(16); 2969 2970 auto Unmerge = B.buildUnmerge(S32, WideDstReg); 2971 2972 int NumOps = Unmerge->getNumOperands() - 1; 2973 SmallVector<Register, 4> RemergeParts(NumOps); 2974 for (int I = 0; I != NumOps; ++I) 2975 RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0); 2976 2977 B.buildBuildVector(DstReg, RemergeParts); 2978 } 2979 2980 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 2981 MachineInstr &MI, MachineIRBuilder &B, 2982 GISelChangeObserver &Observer, 2983 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 2984 bool IsTFE = MI.getNumExplicitDefs() == 2; 2985 2986 // We are only processing the operands of d16 image operations on subtargets 2987 // that use the unpacked register layout, or need to repack the TFE result. 2988 2989 // TODO: Need to handle a16 images too 2990 // TODO: Do we need to guard against already legalized intrinsics? 2991 if (!IsTFE && !ST.hasUnpackedD16VMem()) 2992 return true; 2993 2994 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 2995 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 2996 2997 if (BaseOpcode->Atomic) // No d16 atomics, or TFE. 2998 return true; 2999 3000 B.setInstr(MI); 3001 3002 MachineRegisterInfo *MRI = B.getMRI(); 3003 const LLT S32 = LLT::scalar(32); 3004 const LLT S16 = LLT::scalar(16); 3005 3006 if (BaseOpcode->Store) { // No TFE for stores? 3007 Register VData = MI.getOperand(1).getReg(); 3008 LLT Ty = MRI->getType(VData); 3009 if (!Ty.isVector() || Ty.getElementType() != S16) 3010 return true; 3011 3012 B.setInstr(MI); 3013 3014 Observer.changingInstr(MI); 3015 MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData)); 3016 Observer.changedInstr(MI); 3017 return true; 3018 } 3019 3020 Register DstReg = MI.getOperand(0).getReg(); 3021 LLT Ty = MRI->getType(DstReg); 3022 const LLT EltTy = Ty.getScalarType(); 3023 const bool IsD16 = Ty.getScalarType() == S16; 3024 const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3025 3026 if (IsTFE) { 3027 // In the IR, TFE is supposed to be used with a 2 element struct return 3028 // type. The instruction really returns these two values in one contiguous 3029 // register, with one additional dword beyond the loaded data. Rewrite the 3030 // return type to use a single register result. 3031 Register Dst1Reg = MI.getOperand(1).getReg(); 3032 if (MRI->getType(Dst1Reg) != S32) 3033 return false; 3034 3035 // TODO: Make sure the TFE operand bit is set. 3036 3037 // The raw dword aligned data component of the load. The only legal cases 3038 // where this matters should be when using the packed D16 format, for 3039 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. 3040 LLT RoundedTy; 3041 LLT TFETy; 3042 3043 if (IsD16 && ST.hasUnpackedD16VMem()) { 3044 RoundedTy = LLT::scalarOrVector(NumElts, 32); 3045 TFETy = LLT::vector(NumElts + 1, 32); 3046 } else { 3047 unsigned EltSize = Ty.getScalarSizeInBits(); 3048 unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32; 3049 unsigned RoundedSize = 32 * RoundedElts; 3050 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3051 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3052 } 3053 3054 Register TFEReg = MRI->createGenericVirtualRegister(TFETy); 3055 Observer.changingInstr(MI); 3056 3057 MI.getOperand(0).setReg(TFEReg); 3058 MI.RemoveOperand(1); 3059 3060 Observer.changedInstr(MI); 3061 3062 // Insert after the instruction.
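// Emit the repacking code after the rewritten instruction so that the wide TFE result register is defined before it is split back into the data elements and the trailing status dword.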
3063 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3064 3065 // Now figure out how to copy the new result register back into the old 3066 // result. 3067 3068 SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg); 3069 int NumDataElts = TFETy.getNumElements() - 1; 3070 3071 if (!Ty.isVector()) { 3072 // Simplest case is a trivial unmerge (plus a truncate for d16). 3073 UnmergeResults[0] = Ty == S32 ? 3074 DstReg : MRI->createGenericVirtualRegister(S32); 3075 3076 B.buildUnmerge(UnmergeResults, TFEReg); 3077 if (Ty != S32) 3078 B.buildTrunc(DstReg, UnmergeResults[0]); 3079 return true; 3080 } 3081 3082 // We have to repack into a new vector of some kind. 3083 for (int I = 0; I != NumDataElts; ++I) 3084 UnmergeResults[I] = MRI->createGenericVirtualRegister(S32); 3085 B.buildUnmerge(UnmergeResults, TFEReg); 3086 3087 // Drop the final TFE element. 3088 ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts); 3089 3090 if (EltTy == S32) 3091 B.buildBuildVector(DstReg, DataPart); 3092 else if (ST.hasUnpackedD16VMem()) 3093 truncToS16Vector(B, DstReg, DataPart); 3094 else 3095 bitcastToS16Vector(B, DstReg, DataPart); 3096 3097 return true; 3098 } 3099 3100 // Must be an image load. 3101 if (!Ty.isVector() || Ty.getElementType() != S16) 3102 return true; 3103 3104 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3105 3106 LLT WidenedTy = Ty.changeElementType(S32); 3107 Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy); 3108 3109 Observer.changingInstr(MI); 3110 MI.getOperand(0).setReg(WideDstReg); 3111 Observer.changedInstr(MI); 3112 3113 repackUnpackedD16Load(B, DstReg, WideDstReg); 3114 return true; 3115 } 3116 3117 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 3118 MachineInstr &MI, MachineIRBuilder &B, 3119 GISelChangeObserver &Observer) const { 3120 Register Dst = MI.getOperand(0).getReg(); 3121 LLT Ty = B.getMRI()->getType(Dst); 3122 unsigned Size = Ty.getSizeInBits(); 3123 3124 // There are no 96-bit result scalar loads, but widening to 128-bit should 3125 // always be legal. We may need to restore this to a 96-bit result if it turns 3126 // out this needs to be converted to a vector load during RegBankSelect. 3127 if (isPowerOf2_32(Size)) 3128 return true; 3129 3130 LegalizerHelper Helper(B.getMF(), *this, Observer, B); 3131 B.setInstr(MI); 3132 3133 Observer.changingInstr(MI); 3134 3135 if (Ty.isVector()) 3136 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 3137 else 3138 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 3139 3140 Observer.changedInstr(MI); 3141 return true; 3142 } 3143 3144 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 3145 MachineIRBuilder &B, 3146 GISelChangeObserver &Observer) const { 3147 MachineRegisterInfo &MRI = *B.getMRI(); 3148 3149 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
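// verifyCFIntrinsic checks that the boolean result feeds exactly one G_BRCOND in the same block, optionally followed by an unconditional G_BR; that is what allows the intrinsic and the branch to be rewritten together into the SI_IF/SI_ELSE/SI_LOOP pseudos.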
3150 auto IntrID = MI.getIntrinsicID(); 3151 switch (IntrID) { 3152 case Intrinsic::amdgcn_if: 3153 case Intrinsic::amdgcn_else: { 3154 MachineInstr *Br = nullptr; 3155 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3156 const SIRegisterInfo *TRI 3157 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3158 3159 B.setInstr(*BrCond); 3160 Register Def = MI.getOperand(1).getReg(); 3161 Register Use = MI.getOperand(3).getReg(); 3162 3163 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 3164 if (Br) 3165 BrTarget = Br->getOperand(0).getMBB(); 3166 3167 if (IntrID == Intrinsic::amdgcn_if) { 3168 B.buildInstr(AMDGPU::SI_IF) 3169 .addDef(Def) 3170 .addUse(Use) 3171 .addMBB(BrTarget); 3172 } else { 3173 B.buildInstr(AMDGPU::SI_ELSE) 3174 .addDef(Def) 3175 .addUse(Use) 3176 .addMBB(BrTarget) 3177 .addImm(0); 3178 } 3179 3180 if (Br) 3181 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 3182 3183 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 3184 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 3185 MI.eraseFromParent(); 3186 BrCond->eraseFromParent(); 3187 return true; 3188 } 3189 3190 return false; 3191 } 3192 case Intrinsic::amdgcn_loop: { 3193 MachineInstr *Br = nullptr; 3194 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 3195 const SIRegisterInfo *TRI 3196 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 3197 3198 B.setInstr(*BrCond); 3199 3200 // FIXME: Need to adjust branch targets based on unconditional branch. 3201 Register Reg = MI.getOperand(2).getReg(); 3202 B.buildInstr(AMDGPU::SI_LOOP) 3203 .addUse(Reg) 3204 .addMBB(BrCond->getOperand(1).getMBB()); 3205 MI.eraseFromParent(); 3206 BrCond->eraseFromParent(); 3207 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 3208 return true; 3209 } 3210 3211 return false; 3212 } 3213 case Intrinsic::amdgcn_kernarg_segment_ptr: 3214 return legalizePreloadedArgIntrin( 3215 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3216 case Intrinsic::amdgcn_implicitarg_ptr: 3217 return legalizeImplicitArgPtr(MI, MRI, B); 3218 case Intrinsic::amdgcn_workitem_id_x: 3219 return legalizePreloadedArgIntrin(MI, MRI, B, 3220 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 3221 case Intrinsic::amdgcn_workitem_id_y: 3222 return legalizePreloadedArgIntrin(MI, MRI, B, 3223 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 3224 case Intrinsic::amdgcn_workitem_id_z: 3225 return legalizePreloadedArgIntrin(MI, MRI, B, 3226 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 3227 case Intrinsic::amdgcn_workgroup_id_x: 3228 return legalizePreloadedArgIntrin(MI, MRI, B, 3229 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 3230 case Intrinsic::amdgcn_workgroup_id_y: 3231 return legalizePreloadedArgIntrin(MI, MRI, B, 3232 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 3233 case Intrinsic::amdgcn_workgroup_id_z: 3234 return legalizePreloadedArgIntrin(MI, MRI, B, 3235 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 3236 case Intrinsic::amdgcn_dispatch_ptr: 3237 return legalizePreloadedArgIntrin(MI, MRI, B, 3238 AMDGPUFunctionArgInfo::DISPATCH_PTR); 3239 case Intrinsic::amdgcn_queue_ptr: 3240 return legalizePreloadedArgIntrin(MI, MRI, B, 3241 AMDGPUFunctionArgInfo::QUEUE_PTR); 3242 case Intrinsic::amdgcn_implicit_buffer_ptr: 3243 return legalizePreloadedArgIntrin( 3244 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 3245 case Intrinsic::amdgcn_dispatch_id: 3246 return legalizePreloadedArgIntrin(MI, MRI, B, 3247 AMDGPUFunctionArgInfo::DISPATCH_ID); 3248 case Intrinsic::amdgcn_fdiv_fast: 3249 return legalizeFDIVFastIntrin(MI, MRI, B); 
3250 case Intrinsic::amdgcn_is_shared: 3251 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 3252 case Intrinsic::amdgcn_is_private: 3253 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 3254 case Intrinsic::amdgcn_wavefrontsize: { 3255 B.setInstr(MI); 3256 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 3257 MI.eraseFromParent(); 3258 return true; 3259 } 3260 case Intrinsic::amdgcn_s_buffer_load: 3261 return legalizeSBufferLoad(MI, B, Observer); 3262 case Intrinsic::amdgcn_raw_buffer_store: 3263 case Intrinsic::amdgcn_struct_buffer_store: 3264 return legalizeBufferStore(MI, MRI, B, false, false); 3265 case Intrinsic::amdgcn_raw_buffer_store_format: 3266 case Intrinsic::amdgcn_struct_buffer_store_format: 3267 return legalizeBufferStore(MI, MRI, B, false, true); 3268 case Intrinsic::amdgcn_raw_tbuffer_store: 3269 case Intrinsic::amdgcn_struct_tbuffer_store: 3270 return legalizeBufferStore(MI, MRI, B, true, true); 3271 case Intrinsic::amdgcn_raw_buffer_load: 3272 case Intrinsic::amdgcn_struct_buffer_load: 3273 return legalizeBufferLoad(MI, MRI, B, false, false); 3274 case Intrinsic::amdgcn_raw_buffer_load_format: 3275 case Intrinsic::amdgcn_struct_buffer_load_format: 3276 return legalizeBufferLoad(MI, MRI, B, true, false); 3277 case Intrinsic::amdgcn_raw_tbuffer_load: 3278 case Intrinsic::amdgcn_struct_tbuffer_load: 3279 return legalizeBufferLoad(MI, MRI, B, true, true); 3280 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3281 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3282 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3283 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3284 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3285 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3286 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3287 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3288 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3289 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3290 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3291 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3292 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3293 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3294 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3295 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3296 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3297 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3298 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3299 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3300 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3301 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3302 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3303 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3304 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3305 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3306 return legalizeBufferAtomic(MI, B, IntrID); 3307 case Intrinsic::amdgcn_atomic_inc: 3308 return legalizeAtomicIncDec(MI, B, true); 3309 case Intrinsic::amdgcn_atomic_dec: 3310 return legalizeAtomicIncDec(MI, B, false); 3311 default: { 3312 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 3313 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 3314 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 3315 return true; 3316 } 3317 } 3318 3319 return true; 3320 } 3321