//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}
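// Illustrative note (added comment, not from the original source):
// fewerEltsToSize64Vector breaks a wide vector into roughly 64-bit pieces.
// For an assumed v5s16 input (80 bits), Pieces = (80 + 63) / 64 = 2 and
// NewNumElts = (5 + 1) / 2 = 3, so the mutation requests v3s16.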
// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinLegalScalarShiftTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);
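  // Illustrative note (added comment, not from the original source): with the
  // integer rules above, an assumed s8 G_ADD is widened to s16 on subtargets
  // with 16-bit instructions (clampScalar(0, S16, S32)) and to s32 otherwise,
  // while a v2s32 G_ADD is scalarized into two s32 adds.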
  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), })
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
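  // Illustrative note (added comment, not from the original source): under
  // these rules a v2s16 G_FADD stays legal only on subtargets with packed
  // (VOP3P) 16-bit instructions; with only has16BitInsts() it is scalarized
  // into two s16 adds, and without 16-bit instructions each half is widened
  // to s32.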
  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
    .legalFor({S32})
      // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S32, LLT::scalar(24)}})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
    .scalarize(0)
    .alwaysLegal();

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });
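  // Illustrative note (added comment, not from the original source): for
  // G_INTTOPTR the integer operand is resized to the pointer width, so an
  // assumed s16 source feeding a 64-bit flat pointer is widened to s64, and an
  // s128 source feeding a 32-bit private pointer is narrowed to s32.
  // G_PTRTOINT below mirrors this by resizing the integer result instead.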
  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.
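  // Illustrative examples (added comment, not from the original source) of
  // the splitting predicate above: a 96-bit (3-dword) global load is split on
  // subtargets without dwordx3 load/store support, and a hypothetical v16s64
  // (1024-bit) global load exceeds maxSizeForAddrSpace's 512-bit limit and is
  // broken into smaller pieces by the fewerElementsIf rule below.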
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector
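  // Illustrative note (added comment, not from the original source): the
  // custom cmpxchg lowering (see legalizeAtomicCmpXChg below) packs the new
  // value and compare value into a two-element vector operand of
  // G_AMDGPU_ATOMIC_CMPXCHG, which matches the register layout the
  // BUFFER/FLAT_ATOMIC_CMP_SWAP instructions expect.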
  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    // TODO: Support 16-bit shift amounts
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });
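  // Illustrative note (added comment, not from the original source): for the
  // custom vector-element cases above, a constant index is folded into a
  // plain G_EXTRACT/G_INSERT at a fixed bit offset (see
  // legalizeExtractVectorElt and legalizeInsertVectorElt below), while a
  // dynamic index is left for the register-indexing path during selection.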
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)

      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }
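  // Illustrative arithmetic for the widening mutation above (added comment,
  // not from the original source): a hypothetical s65 wide type is neither a
  // power of 2 nor a multiple of 16, so it widens to the next power of 2,
  // s128; an s257 wide type would first compute 512, then round down to the
  // next multiple of 64 above its size, giving s320.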
  // TODO: Make legal for s32, s64. s64 case needs break down in regbankselect.
  getActionDefinitionsBuilder(G_SEXT_INREG)
    .clampScalar(0, MinLegalScalarShiftTy, S64)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
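// Added explanatory note (not from the original source): getSegmentAperture
// below returns the high 32 bits of the 64-bit flat address range that maps
// to the LDS (local) or scratch (private) segment. On subtargets with
// aperture registers it is read with S_GETREG_B32; otherwise it is loaded
// from the amd_queue_t structure reachable through the queue pointer.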
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
  assert(Mask.size() == 2);

  // If one half is undef, the other is trivially in the same reg.
  if (Mask[0] == -1 || Mask[1] == -1)
    return true;
  return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) ||
         ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3));
}

bool AMDGPULegalizerInfo::legalizeShuffleVector(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT V2S16 = LLT::vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src0);

  if (SrcTy == V2S16 && DstTy == V2S16 &&
      isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
      }

      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
        Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
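  // G_FMAD is only kept as-is when denormals are flushed for the result type;
  // otherwise fall through and let LegalizerHelper::lowerFMad expand it to
  // G_FMUL + G_FADD.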
  if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(SITargetLowering::isFlatGlobalAddrSpace(
           MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

  B.setInstr(MI);
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineInstr *&Br) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  if (UseMI.getParent() != MI.getParent() ||
      UseMI.getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR
  MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
  if (Next != MI.getParent()->end()) {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
  }

  return &UseMI;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
1918 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 1919 if (!MRI.getVRegDef(LiveIn)) { 1920 // FIXME: Should have scoped insert pt 1921 MachineBasicBlock &OrigInsBB = B.getMBB(); 1922 auto OrigInsPt = B.getInsertPt(); 1923 1924 MachineBasicBlock &EntryMBB = B.getMF().front(); 1925 EntryMBB.addLiveIn(Arg->getRegister()); 1926 B.setInsertPt(EntryMBB, EntryMBB.begin()); 1927 B.buildCopy(LiveIn, Arg->getRegister()); 1928 1929 B.setInsertPt(OrigInsBB, OrigInsPt); 1930 } 1931 1932 return true; 1933 } 1934 1935 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 1936 MachineInstr &MI, 1937 MachineRegisterInfo &MRI, 1938 MachineIRBuilder &B, 1939 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 1940 B.setInstr(MI); 1941 1942 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 1943 1944 const ArgDescriptor *Arg; 1945 const TargetRegisterClass *RC; 1946 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 1947 if (!Arg) { 1948 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 1949 return false; 1950 } 1951 1952 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 1953 MI.eraseFromParent(); 1954 return true; 1955 } 1956 1957 return false; 1958 } 1959 1960 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 1961 MachineRegisterInfo &MRI, 1962 MachineIRBuilder &B) const { 1963 B.setInstr(MI); 1964 Register Dst = MI.getOperand(0).getReg(); 1965 LLT DstTy = MRI.getType(Dst); 1966 LLT S16 = LLT::scalar(16); 1967 LLT S32 = LLT::scalar(32); 1968 LLT S64 = LLT::scalar(64); 1969 1970 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 1971 return true; 1972 1973 if (DstTy == S16) 1974 return legalizeFDIV16(MI, MRI, B); 1975 if (DstTy == S32) 1976 return legalizeFDIV32(MI, MRI, B); 1977 if (DstTy == S64) 1978 return legalizeFDIV64(MI, MRI, B); 1979 1980 return false; 1981 } 1982 1983 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 1984 MachineRegisterInfo &MRI, 1985 MachineIRBuilder &B) const { 1986 Register Res = MI.getOperand(0).getReg(); 1987 Register LHS = MI.getOperand(1).getReg(); 1988 Register RHS = MI.getOperand(2).getReg(); 1989 1990 uint16_t Flags = MI.getFlags(); 1991 1992 LLT ResTy = MRI.getType(Res); 1993 LLT S32 = LLT::scalar(32); 1994 LLT S64 = LLT::scalar(64); 1995 1996 const MachineFunction &MF = B.getMF(); 1997 bool Unsafe = 1998 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 1999 2000 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2001 return false; 2002 2003 if (!Unsafe && ResTy == S32 && 2004 MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals) 2005 return false; 2006 2007 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2008 // 1 / x -> RCP(x) 2009 if (CLHS->isExactlyValue(1.0)) { 2010 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2011 .addUse(RHS) 2012 .setMIFlags(Flags); 2013 2014 MI.eraseFromParent(); 2015 return true; 2016 } 2017 2018 // -1 / x -> RCP( FNEG(x) ) 2019 if (CLHS->isExactlyValue(-1.0)) { 2020 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2021 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2022 .addUse(FNeg.getReg(0)) 2023 .setMIFlags(Flags); 2024 2025 MI.eraseFromParent(); 2026 return true; 2027 } 2028 } 2029 2030 // x / y -> x * (1.0 / y) 2031 if (Unsafe) { 2032 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2033 .addUse(RHS) 2034 .setMIFlags(Flags); 2035 B.buildFMul(Res, LHS, RCP, Flags); 2036 2037 MI.eraseFromParent(); 2038 return true; 2039 } 2040 2041 return false; 2042 } 2043 2044 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2045 MachineRegisterInfo &MRI, 2046 MachineIRBuilder &B) const { 2047 B.setInstr(MI); 2048 Register Res = MI.getOperand(0).getReg(); 2049 Register LHS = MI.getOperand(1).getReg(); 2050 Register RHS = MI.getOperand(2).getReg(); 2051 2052 uint16_t Flags = MI.getFlags(); 2053 2054 LLT S16 = LLT::scalar(16); 2055 LLT S32 = LLT::scalar(32); 2056 2057 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2058 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2059 2060 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2061 .addUse(RHSExt.getReg(0)) 2062 .setMIFlags(Flags); 2063 2064 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2065 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2066 2067 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2068 .addUse(RDst.getReg(0)) 2069 .addUse(RHS) 2070 .addUse(LHS) 2071 .setMIFlags(Flags); 2072 2073 MI.eraseFromParent(); 2074 return true; 2075 } 2076 2077 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2078 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2079 static void toggleSPDenormMode(bool Enable, 2080 MachineIRBuilder &B, 2081 const GCNSubtarget &ST, 2082 AMDGPU::SIModeRegisterDefaults Mode) { 2083 // Set SP denorm mode to this value. 2084 unsigned SPDenormMode = 2085 Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; 2086 2087 if (ST.hasDenormModeInst()) { 2088 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2089 unsigned DPDenormModeDefault = Mode.FP64FP16Denormals 2090 ? FP_DENORM_FLUSH_NONE 2091 : FP_DENORM_FLUSH_IN_FLUSH_OUT; 2092 2093 unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2094 B.buildInstr(AMDGPU::S_DENORM_MODE) 2095 .addImm(NewDenormModeValue); 2096 2097 } else { 2098 // Select FP32 bit field in mode register. 2099 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2100 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2101 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2102 2103 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2104 .addImm(SPDenormMode) 2105 .addImm(SPDenormModeBitField); 2106 } 2107 } 2108 2109 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2110 MachineRegisterInfo &MRI, 2111 MachineIRBuilder &B) const { 2112 B.setInstr(MI); 2113 Register Res = MI.getOperand(0).getReg(); 2114 Register LHS = MI.getOperand(1).getReg(); 2115 Register RHS = MI.getOperand(2).getReg(); 2116 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2117 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2118 2119 uint16_t Flags = MI.getFlags(); 2120 2121 LLT S32 = LLT::scalar(32); 2122 LLT S1 = LLT::scalar(1); 2123 2124 auto One = B.buildFConstant(S32, 1.0f); 2125 2126 auto DenominatorScaled = 2127 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2128 .addUse(RHS) 2129 .addUse(LHS) 2130 .addImm(1) 2131 .setMIFlags(Flags); 2132 auto NumeratorScaled = 2133 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2134 .addUse(LHS) 2135 .addUse(RHS) 2136 .addImm(0) 2137 .setMIFlags(Flags); 2138 2139 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2140 .addUse(DenominatorScaled.getReg(0)) 2141 .setMIFlags(Flags); 2142 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2143 2144 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2145 // aren't modeled as reading it. 
  if (!Mode.FP32Denormals)
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.FP32Denormals)
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.
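    //
    // Recover it manually: compare the high 32 bits of the numerator and
    // denominator against the high 32 bits of the two div_scale results, and
    // xor the comparisons to form the scale condition passed to div_fmas.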
2221 2222 Scale = MRI.createGenericVirtualRegister(S1); 2223 2224 LLT S32 = LLT::scalar(32); 2225 2226 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2227 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2228 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2229 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2230 2231 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2232 Scale1Unmerge.getReg(1)); 2233 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2234 Scale0Unmerge.getReg(1)); 2235 B.buildXor(Scale, CmpNum, CmpDen); 2236 } else { 2237 Scale = DivScale1.getReg(1); 2238 } 2239 2240 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2241 .addUse(Fma4.getReg(0)) 2242 .addUse(Fma3.getReg(0)) 2243 .addUse(Mul.getReg(0)) 2244 .addUse(Scale) 2245 .setMIFlags(Flags); 2246 2247 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2248 .addUse(Fmas.getReg(0)) 2249 .addUse(RHS) 2250 .addUse(LHS) 2251 .setMIFlags(Flags); 2252 2253 MI.eraseFromParent(); 2254 return true; 2255 } 2256 2257 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2258 MachineRegisterInfo &MRI, 2259 MachineIRBuilder &B) const { 2260 B.setInstr(MI); 2261 Register Res = MI.getOperand(0).getReg(); 2262 Register LHS = MI.getOperand(2).getReg(); 2263 Register RHS = MI.getOperand(3).getReg(); 2264 uint16_t Flags = MI.getFlags(); 2265 2266 LLT S32 = LLT::scalar(32); 2267 LLT S1 = LLT::scalar(1); 2268 2269 auto Abs = B.buildFAbs(S32, RHS, Flags); 2270 const APFloat C0Val(1.0f); 2271 2272 auto C0 = B.buildConstant(S32, 0x6f800000); 2273 auto C1 = B.buildConstant(S32, 0x2f800000); 2274 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2275 2276 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2277 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2278 2279 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2280 2281 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2282 .addUse(Mul0.getReg(0)) 2283 .setMIFlags(Flags); 2284 2285 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2286 2287 B.buildFMul(Res, Sel, Mul1, Flags); 2288 2289 MI.eraseFromParent(); 2290 return true; 2291 } 2292 2293 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2294 MachineRegisterInfo &MRI, 2295 MachineIRBuilder &B) const { 2296 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2297 if (!MFI->isEntryFunction()) { 2298 return legalizePreloadedArgIntrin(MI, MRI, B, 2299 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2300 } 2301 2302 B.setInstr(MI); 2303 2304 uint64_t Offset = 2305 ST.getTargetLowering()->getImplicitParameterOffset( 2306 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2307 Register DstReg = MI.getOperand(0).getReg(); 2308 LLT DstTy = MRI.getType(DstReg); 2309 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2310 2311 const ArgDescriptor *Arg; 2312 const TargetRegisterClass *RC; 2313 std::tie(Arg, RC) 2314 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2315 if (!Arg) 2316 return false; 2317 2318 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2319 if (!loadInputValue(KernargPtrReg, B, Arg)) 2320 return false; 2321 2322 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2323 MI.eraseFromParent(); 2324 return true; 2325 } 2326 2327 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2328 MachineRegisterInfo &MRI, 2329 MachineIRBuilder &B, 2330 unsigned AddrSpace) const { 2331 B.setInstr(MI); 2332 
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 2333 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2334 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2335 MI.eraseFromParent(); 2336 return true; 2337 } 2338 2339 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2340 // offset (the offset that is included in bounds checking and swizzling, to be 2341 // split between the instruction's voffset and immoffset fields) and soffset 2342 // (the offset that is excluded from bounds checking and swizzling, to go in 2343 // the instruction's soffset field). This function takes the first kind of 2344 // offset and figures out how to split it between voffset and immoffset. 2345 std::tuple<Register, unsigned, unsigned> 2346 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2347 Register OrigOffset) const { 2348 const unsigned MaxImm = 4095; 2349 Register BaseReg; 2350 unsigned TotalConstOffset; 2351 MachineInstr *OffsetDef; 2352 const LLT S32 = LLT::scalar(32); 2353 2354 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2355 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2356 2357 unsigned ImmOffset = TotalConstOffset; 2358 2359 // If the immediate value is too big for the immoffset field, put the value 2360 // and -4096 into the immoffset field so that the value that is copied/added 2361 // for the voffset field is a multiple of 4096, and it stands more chance 2362 // of being CSEd with the copy/add for another similar load/store. 2363 // However, do not do that rounding down to a multiple of 4096 if that is a 2364 // negative number, as it appears to be illegal to have a negative offset 2365 // in the vgpr, even if adding the immediate offset makes it positive. 2366 unsigned Overflow = ImmOffset & ~MaxImm; 2367 ImmOffset -= Overflow; 2368 if ((int32_t)Overflow < 0) { 2369 Overflow += ImmOffset; 2370 ImmOffset = 0; 2371 } 2372 2373 if (Overflow != 0) { 2374 if (!BaseReg) { 2375 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2376 } else { 2377 auto OverflowVal = B.buildConstant(S32, Overflow); 2378 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2379 } 2380 } 2381 2382 if (!BaseReg) 2383 BaseReg = B.buildConstant(S32, 0).getReg(0); 2384 2385 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2386 } 2387 2388 /// Handle register layout difference for f16 images for some subtargets. 2389 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2390 MachineRegisterInfo &MRI, 2391 Register Reg) const { 2392 if (!ST.hasUnpackedD16VMem()) 2393 return Reg; 2394 2395 const LLT S16 = LLT::scalar(16); 2396 const LLT S32 = LLT::scalar(32); 2397 LLT StoreVT = MRI.getType(Reg); 2398 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2399 2400 auto Unmerge = B.buildUnmerge(S16, Reg); 2401 2402 SmallVector<Register, 4> WideRegs; 2403 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2404 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2405 2406 int NumElts = StoreVT.getNumElements(); 2407 2408 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2409 } 2410 2411 Register AMDGPULegalizerInfo::fixStoreSourceType( 2412 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2413 MachineRegisterInfo *MRI = B.getMRI(); 2414 LLT Ty = MRI->getType(VData); 2415 2416 const LLT S16 = LLT::scalar(16); 2417 2418 // Fixup illegal register types for i8 stores. 
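  // Sub-dword scalars live in 32-bit VGPRs, so any-extend s8/s16 sources to
  // s32; small s16 vectors may additionally need repacking through
  // handleD16VData on subtargets with unpacked D16 memory instructions.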
2419 if (Ty == LLT::scalar(8) || Ty == S16) { 2420 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2421 return AnyExt; 2422 } 2423 2424 if (Ty.isVector()) { 2425 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2426 if (IsFormat) 2427 return handleD16VData(B, *MRI, VData); 2428 } 2429 } 2430 2431 return VData; 2432 } 2433 2434 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2435 MachineRegisterInfo &MRI, 2436 MachineIRBuilder &B, 2437 bool IsTyped, 2438 bool IsFormat) const { 2439 B.setInstr(MI); 2440 2441 Register VData = MI.getOperand(1).getReg(); 2442 LLT Ty = MRI.getType(VData); 2443 LLT EltTy = Ty.getScalarType(); 2444 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2445 const LLT S32 = LLT::scalar(32); 2446 2447 VData = fixStoreSourceType(B, VData, IsFormat); 2448 Register RSrc = MI.getOperand(2).getReg(); 2449 2450 MachineMemOperand *MMO = *MI.memoperands_begin(); 2451 const int MemSize = MMO->getSize(); 2452 2453 unsigned ImmOffset; 2454 unsigned TotalOffset; 2455 2456 // The typed intrinsics add an immediate after the registers. 2457 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2458 2459 // The struct intrinsic variants add one additional operand over raw. 2460 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2461 Register VIndex; 2462 int OpOffset = 0; 2463 if (HasVIndex) { 2464 VIndex = MI.getOperand(3).getReg(); 2465 OpOffset = 1; 2466 } 2467 2468 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2469 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2470 2471 unsigned Format = 0; 2472 if (IsTyped) { 2473 Format = MI.getOperand(5 + OpOffset).getImm(); 2474 ++OpOffset; 2475 } 2476 2477 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2478 2479 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2480 if (TotalOffset != 0) 2481 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2482 2483 unsigned Opc; 2484 if (IsTyped) { 2485 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 2486 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 2487 } else if (IsFormat) { 2488 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 2489 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 2490 } else { 2491 switch (MemSize) { 2492 case 1: 2493 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 2494 break; 2495 case 2: 2496 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 2497 break; 2498 default: 2499 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 2500 break; 2501 } 2502 } 2503 2504 if (!VIndex) 2505 VIndex = B.buildConstant(S32, 0).getReg(0); 2506 2507 auto MIB = B.buildInstr(Opc) 2508 .addUse(VData) // vdata 2509 .addUse(RSrc) // rsrc 2510 .addUse(VIndex) // vindex 2511 .addUse(VOffset) // voffset 2512 .addUse(SOffset) // soffset 2513 .addImm(ImmOffset); // offset(imm) 2514 2515 if (IsTyped) 2516 MIB.addImm(Format); 2517 2518 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2519 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2520 .addMemOperand(MMO); 2521 2522 MI.eraseFromParent(); 2523 return true; 2524 } 2525 2526 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 2527 MachineRegisterInfo &MRI, 2528 MachineIRBuilder &B, 2529 bool IsFormat, 2530 bool IsTyped) const { 2531 B.setInstr(MI); 2532 2533 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
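  // Pick the buffer-load pseudo for the intrinsic kind and memory size, and
  // split the offset into the voffset register and the immediate offset
  // field. Sub-dword and d16 results are loaded into a wider temporary and
  // narrowed back after the load.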
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg) // vdata
    .addUse(RSrc)       // rsrc
    .addUse(VIndex)     // vindex
    .addUse(VOffset)    // voffset
    .addUse(SOffset)    // soffset
    .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was loaded into a wider register for an extending or
    // unpacked d16 load; narrow it back to the original type.
2631 if (IsExtLoad) 2632 B.buildTrunc(Dst, LoadDstReg); 2633 else { 2634 // Repack to original 16-bit vector result 2635 // FIXME: G_TRUNC should work, but legalization currently fails 2636 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 2637 SmallVector<Register, 4> Repack; 2638 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 2639 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 2640 B.buildMerge(Dst, Repack); 2641 } 2642 } 2643 2644 MI.eraseFromParent(); 2645 return true; 2646 } 2647 2648 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 2649 MachineIRBuilder &B, 2650 bool IsInc) const { 2651 B.setInstr(MI); 2652 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 2653 AMDGPU::G_AMDGPU_ATOMIC_DEC; 2654 B.buildInstr(Opc) 2655 .addDef(MI.getOperand(0).getReg()) 2656 .addUse(MI.getOperand(2).getReg()) 2657 .addUse(MI.getOperand(3).getReg()) 2658 .cloneMemRefs(MI); 2659 MI.eraseFromParent(); 2660 return true; 2661 } 2662 2663 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 2664 switch (IntrID) { 2665 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 2666 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 2667 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 2668 case Intrinsic::amdgcn_raw_buffer_atomic_add: 2669 case Intrinsic::amdgcn_struct_buffer_atomic_add: 2670 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 2671 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 2672 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 2673 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 2674 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 2675 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 2676 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 2677 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 2678 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 2679 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 2680 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 2681 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 2682 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 2683 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 2684 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 2685 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 2686 case Intrinsic::amdgcn_raw_buffer_atomic_and: 2687 case Intrinsic::amdgcn_struct_buffer_atomic_and: 2688 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 2689 case Intrinsic::amdgcn_raw_buffer_atomic_or: 2690 case Intrinsic::amdgcn_struct_buffer_atomic_or: 2691 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 2692 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 2693 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 2694 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 2695 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 2696 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 2697 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 2698 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 2699 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 2700 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 2701 default: 2702 llvm_unreachable("unhandled atomic opcode"); 2703 } 2704 } 2705 2706 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 2707 MachineIRBuilder &B, 2708 Intrinsic::ID IID) const { 2709 B.setInstr(MI); 2710 2711 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 2712 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 2713 2714 Register Dst = MI.getOperand(0).getReg(); 2715 Register VData = MI.getOperand(2).getReg(); 2716 2717 Register CmpVal; 2718 int OpOffset = 0; 2719 2720 if (IsCmpSwap) { 2721 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 2722 ++OpOffset; 
2723 } 2724 2725 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 2726 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 2727 2728 // The struct intrinsic variants add one additional operand over raw. 2729 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2730 Register VIndex; 2731 if (HasVIndex) { 2732 VIndex = MI.getOperand(4).getReg(); 2733 ++OpOffset; 2734 } 2735 2736 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 2737 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 2738 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 2739 2740 MachineMemOperand *MMO = *MI.memoperands_begin(); 2741 2742 unsigned ImmOffset; 2743 unsigned TotalOffset; 2744 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2745 if (TotalOffset != 0) 2746 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 2747 2748 if (!VIndex) 2749 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 2750 2751 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 2752 .addDef(Dst) 2753 .addUse(VData); // vdata 2754 2755 if (IsCmpSwap) 2756 MIB.addReg(CmpVal); 2757 2758 MIB.addUse(RSrc) // rsrc 2759 .addUse(VIndex) // vindex 2760 .addUse(VOffset) // voffset 2761 .addUse(SOffset) // soffset 2762 .addImm(ImmOffset) // offset(imm) 2763 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2764 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2765 .addMemOperand(MMO); 2766 2767 MI.eraseFromParent(); 2768 return true; 2769 } 2770 2771 bool AMDGPULegalizerInfo::legalizeIntrinsic( 2772 MachineInstr &MI, MachineIRBuilder &B, 2773 GISelChangeObserver &Observer) const { 2774 MachineRegisterInfo &MRI = *B.getMRI(); 2775 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 2776 auto IntrID = MI.getIntrinsicID(); 2777 switch (IntrID) { 2778 case Intrinsic::amdgcn_if: 2779 case Intrinsic::amdgcn_else: { 2780 MachineInstr *Br = nullptr; 2781 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 2782 const SIRegisterInfo *TRI 2783 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2784 2785 B.setInstr(*BrCond); 2786 Register Def = MI.getOperand(1).getReg(); 2787 Register Use = MI.getOperand(3).getReg(); 2788 2789 MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 2790 if (Br) 2791 BrTarget = Br->getOperand(0).getMBB(); 2792 2793 if (IntrID == Intrinsic::amdgcn_if) { 2794 B.buildInstr(AMDGPU::SI_IF) 2795 .addDef(Def) 2796 .addUse(Use) 2797 .addMBB(BrTarget); 2798 } else { 2799 B.buildInstr(AMDGPU::SI_ELSE) 2800 .addDef(Def) 2801 .addUse(Use) 2802 .addMBB(BrTarget) 2803 .addImm(0); 2804 } 2805 2806 if (Br) 2807 Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 2808 2809 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 2810 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 2811 MI.eraseFromParent(); 2812 BrCond->eraseFromParent(); 2813 return true; 2814 } 2815 2816 return false; 2817 } 2818 case Intrinsic::amdgcn_loop: { 2819 MachineInstr *Br = nullptr; 2820 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 2821 const SIRegisterInfo *TRI 2822 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 2823 2824 B.setInstr(*BrCond); 2825 2826 // FIXME: Need to adjust branch targets based on unconditional branch. 
2827 Register Reg = MI.getOperand(2).getReg(); 2828 B.buildInstr(AMDGPU::SI_LOOP) 2829 .addUse(Reg) 2830 .addMBB(BrCond->getOperand(1).getMBB()); 2831 MI.eraseFromParent(); 2832 BrCond->eraseFromParent(); 2833 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 2834 return true; 2835 } 2836 2837 return false; 2838 } 2839 case Intrinsic::amdgcn_kernarg_segment_ptr: 2840 return legalizePreloadedArgIntrin( 2841 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2842 case Intrinsic::amdgcn_implicitarg_ptr: 2843 return legalizeImplicitArgPtr(MI, MRI, B); 2844 case Intrinsic::amdgcn_workitem_id_x: 2845 return legalizePreloadedArgIntrin(MI, MRI, B, 2846 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 2847 case Intrinsic::amdgcn_workitem_id_y: 2848 return legalizePreloadedArgIntrin(MI, MRI, B, 2849 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 2850 case Intrinsic::amdgcn_workitem_id_z: 2851 return legalizePreloadedArgIntrin(MI, MRI, B, 2852 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 2853 case Intrinsic::amdgcn_workgroup_id_x: 2854 return legalizePreloadedArgIntrin(MI, MRI, B, 2855 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 2856 case Intrinsic::amdgcn_workgroup_id_y: 2857 return legalizePreloadedArgIntrin(MI, MRI, B, 2858 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 2859 case Intrinsic::amdgcn_workgroup_id_z: 2860 return legalizePreloadedArgIntrin(MI, MRI, B, 2861 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 2862 case Intrinsic::amdgcn_dispatch_ptr: 2863 return legalizePreloadedArgIntrin(MI, MRI, B, 2864 AMDGPUFunctionArgInfo::DISPATCH_PTR); 2865 case Intrinsic::amdgcn_queue_ptr: 2866 return legalizePreloadedArgIntrin(MI, MRI, B, 2867 AMDGPUFunctionArgInfo::QUEUE_PTR); 2868 case Intrinsic::amdgcn_implicit_buffer_ptr: 2869 return legalizePreloadedArgIntrin( 2870 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 2871 case Intrinsic::amdgcn_dispatch_id: 2872 return legalizePreloadedArgIntrin(MI, MRI, B, 2873 AMDGPUFunctionArgInfo::DISPATCH_ID); 2874 case Intrinsic::amdgcn_fdiv_fast: 2875 return legalizeFDIVFastIntrin(MI, MRI, B); 2876 case Intrinsic::amdgcn_is_shared: 2877 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 2878 case Intrinsic::amdgcn_is_private: 2879 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 2880 case Intrinsic::amdgcn_wavefrontsize: { 2881 B.setInstr(MI); 2882 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 2883 MI.eraseFromParent(); 2884 return true; 2885 } 2886 case Intrinsic::amdgcn_raw_buffer_store: 2887 case Intrinsic::amdgcn_struct_buffer_store: 2888 return legalizeBufferStore(MI, MRI, B, false, false); 2889 case Intrinsic::amdgcn_raw_buffer_store_format: 2890 case Intrinsic::amdgcn_struct_buffer_store_format: 2891 return legalizeBufferStore(MI, MRI, B, false, true); 2892 case Intrinsic::amdgcn_raw_tbuffer_store: 2893 case Intrinsic::amdgcn_struct_tbuffer_store: 2894 return legalizeBufferStore(MI, MRI, B, true, true); 2895 case Intrinsic::amdgcn_raw_buffer_load: 2896 case Intrinsic::amdgcn_struct_buffer_load: 2897 return legalizeBufferLoad(MI, MRI, B, false, false); 2898 case Intrinsic::amdgcn_raw_buffer_load_format: 2899 case Intrinsic::amdgcn_struct_buffer_load_format: 2900 return legalizeBufferLoad(MI, MRI, B, true, false); 2901 case Intrinsic::amdgcn_raw_tbuffer_load: 2902 case Intrinsic::amdgcn_struct_tbuffer_load: 2903 return legalizeBufferLoad(MI, MRI, B, true, true); 2904 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 2905 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 2906 case Intrinsic::amdgcn_raw_buffer_atomic_add: 2907 case 
Intrinsic::amdgcn_struct_buffer_atomic_add: 2908 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 2909 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 2910 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 2911 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 2912 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 2913 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 2914 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 2915 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 2916 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 2917 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 2918 case Intrinsic::amdgcn_raw_buffer_atomic_and: 2919 case Intrinsic::amdgcn_struct_buffer_atomic_and: 2920 case Intrinsic::amdgcn_raw_buffer_atomic_or: 2921 case Intrinsic::amdgcn_struct_buffer_atomic_or: 2922 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 2923 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 2924 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 2925 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 2926 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 2927 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 2928 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 2929 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 2930 return legalizeBufferAtomic(MI, B, IntrID); 2931 case Intrinsic::amdgcn_atomic_inc: 2932 return legalizeAtomicIncDec(MI, B, true); 2933 case Intrinsic::amdgcn_atomic_dec: 2934 return legalizeAtomicIncDec(MI, B, false); 2935 default: 2936 return true; 2937 } 2938 2939 return true; 2940 } 2941