1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #if defined(_MSC_VER) || defined(__MINGW32__) 15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI 16 // from the Visual C++ cmath / math.h headers: 17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 18 #define _USE_MATH_DEFINES 19 #endif 20 21 #include "AMDGPULegalizerInfo.h" 22 23 #include "AMDGPU.h" 24 #include "AMDGPUGlobalISelUtils.h" 25 #include "AMDGPUTargetMachine.h" 26 #include "SIMachineFunctionInfo.h" 27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 30 #include "llvm/CodeGen/TargetOpcodes.h" 31 #include "llvm/CodeGen/ValueTypes.h" 32 #include "llvm/IR/DerivedTypes.h" 33 #include "llvm/IR/DiagnosticInfo.h" 34 #include "llvm/IR/Type.h" 35 #include "llvm/Support/Debug.h" 36 37 #define DEBUG_TYPE "amdgpu-legalinfo" 38 39 using namespace llvm; 40 using namespace LegalizeActions; 41 using namespace LegalizeMutations; 42 using namespace LegalityPredicates; 43 using namespace MIPatternMatch; 44 45 static LegalityPredicate isMultiple32(unsigned TypeIdx, 46 unsigned MaxSize = 1024) { 47 return [=](const LegalityQuery &Query) { 48 const LLT Ty = Query.Types[TypeIdx]; 49 const LLT EltTy = Ty.getScalarType(); 50 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; 51 }; 52 } 53 54 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { 55 return [=](const LegalityQuery &Query) { 56 return Query.Types[TypeIdx].getSizeInBits() == Size; 57 }; 58 } 59 60 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 61 return [=](const LegalityQuery &Query) { 62 const LLT Ty = Query.Types[TypeIdx]; 63 return Ty.isVector() && 64 Ty.getNumElements() % 2 != 0 && 65 Ty.getElementType().getSizeInBits() < 32 && 66 Ty.getSizeInBits() % 32 != 0; 67 }; 68 } 69 70 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 71 return [=](const LegalityQuery &Query) { 72 const LLT Ty = Query.Types[TypeIdx]; 73 const LLT EltTy = Ty.getScalarType(); 74 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 75 }; 76 } 77 78 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 79 return [=](const LegalityQuery &Query) { 80 const LLT Ty = Query.Types[TypeIdx]; 81 const LLT EltTy = Ty.getElementType(); 82 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 83 }; 84 } 85 86 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 const LLT EltTy = Ty.getElementType(); 90 unsigned Size = Ty.getSizeInBits(); 91 unsigned Pieces = (Size + 63) / 64; 92 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 93 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 94 }; 95 } 96 97 // Increase the number of vector elements to reach the next 
multiple of 32-bit 98 // type. 99 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 100 return [=](const LegalityQuery &Query) { 101 const LLT Ty = Query.Types[TypeIdx]; 102 103 const LLT EltTy = Ty.getElementType(); 104 const int Size = Ty.getSizeInBits(); 105 const int EltSize = EltTy.getSizeInBits(); 106 const int NextMul32 = (Size + 31) / 32; 107 108 assert(EltSize < 32); 109 110 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 111 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 112 }; 113 } 114 115 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 116 return [=](const LegalityQuery &Query) { 117 const LLT QueryTy = Query.Types[TypeIdx]; 118 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 119 }; 120 } 121 122 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 123 return [=](const LegalityQuery &Query) { 124 const LLT QueryTy = Query.Types[TypeIdx]; 125 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 126 }; 127 } 128 129 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 130 return [=](const LegalityQuery &Query) { 131 const LLT QueryTy = Query.Types[TypeIdx]; 132 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 133 }; 134 } 135 136 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 137 // v2s16. 138 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 139 return [=](const LegalityQuery &Query) { 140 const LLT Ty = Query.Types[TypeIdx]; 141 if (Ty.isVector()) { 142 const int EltSize = Ty.getElementType().getSizeInBits(); 143 return EltSize == 32 || EltSize == 64 || 144 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 145 EltSize == 128 || EltSize == 256; 146 } 147 148 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 149 }; 150 } 151 152 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 153 return [=](const LegalityQuery &Query) { 154 return Query.Types[TypeIdx].getElementType() == Type; 155 }; 156 } 157 158 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 159 return [=](const LegalityQuery &Query) { 160 const LLT Ty = Query.Types[TypeIdx]; 161 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 162 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 163 }; 164 } 165 166 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 167 const GCNTargetMachine &TM) 168 : ST(ST_) { 169 using namespace TargetOpcode; 170 171 auto GetAddrSpacePtr = [&TM](unsigned AS) { 172 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 173 }; 174 175 const LLT S1 = LLT::scalar(1); 176 const LLT S8 = LLT::scalar(8); 177 const LLT S16 = LLT::scalar(16); 178 const LLT S32 = LLT::scalar(32); 179 const LLT S64 = LLT::scalar(64); 180 const LLT S96 = LLT::scalar(96); 181 const LLT S128 = LLT::scalar(128); 182 const LLT S256 = LLT::scalar(256); 183 const LLT S1024 = LLT::scalar(1024); 184 185 const LLT V2S16 = LLT::vector(2, 16); 186 const LLT V4S16 = LLT::vector(4, 16); 187 188 const LLT V2S32 = LLT::vector(2, 32); 189 const LLT V3S32 = LLT::vector(3, 32); 190 const LLT V4S32 = LLT::vector(4, 32); 191 const LLT V5S32 = LLT::vector(5, 32); 192 const LLT V6S32 = LLT::vector(6, 32); 193 const LLT V7S32 = LLT::vector(7, 32); 194 const LLT V8S32 = LLT::vector(8, 32); 195 const LLT V9S32 = LLT::vector(9, 32); 196 const LLT V10S32 = LLT::vector(10, 32); 197 const LLT V11S32 = LLT::vector(11, 32); 198 const LLT V12S32 = LLT::vector(12, 32); 199 const LLT V13S32 = 
LLT::vector(13, 32); 200 const LLT V14S32 = LLT::vector(14, 32); 201 const LLT V15S32 = LLT::vector(15, 32); 202 const LLT V16S32 = LLT::vector(16, 32); 203 const LLT V32S32 = LLT::vector(32, 32); 204 205 const LLT V2S64 = LLT::vector(2, 64); 206 const LLT V3S64 = LLT::vector(3, 64); 207 const LLT V4S64 = LLT::vector(4, 64); 208 const LLT V5S64 = LLT::vector(5, 64); 209 const LLT V6S64 = LLT::vector(6, 64); 210 const LLT V7S64 = LLT::vector(7, 64); 211 const LLT V8S64 = LLT::vector(8, 64); 212 const LLT V16S64 = LLT::vector(16, 64); 213 214 std::initializer_list<LLT> AllS32Vectors = 215 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 216 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 217 std::initializer_list<LLT> AllS64Vectors = 218 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 219 220 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 221 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 222 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 223 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 224 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 225 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 226 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 227 228 const LLT CodePtr = FlatPtr; 229 230 const std::initializer_list<LLT> AddrSpaces64 = { 231 GlobalPtr, ConstantPtr, FlatPtr 232 }; 233 234 const std::initializer_list<LLT> AddrSpaces32 = { 235 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 236 }; 237 238 const std::initializer_list<LLT> FPTypesBase = { 239 S32, S64 240 }; 241 242 const std::initializer_list<LLT> FPTypes16 = { 243 S32, S64, S16 244 }; 245 246 const std::initializer_list<LLT> FPTypesPK16 = { 247 S32, S64, S16, V2S16 248 }; 249 250 const LLT MinLegalScalarShiftTy = ST.has16BitInsts() ? S16 : S32; 251 252 setAction({G_BRCOND, S1}, Legal); // VCC branches 253 setAction({G_BRCOND, S32}, Legal); // SCC branches 254 255 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 256 // elements for v3s16 257 getActionDefinitionsBuilder(G_PHI) 258 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 259 .legalFor(AllS32Vectors) 260 .legalFor(AllS64Vectors) 261 .legalFor(AddrSpaces64) 262 .legalFor(AddrSpaces32) 263 .clampScalar(0, S32, S256) 264 .widenScalarToNextPow2(0, 32) 265 .clampMaxNumElements(0, S32, 16) 266 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 267 .legalIf(isPointer(0)); 268 269 if (ST.has16BitInsts()) { 270 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 271 .legalFor({S32, S16}) 272 .clampScalar(0, S16, S32) 273 .scalarize(0); 274 } else { 275 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 276 .legalFor({S32}) 277 .clampScalar(0, S32, S32) 278 .scalarize(0); 279 } 280 281 // FIXME: Not really legal. Placeholder for custom lowering. 282 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 283 .legalFor({S32, S64}) 284 .clampScalar(0, S32, S64) 285 .widenScalarToNextPow2(0, 32) 286 .scalarize(0); 287 288 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 289 .legalFor({S32}) 290 .clampScalar(0, S32, S32) 291 .scalarize(0); 292 293 // Report legal for any types we can handle anywhere. For the cases only legal 294 // on the SALU, RegBankSelect will be able to re-legalize. 
295 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 296 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 297 .clampScalar(0, S32, S64) 298 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 299 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 300 .widenScalarToNextPow2(0) 301 .scalarize(0); 302 303 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 304 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 305 .legalFor({{S32, S1}, {S32, S32}}) 306 .clampScalar(0, S32, S32) 307 .scalarize(0); // TODO: Implement. 308 309 getActionDefinitionsBuilder(G_BITCAST) 310 // Don't worry about the size constraint. 311 .legalIf(all(isRegisterType(0), isRegisterType(1))) 312 // FIXME: Testing hack 313 .legalForCartesianProduct({S16, LLT::vector(2, 8), }) 314 .lower(); 315 316 317 getActionDefinitionsBuilder(G_CONSTANT) 318 .legalFor({S1, S32, S64, S16, GlobalPtr, 319 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 320 .clampScalar(0, S32, S64) 321 .widenScalarToNextPow2(0) 322 .legalIf(isPointer(0)); 323 324 getActionDefinitionsBuilder(G_FCONSTANT) 325 .legalFor({S32, S64, S16}) 326 .clampScalar(0, S16, S64); 327 328 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 329 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 330 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 331 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 332 .clampScalarOrElt(0, S32, S1024) 333 .legalIf(isMultiple32(0)) 334 .widenScalarToNextPow2(0, 32) 335 .clampMaxNumElements(0, S32, 16); 336 337 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 338 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 339 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); 340 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 341 342 auto &FPOpActions = getActionDefinitionsBuilder( 343 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 344 .legalFor({S32, S64}); 345 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 346 .customFor({S32, S64}); 347 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 348 .customFor({S32, S64}); 349 350 if (ST.has16BitInsts()) { 351 if (ST.hasVOP3PInsts()) 352 FPOpActions.legalFor({S16, V2S16}); 353 else 354 FPOpActions.legalFor({S16}); 355 356 TrigActions.customFor({S16}); 357 FDIVActions.customFor({S16}); 358 } 359 360 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 361 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 362 363 if (ST.hasVOP3PInsts()) { 364 MinNumMaxNum.customFor(FPTypesPK16) 365 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 366 .clampMaxNumElements(0, S16, 2) 367 .clampScalar(0, S16, S64) 368 .scalarize(0); 369 } else if (ST.has16BitInsts()) { 370 MinNumMaxNum.customFor(FPTypes16) 371 .clampScalar(0, S16, S64) 372 .scalarize(0); 373 } else { 374 MinNumMaxNum.customFor(FPTypesBase) 375 .clampScalar(0, S32, S64) 376 .scalarize(0); 377 } 378 379 if (ST.hasVOP3PInsts()) 380 FPOpActions.clampMaxNumElements(0, S16, 2); 381 382 FPOpActions 383 .scalarize(0) 384 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 385 386 TrigActions 387 .scalarize(0) 388 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 389 390 FDIVActions 391 .scalarize(0) 392 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 393 394 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 395 .legalFor(FPTypesPK16) 396 .clampMaxNumElements(0, S16, 2) 397 .scalarize(0) 398 .clampScalar(0, S16, S64); 399 400 if (ST.has16BitInsts()) { 401 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 402 .legalFor({S32, S64, S16}) 403 .scalarize(0) 404 .clampScalar(0, S16, S64); 405 } else { 406 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 407 .legalFor({S32, S64}) 408 .scalarize(0) 409 .clampScalar(0, S32, S64); 410 } 411 412 getActionDefinitionsBuilder(G_FPTRUNC) 413 .legalFor({{S32, S64}, {S16, S32}}) 414 .scalarize(0); 415 416 getActionDefinitionsBuilder(G_FPEXT) 417 .legalFor({{S64, S32}, {S32, S16}}) 418 .lowerFor({{S64, S16}}) // FIXME: Implement 419 .scalarize(0); 420 421 getActionDefinitionsBuilder(G_FSUB) 422 // Use actual fsub instruction 423 .legalFor({S32}) 424 // Must use fadd + fneg 425 .lowerFor({S64, S16, V2S16}) 426 .scalarize(0) 427 .clampScalar(0, S32, S64); 428 429 // Whether this is legal depends on the floating point mode for the function. 430 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 431 if (ST.hasMadF16()) 432 FMad.customFor({S32, S16}); 433 else 434 FMad.customFor({S32}); 435 FMad.scalarize(0) 436 .lower(); 437 438 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 439 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 440 {S32, S1}, {S64, S1}, {S16, S1}, 441 {S96, S32}, 442 // FIXME: Hack 443 {S64, LLT::scalar(33)}, 444 {S32, S8}, {S32, LLT::scalar(24)}}) 445 .scalarize(0) 446 .clampScalar(0, S32, S64); 447 448 // TODO: Split s1->s64 during regbankselect for VALU. 449 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 450 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 451 .lowerFor({{S32, S64}}) 452 .lowerIf(typeIs(1, S1)) 453 .customFor({{S64, S64}}); 454 if (ST.has16BitInsts()) 455 IToFP.legalFor({{S16, S16}}); 456 IToFP.clampScalar(1, S32, S64) 457 .scalarize(0); 458 459 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 460 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}); 461 if (ST.has16BitInsts()) 462 FPToI.legalFor({{S16, S16}}); 463 else 464 FPToI.minScalar(1, S32); 465 466 FPToI.minScalar(0, S32) 467 .scalarize(0); 468 469 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 470 .scalarize(0) 471 .lower(); 472 473 if (ST.has16BitInsts()) { 474 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 475 .legalFor({S16, S32, S64}) 476 .clampScalar(0, S16, S64) 477 .scalarize(0); 478 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 479 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 480 .legalFor({S32, S64}) 481 .clampScalar(0, S32, S64) 482 .scalarize(0); 483 } else { 484 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 485 .legalFor({S32}) 486 .customFor({S64}) 487 .clampScalar(0, S32, S64) 488 .scalarize(0); 489 } 490 491 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 492 .scalarize(0) 493 .alwaysLegal(); 494 495 auto &CmpBuilder = 496 getActionDefinitionsBuilder(G_ICMP) 497 // The compare output type differs based on the register bank of the output, 498 // so make both s1 and s32 legal. 499 // 500 // Scalar compares producing output in scc will be promoted to s32, as that 501 // is the allocatable register type that will be needed for the copy from 502 // scc. This will be promoted during RegBankSelect, and we assume something 503 // before that won't try to use s32 result types. 504 // 505 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 506 // bank. 
507 .legalForCartesianProduct( 508 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 509 .legalForCartesianProduct( 510 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 511 if (ST.has16BitInsts()) { 512 CmpBuilder.legalFor({{S1, S16}}); 513 } 514 515 CmpBuilder 516 .widenScalarToNextPow2(1) 517 .clampScalar(1, S32, S64) 518 .scalarize(0) 519 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 520 521 getActionDefinitionsBuilder(G_FCMP) 522 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 523 .widenScalarToNextPow2(1) 524 .clampScalar(1, S32, S64) 525 .scalarize(0); 526 527 // FIXME: fexp, flog2, flog10 needs to be custom lowered. 528 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2, 529 G_FLOG, G_FLOG2, G_FLOG10}) 530 .legalFor({S32}) 531 .scalarize(0); 532 533 // The 64-bit versions produce 32-bit results, but only on the SALU. 534 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, 535 G_CTTZ, G_CTTZ_ZERO_UNDEF, 536 G_CTPOP}) 537 .legalFor({{S32, S32}, {S32, S64}}) 538 .clampScalar(0, S32, S32) 539 .clampScalar(1, S32, S64) 540 .scalarize(0) 541 .widenScalarToNextPow2(0, 32) 542 .widenScalarToNextPow2(1, 32); 543 544 // TODO: Expand for > s32 545 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) 546 .legalFor({S32}) 547 .clampScalar(0, S32, S32) 548 .scalarize(0); 549 550 if (ST.has16BitInsts()) { 551 if (ST.hasVOP3PInsts()) { 552 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 553 .legalFor({S32, S16, V2S16}) 554 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 555 .clampMaxNumElements(0, S16, 2) 556 .clampScalar(0, S16, S32) 557 .widenScalarToNextPow2(0) 558 .scalarize(0); 559 } else { 560 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 561 .legalFor({S32, S16}) 562 .widenScalarToNextPow2(0) 563 .clampScalar(0, S16, S32) 564 .scalarize(0); 565 } 566 } else { 567 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 568 .legalFor({S32}) 569 .clampScalar(0, S32, S32) 570 .widenScalarToNextPow2(0) 571 .scalarize(0); 572 } 573 574 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 575 return [=](const LegalityQuery &Query) { 576 return Query.Types[TypeIdx0].getSizeInBits() < 577 Query.Types[TypeIdx1].getSizeInBits(); 578 }; 579 }; 580 581 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 582 return [=](const LegalityQuery &Query) { 583 return Query.Types[TypeIdx0].getSizeInBits() > 584 Query.Types[TypeIdx1].getSizeInBits(); 585 }; 586 }; 587 588 getActionDefinitionsBuilder(G_INTTOPTR) 589 // List the common cases 590 .legalForCartesianProduct(AddrSpaces64, {S64}) 591 .legalForCartesianProduct(AddrSpaces32, {S32}) 592 .scalarize(0) 593 // Accept any address space as long as the size matches 594 .legalIf(sameSize(0, 1)) 595 .widenScalarIf(smallerThan(1, 0), 596 [](const LegalityQuery &Query) { 597 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 598 }) 599 .narrowScalarIf(greaterThan(1, 0), 600 [](const LegalityQuery &Query) { 601 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 602 }); 603 604 getActionDefinitionsBuilder(G_PTRTOINT) 605 // List the common cases 606 .legalForCartesianProduct(AddrSpaces64, {S64}) 607 .legalForCartesianProduct(AddrSpaces32, {S32}) 608 .scalarize(0) 609 // Accept any address space as long as the size matches 610 .legalIf(sameSize(0, 1)) 611 .widenScalarIf(smallerThan(0, 1), 612 [](const LegalityQuery &Query) { 613 return std::make_pair(0, 
LLT::scalar(Query.Types[1].getSizeInBits())); 614 }) 615 .narrowScalarIf( 616 greaterThan(0, 1), 617 [](const LegalityQuery &Query) { 618 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 619 }); 620 621 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 622 .scalarize(0) 623 .custom(); 624 625 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 626 // handle some operations by just promoting the register during 627 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 628 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { 629 switch (AS) { 630 // FIXME: Private element size. 631 case AMDGPUAS::PRIVATE_ADDRESS: 632 return 32; 633 // FIXME: Check subtarget 634 case AMDGPUAS::LOCAL_ADDRESS: 635 return ST.useDS128() ? 128 : 64; 636 637 // Treat constant and global as identical. SMRD loads are sometimes usable 638 // for global loads (ideally constant address space should be eliminated) 639 // depending on the context. Legality cannot be context dependent, but 640 // RegBankSelect can split the load as necessary depending on the pointer 641 // register bank/uniformity and if the memory is invariant or not written in 642 // a kernel. 643 case AMDGPUAS::CONSTANT_ADDRESS: 644 case AMDGPUAS::GLOBAL_ADDRESS: 645 return 512; 646 default: 647 return 128; 648 } 649 }; 650 651 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { 652 const LLT DstTy = Query.Types[0]; 653 654 // Split vector extloads. 655 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 656 unsigned Align = Query.MMODescrs[0].AlignInBits; 657 658 if (MemSize < DstTy.getSizeInBits()) 659 MemSize = std::max(MemSize, Align); 660 661 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 662 return true; 663 664 const LLT PtrTy = Query.Types[1]; 665 unsigned AS = PtrTy.getAddressSpace(); 666 if (MemSize > maxSizeForAddrSpace(AS)) 667 return true; 668 669 // Catch weird sized loads that don't evenly divide into the access sizes 670 // TODO: May be able to widen depending on alignment etc. 671 unsigned NumRegs = MemSize / 32; 672 if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) 673 return true; 674 675 if (Align < MemSize) { 676 const SITargetLowering *TLI = ST.getTargetLowering(); 677 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 678 } 679 680 return false; 681 }; 682 683 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 684 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 685 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 686 687 // TODO: Refine based on subtargets which support unaligned access or 128-bit 688 // LDS 689 // TODO: Unsupported flat for SI. 690 691 for (unsigned Op : {G_LOAD, G_STORE}) { 692 const bool IsStore = Op == G_STORE; 693 694 auto &Actions = getActionDefinitionsBuilder(Op); 695 // Whitelist the common cases. 
696 // TODO: Pointer loads 697 // TODO: Wide constant loads 698 // TODO: Only CI+ has 3x loads 699 // TODO: Loads to s16 on gfx9 700 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 701 {V2S32, GlobalPtr, 64, GlobalAlign32}, 702 {V3S32, GlobalPtr, 96, GlobalAlign32}, 703 {S96, GlobalPtr, 96, GlobalAlign32}, 704 {V4S32, GlobalPtr, 128, GlobalAlign32}, 705 {S128, GlobalPtr, 128, GlobalAlign32}, 706 {S64, GlobalPtr, 64, GlobalAlign32}, 707 {V2S64, GlobalPtr, 128, GlobalAlign32}, 708 {V2S16, GlobalPtr, 32, GlobalAlign32}, 709 {S32, GlobalPtr, 8, GlobalAlign8}, 710 {S32, GlobalPtr, 16, GlobalAlign16}, 711 712 {S32, LocalPtr, 32, 32}, 713 {S64, LocalPtr, 64, 32}, 714 {V2S32, LocalPtr, 64, 32}, 715 {S32, LocalPtr, 8, 8}, 716 {S32, LocalPtr, 16, 16}, 717 {V2S16, LocalPtr, 32, 32}, 718 719 {S32, PrivatePtr, 32, 32}, 720 {S32, PrivatePtr, 8, 8}, 721 {S32, PrivatePtr, 16, 16}, 722 {V2S16, PrivatePtr, 32, 32}, 723 724 {S32, FlatPtr, 32, GlobalAlign32}, 725 {S32, FlatPtr, 16, GlobalAlign16}, 726 {S32, FlatPtr, 8, GlobalAlign8}, 727 {V2S16, FlatPtr, 32, GlobalAlign32}, 728 729 {S32, ConstantPtr, 32, GlobalAlign32}, 730 {V2S32, ConstantPtr, 64, GlobalAlign32}, 731 {V3S32, ConstantPtr, 96, GlobalAlign32}, 732 {V4S32, ConstantPtr, 128, GlobalAlign32}, 733 {S64, ConstantPtr, 64, GlobalAlign32}, 734 {S128, ConstantPtr, 128, GlobalAlign32}, 735 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 736 Actions 737 .customIf(typeIs(1, Constant32Ptr)) 738 .narrowScalarIf( 739 [=](const LegalityQuery &Query) -> bool { 740 return !Query.Types[0].isVector() && needToSplitLoad(Query); 741 }, 742 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 743 const LLT DstTy = Query.Types[0]; 744 const LLT PtrTy = Query.Types[1]; 745 746 const unsigned DstSize = DstTy.getSizeInBits(); 747 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 748 749 // Split extloads. 750 if (DstSize > MemSize) 751 return std::make_pair(0, LLT::scalar(MemSize)); 752 753 if (DstSize > 32 && (DstSize % 32 != 0)) { 754 // FIXME: Need a way to specify non-extload of larger size if 755 // suitably aligned. 756 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 757 } 758 759 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 760 if (MemSize > MaxSize) 761 return std::make_pair(0, LLT::scalar(MaxSize)); 762 763 unsigned Align = Query.MMODescrs[0].AlignInBits; 764 return std::make_pair(0, LLT::scalar(Align)); 765 }) 766 .fewerElementsIf( 767 [=](const LegalityQuery &Query) -> bool { 768 return Query.Types[0].isVector() && needToSplitLoad(Query); 769 }, 770 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 771 const LLT DstTy = Query.Types[0]; 772 const LLT PtrTy = Query.Types[1]; 773 774 LLT EltTy = DstTy.getElementType(); 775 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 776 777 // Split if it's too large for the address space. 778 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 779 unsigned NumElts = DstTy.getNumElements(); 780 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 781 782 // FIXME: Refine when odd breakdowns handled 783 // The scalars will need to be re-legalized. 784 if (NumPieces == 1 || NumPieces >= NumElts || 785 NumElts % NumPieces != 0) 786 return std::make_pair(0, EltTy); 787 788 return std::make_pair(0, 789 LLT::vector(NumElts / NumPieces, EltTy)); 790 } 791 792 // Need to split because of alignment. 
793 unsigned Align = Query.MMODescrs[0].AlignInBits; 794 unsigned EltSize = EltTy.getSizeInBits(); 795 if (EltSize > Align && 796 (EltSize / Align < DstTy.getNumElements())) { 797 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 798 } 799 800 // May need relegalization for the scalars. 801 return std::make_pair(0, EltTy); 802 }) 803 .minScalar(0, S32); 804 805 if (IsStore) 806 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 807 808 // TODO: Need a bitcast lower option? 809 Actions 810 .legalIf([=](const LegalityQuery &Query) { 811 const LLT Ty0 = Query.Types[0]; 812 unsigned Size = Ty0.getSizeInBits(); 813 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 814 unsigned Align = Query.MMODescrs[0].AlignInBits; 815 816 // FIXME: Widening store from alignment not valid. 817 if (MemSize < Size) 818 MemSize = std::max(MemSize, Align); 819 820 // No extending vector loads. 821 if (Size > MemSize && Ty0.isVector()) 822 return false; 823 824 switch (MemSize) { 825 case 8: 826 case 16: 827 return Size == 32; 828 case 32: 829 case 64: 830 case 128: 831 return true; 832 case 96: 833 return ST.hasDwordx3LoadStores(); 834 case 256: 835 case 512: 836 return true; 837 default: 838 return false; 839 } 840 }) 841 .widenScalarToNextPow2(0) 842 // TODO: v3s32->v4s32 with alignment 843 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 844 } 845 846 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 847 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 848 {S32, GlobalPtr, 16, 2 * 8}, 849 {S32, LocalPtr, 8, 8}, 850 {S32, LocalPtr, 16, 16}, 851 {S32, PrivatePtr, 8, 8}, 852 {S32, PrivatePtr, 16, 16}, 853 {S32, ConstantPtr, 8, 8}, 854 {S32, ConstantPtr, 16, 2 * 8}}); 855 if (ST.hasFlatAddressSpace()) { 856 ExtLoads.legalForTypesWithMemDesc( 857 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 858 } 859 860 ExtLoads.clampScalar(0, S32, S32) 861 .widenScalarToNextPow2(0) 862 .unsupportedIfMemSizeNotPow2() 863 .lower(); 864 865 auto &Atomics = getActionDefinitionsBuilder( 866 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 867 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 868 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 869 G_ATOMICRMW_UMIN}) 870 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 871 {S64, GlobalPtr}, {S64, LocalPtr}}); 872 if (ST.hasFlatAddressSpace()) { 873 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 874 } 875 876 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 877 .legalFor({{S32, LocalPtr}}); 878 879 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 880 // demarshalling 881 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 882 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 883 {S32, FlatPtr}, {S64, FlatPtr}}) 884 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 885 {S32, RegionPtr}, {S64, RegionPtr}}); 886 // TODO: Pointer types, any 32-bit or 64-bit vector 887 888 // Condition should be s32 for scalar, s1 for vector. 
889 getActionDefinitionsBuilder(G_SELECT) 890 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 891 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 892 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 893 .clampScalar(0, S16, S64) 894 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 895 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 896 .scalarize(1) 897 .clampMaxNumElements(0, S32, 2) 898 .clampMaxNumElements(0, LocalPtr, 2) 899 .clampMaxNumElements(0, PrivatePtr, 2) 900 .scalarize(0) 901 .widenScalarToNextPow2(0) 902 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 903 904 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 905 // be more flexible with the shift amount type. 906 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 907 .legalFor({{S32, S32}, {S64, S32}}); 908 if (ST.has16BitInsts()) { 909 if (ST.hasVOP3PInsts()) { 910 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 911 .clampMaxNumElements(0, S16, 2); 912 } else 913 Shifts.legalFor({{S16, S32}, {S16, S16}}); 914 915 // TODO: Support 16-bit shift amounts 916 Shifts.clampScalar(1, S32, S32); 917 Shifts.clampScalar(0, S16, S64); 918 Shifts.widenScalarToNextPow2(0, 16); 919 } else { 920 // Make sure we legalize the shift amount type first, as the general 921 // expansion for the shifted type will produce much worse code if it hasn't 922 // been truncated already. 923 Shifts.clampScalar(1, S32, S32); 924 Shifts.clampScalar(0, S32, S64); 925 Shifts.widenScalarToNextPow2(0, 32); 926 } 927 Shifts.scalarize(0); 928 929 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 930 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 931 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 932 unsigned IdxTypeIdx = 2; 933 934 getActionDefinitionsBuilder(Op) 935 .customIf([=](const LegalityQuery &Query) { 936 const LLT EltTy = Query.Types[EltTypeIdx]; 937 const LLT VecTy = Query.Types[VecTypeIdx]; 938 const LLT IdxTy = Query.Types[IdxTypeIdx]; 939 return (EltTy.getSizeInBits() == 16 || 940 EltTy.getSizeInBits() % 32 == 0) && 941 VecTy.getSizeInBits() % 32 == 0 && 942 VecTy.getSizeInBits() <= 1024 && 943 IdxTy.getSizeInBits() == 32; 944 }) 945 .clampScalar(EltTypeIdx, S32, S64) 946 .clampScalar(VecTypeIdx, S32, S64) 947 .clampScalar(IdxTypeIdx, S32, S32); 948 } 949 950 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 951 .unsupportedIf([=](const LegalityQuery &Query) { 952 const LLT &EltTy = Query.Types[1].getElementType(); 953 return Query.Types[0] != EltTy; 954 }); 955 956 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 957 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 958 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 959 960 // FIXME: Doesn't handle extract of illegal sizes. 961 getActionDefinitionsBuilder(Op) 962 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 963 // FIXME: Multiples of 16 should not be legal. 
964 .legalIf([=](const LegalityQuery &Query) { 965 const LLT BigTy = Query.Types[BigTyIdx]; 966 const LLT LitTy = Query.Types[LitTyIdx]; 967 return (BigTy.getSizeInBits() % 32 == 0) && 968 (LitTy.getSizeInBits() % 16 == 0); 969 }) 970 .widenScalarIf( 971 [=](const LegalityQuery &Query) { 972 const LLT BigTy = Query.Types[BigTyIdx]; 973 return (BigTy.getScalarSizeInBits() < 16); 974 }, 975 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 976 .widenScalarIf( 977 [=](const LegalityQuery &Query) { 978 const LLT LitTy = Query.Types[LitTyIdx]; 979 return (LitTy.getScalarSizeInBits() < 16); 980 }, 981 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 982 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 983 .widenScalarToNextPow2(BigTyIdx, 32); 984 985 } 986 987 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 988 .legalForCartesianProduct(AllS32Vectors, {S32}) 989 .legalForCartesianProduct(AllS64Vectors, {S64}) 990 .clampNumElements(0, V16S32, V32S32) 991 .clampNumElements(0, V2S64, V16S64) 992 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 993 994 if (ST.hasScalarPackInsts()) 995 BuildVector.legalFor({V2S16, S32}); 996 997 BuildVector 998 .minScalarSameAs(1, 0) 999 .legalIf(isRegisterType(0)) 1000 .minScalarOrElt(0, S32); 1001 1002 if (ST.hasScalarPackInsts()) { 1003 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1004 .legalFor({V2S16, S32}) 1005 .lower(); 1006 } else { 1007 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1008 .lower(); 1009 } 1010 1011 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1012 .legalIf(isRegisterType(0)); 1013 1014 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 1015 // pre-legalize. 1016 if (ST.hasVOP3PInsts()) { 1017 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1018 .customFor({V2S16, V2S16}) 1019 .lower(); 1020 } else 1021 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1022 1023 // Merge/Unmerge 1024 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1025 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1026 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1027 1028 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1029 const LLT &Ty = Query.Types[TypeIdx]; 1030 if (Ty.isVector()) { 1031 const LLT &EltTy = Ty.getElementType(); 1032 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 1033 return true; 1034 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1035 return true; 1036 } 1037 return false; 1038 }; 1039 1040 auto &Builder = getActionDefinitionsBuilder(Op) 1041 // Try to widen to s16 first for small types. 1042 // TODO: Only do this on targets with legal s16 shifts 1043 .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1044 1045 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1046 .lowerFor({{S16, V2S16}}) 1047 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1048 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1049 elementTypeIs(1, S16)), 1050 changeTo(1, V2S16)) 1051 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1052 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1053 // valid. 
1054 .clampScalar(LitTyIdx, S32, S256) 1055 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1056 // Break up vectors with weird elements into scalars 1057 .fewerElementsIf( 1058 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 1059 scalarize(0)) 1060 .fewerElementsIf( 1061 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 1062 scalarize(1)) 1063 .clampScalar(BigTyIdx, S32, S1024); 1064 1065 if (Op == G_MERGE_VALUES) { 1066 Builder.widenScalarIf( 1067 // TODO: Use 16-bit shifts if legal for 8-bit values? 1068 [=](const LegalityQuery &Query) { 1069 const LLT Ty = Query.Types[LitTyIdx]; 1070 return Ty.getSizeInBits() < 32; 1071 }, 1072 changeTo(LitTyIdx, S32)); 1073 } 1074 1075 Builder.widenScalarIf( 1076 [=](const LegalityQuery &Query) { 1077 const LLT Ty = Query.Types[BigTyIdx]; 1078 return !isPowerOf2_32(Ty.getSizeInBits()) && 1079 Ty.getSizeInBits() % 16 != 0; 1080 }, 1081 [=](const LegalityQuery &Query) { 1082 // Pick the next power of 2, or a multiple of 64 over 128. 1083 // Whichever is smaller. 1084 const LLT &Ty = Query.Types[BigTyIdx]; 1085 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1086 if (NewSizeInBits >= 256) { 1087 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1088 if (RoundedTo < NewSizeInBits) 1089 NewSizeInBits = RoundedTo; 1090 } 1091 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1092 }) 1093 .legalIf([=](const LegalityQuery &Query) { 1094 const LLT &BigTy = Query.Types[BigTyIdx]; 1095 const LLT &LitTy = Query.Types[LitTyIdx]; 1096 1097 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1098 return false; 1099 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1100 return false; 1101 1102 return BigTy.getSizeInBits() % 16 == 0 && 1103 LitTy.getSizeInBits() % 16 == 0 && 1104 BigTy.getSizeInBits() <= 1024; 1105 }) 1106 // Any vectors left are the wrong size. Scalarize them. 1107 .scalarize(0) 1108 .scalarize(1); 1109 } 1110 1111 // TODO: Make legal for s32, s64. s64 case needs break down in regbankselect. 
1112 getActionDefinitionsBuilder(G_SEXT_INREG) 1113 .clampScalar(0, MinLegalScalarShiftTy, S64) 1114 .lower(); 1115 1116 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1117 .legalFor({S64}); 1118 1119 getActionDefinitionsBuilder({ 1120 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1121 G_FCOPYSIGN, 1122 1123 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1124 G_READ_REGISTER, 1125 G_WRITE_REGISTER, 1126 1127 G_SADDO, G_SSUBO, 1128 1129 // TODO: Implement 1130 G_FMINIMUM, G_FMAXIMUM 1131 }).lower(); 1132 1133 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1134 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1135 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1136 .unsupported(); 1137 1138 computeTables(); 1139 verify(*ST.getInstrInfo()); 1140 } 1141 1142 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1143 MachineRegisterInfo &MRI, 1144 MachineIRBuilder &B, 1145 GISelChangeObserver &Observer) const { 1146 switch (MI.getOpcode()) { 1147 case TargetOpcode::G_ADDRSPACE_CAST: 1148 return legalizeAddrSpaceCast(MI, MRI, B); 1149 case TargetOpcode::G_FRINT: 1150 return legalizeFrint(MI, MRI, B); 1151 case TargetOpcode::G_FCEIL: 1152 return legalizeFceil(MI, MRI, B); 1153 case TargetOpcode::G_INTRINSIC_TRUNC: 1154 return legalizeIntrinsicTrunc(MI, MRI, B); 1155 case TargetOpcode::G_SITOFP: 1156 return legalizeITOFP(MI, MRI, B, true); 1157 case TargetOpcode::G_UITOFP: 1158 return legalizeITOFP(MI, MRI, B, false); 1159 case TargetOpcode::G_FMINNUM: 1160 case TargetOpcode::G_FMAXNUM: 1161 case TargetOpcode::G_FMINNUM_IEEE: 1162 case TargetOpcode::G_FMAXNUM_IEEE: 1163 return legalizeMinNumMaxNum(MI, MRI, B); 1164 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1165 return legalizeExtractVectorElt(MI, MRI, B); 1166 case TargetOpcode::G_INSERT_VECTOR_ELT: 1167 return legalizeInsertVectorElt(MI, MRI, B); 1168 case TargetOpcode::G_SHUFFLE_VECTOR: 1169 return legalizeShuffleVector(MI, MRI, B); 1170 case TargetOpcode::G_FSIN: 1171 case TargetOpcode::G_FCOS: 1172 return legalizeSinCos(MI, MRI, B); 1173 case TargetOpcode::G_GLOBAL_VALUE: 1174 return legalizeGlobalValue(MI, MRI, B); 1175 case TargetOpcode::G_LOAD: 1176 return legalizeLoad(MI, MRI, B, Observer); 1177 case TargetOpcode::G_FMAD: 1178 return legalizeFMad(MI, MRI, B); 1179 case TargetOpcode::G_FDIV: 1180 return legalizeFDIV(MI, MRI, B); 1181 case TargetOpcode::G_ATOMIC_CMPXCHG: 1182 return legalizeAtomicCmpXChg(MI, MRI, B); 1183 default: 1184 return false; 1185 } 1186 1187 llvm_unreachable("expected switch to return"); 1188 } 1189 1190 Register AMDGPULegalizerInfo::getSegmentAperture( 1191 unsigned AS, 1192 MachineRegisterInfo &MRI, 1193 MachineIRBuilder &B) const { 1194 MachineFunction &MF = B.getMF(); 1195 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1196 const LLT S32 = LLT::scalar(32); 1197 1198 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1199 1200 if (ST.hasApertureRegs()) { 1201 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1202 // getreg. 1203 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1204 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1205 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1206 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 
1207 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1208 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1209 unsigned Encoding = 1210 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1211 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1212 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1213 1214 Register ApertureReg = MRI.createGenericVirtualRegister(S32); 1215 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1216 1217 B.buildInstr(AMDGPU::S_GETREG_B32) 1218 .addDef(GetReg) 1219 .addImm(Encoding); 1220 MRI.setType(GetReg, S32); 1221 1222 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1223 B.buildInstr(TargetOpcode::G_SHL) 1224 .addDef(ApertureReg) 1225 .addUse(GetReg) 1226 .addUse(ShiftAmt.getReg(0)); 1227 1228 return ApertureReg; 1229 } 1230 1231 Register QueuePtr = MRI.createGenericVirtualRegister( 1232 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1233 1234 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1235 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1236 return Register(); 1237 1238 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1239 // private_segment_aperture_base_hi. 1240 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1241 1242 // TODO: can we be smarter about machine pointer info? 1243 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1244 MachineMemOperand *MMO = MF.getMachineMemOperand( 1245 PtrInfo, 1246 MachineMemOperand::MOLoad | 1247 MachineMemOperand::MODereferenceable | 1248 MachineMemOperand::MOInvariant, 1249 4, 1250 MinAlign(64, StructOffset)); 1251 1252 Register LoadResult = MRI.createGenericVirtualRegister(S32); 1253 Register LoadAddr; 1254 1255 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1256 B.buildLoad(LoadResult, LoadAddr, *MMO); 1257 return LoadResult; 1258 } 1259 1260 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1261 MachineInstr &MI, MachineRegisterInfo &MRI, 1262 MachineIRBuilder &B) const { 1263 MachineFunction &MF = B.getMF(); 1264 1265 B.setInstr(MI); 1266 1267 const LLT S32 = LLT::scalar(32); 1268 Register Dst = MI.getOperand(0).getReg(); 1269 Register Src = MI.getOperand(1).getReg(); 1270 1271 LLT DstTy = MRI.getType(Dst); 1272 LLT SrcTy = MRI.getType(Src); 1273 unsigned DestAS = DstTy.getAddressSpace(); 1274 unsigned SrcAS = SrcTy.getAddressSpace(); 1275 1276 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1277 // vector element. 1278 assert(!DstTy.isVector()); 1279 1280 const AMDGPUTargetMachine &TM 1281 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1282 1283 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1284 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1285 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1286 return true; 1287 } 1288 1289 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1290 // Truncate. 1291 B.buildExtract(Dst, Src, 0); 1292 MI.eraseFromParent(); 1293 return true; 1294 } 1295 1296 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1297 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1298 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1299 1300 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1301 // another. Merge operands are required to be the same type, but creating an 1302 // extra ptrtoint would be kind of pointless. 
1303 auto HighAddr = B.buildConstant( 1304 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1305 B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); 1306 MI.eraseFromParent(); 1307 return true; 1308 } 1309 1310 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1311 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1312 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1313 unsigned NullVal = TM.getNullPointerValue(DestAS); 1314 1315 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1316 auto FlatNull = B.buildConstant(SrcTy, 0); 1317 1318 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); 1319 1320 // Extract low 32-bits of the pointer. 1321 B.buildExtract(PtrLo32, Src, 0); 1322 1323 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1324 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); 1325 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1326 1327 MI.eraseFromParent(); 1328 return true; 1329 } 1330 1331 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1332 return false; 1333 1334 if (!ST.hasFlatAddressSpace()) 1335 return false; 1336 1337 auto SegmentNull = 1338 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1339 auto FlatNull = 1340 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1341 1342 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1343 if (!ApertureReg.isValid()) 1344 return false; 1345 1346 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1347 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); 1348 1349 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); 1350 1351 // Coerce the type of the low half of the result so we can use merge_values. 1352 Register SrcAsInt = MRI.createGenericVirtualRegister(S32); 1353 B.buildInstr(TargetOpcode::G_PTRTOINT) 1354 .addDef(SrcAsInt) 1355 .addUse(Src); 1356 1357 // TODO: Should we allow mismatched types but matching sizes in merges to 1358 // avoid the ptrtoint? 1359 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); 1360 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); 1361 1362 MI.eraseFromParent(); 1363 return true; 1364 } 1365 1366 bool AMDGPULegalizerInfo::legalizeFrint( 1367 MachineInstr &MI, MachineRegisterInfo &MRI, 1368 MachineIRBuilder &B) const { 1369 B.setInstr(MI); 1370 1371 Register Src = MI.getOperand(1).getReg(); 1372 LLT Ty = MRI.getType(Src); 1373 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1374 1375 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1376 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1377 1378 auto C1 = B.buildFConstant(Ty, C1Val); 1379 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1380 1381 // TODO: Should this propagate fast-math-flags? 
1382 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1383 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1384 1385 auto C2 = B.buildFConstant(Ty, C2Val); 1386 auto Fabs = B.buildFAbs(Ty, Src); 1387 1388 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1389 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1390 return true; 1391 } 1392 1393 bool AMDGPULegalizerInfo::legalizeFceil( 1394 MachineInstr &MI, MachineRegisterInfo &MRI, 1395 MachineIRBuilder &B) const { 1396 B.setInstr(MI); 1397 1398 const LLT S1 = LLT::scalar(1); 1399 const LLT S64 = LLT::scalar(64); 1400 1401 Register Src = MI.getOperand(1).getReg(); 1402 assert(MRI.getType(Src) == S64); 1403 1404 // result = trunc(src) 1405 // if (src > 0.0 && src != result) 1406 // result += 1.0 1407 1408 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); 1409 1410 const auto Zero = B.buildFConstant(S64, 0.0); 1411 const auto One = B.buildFConstant(S64, 1.0); 1412 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1413 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1414 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1415 auto Add = B.buildSelect(S64, And, One, Zero); 1416 1417 // TODO: Should this propagate fast-math-flags? 1418 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1419 return true; 1420 } 1421 1422 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1423 MachineIRBuilder &B) { 1424 const unsigned FractBits = 52; 1425 const unsigned ExpBits = 11; 1426 LLT S32 = LLT::scalar(32); 1427 1428 auto Const0 = B.buildConstant(S32, FractBits - 32); 1429 auto Const1 = B.buildConstant(S32, ExpBits); 1430 1431 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1432 .addUse(Const0.getReg(0)) 1433 .addUse(Const1.getReg(0)); 1434 1435 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1436 } 1437 1438 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1439 MachineInstr &MI, MachineRegisterInfo &MRI, 1440 MachineIRBuilder &B) const { 1441 B.setInstr(MI); 1442 1443 const LLT S1 = LLT::scalar(1); 1444 const LLT S32 = LLT::scalar(32); 1445 const LLT S64 = LLT::scalar(64); 1446 1447 Register Src = MI.getOperand(1).getReg(); 1448 assert(MRI.getType(Src) == S64); 1449 1450 // TODO: Should this use extract since the low half is unused? 1451 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1452 Register Hi = Unmerge.getReg(1); 1453 1454 // Extract the upper half, since this is where we will find the sign and 1455 // exponent. 1456 auto Exp = extractF64Exponent(Hi, B); 1457 1458 const unsigned FractBits = 52; 1459 1460 // Extract the sign bit. 1461 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1462 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1463 1464 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1465 1466 const auto Zero32 = B.buildConstant(S32, 0); 1467 1468 // Extend back to 64-bits. 
1469 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); 1470 1471 auto Shr = B.buildAShr(S64, FractMask, Exp); 1472 auto Not = B.buildNot(S64, Shr); 1473 auto Tmp0 = B.buildAnd(S64, Src, Not); 1474 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1475 1476 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1477 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1478 1479 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1480 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1481 return true; 1482 } 1483 1484 bool AMDGPULegalizerInfo::legalizeITOFP( 1485 MachineInstr &MI, MachineRegisterInfo &MRI, 1486 MachineIRBuilder &B, bool Signed) const { 1487 B.setInstr(MI); 1488 1489 Register Dst = MI.getOperand(0).getReg(); 1490 Register Src = MI.getOperand(1).getReg(); 1491 1492 const LLT S64 = LLT::scalar(64); 1493 const LLT S32 = LLT::scalar(32); 1494 1495 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1496 1497 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1498 1499 auto CvtHi = Signed ? 1500 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1501 B.buildUITOFP(S64, Unmerge.getReg(1)); 1502 1503 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1504 1505 auto ThirtyTwo = B.buildConstant(S32, 32); 1506 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1507 .addUse(CvtHi.getReg(0)) 1508 .addUse(ThirtyTwo.getReg(0)); 1509 1510 // TODO: Should this propagate fast-math-flags? 1511 B.buildFAdd(Dst, LdExp, CvtLo); 1512 MI.eraseFromParent(); 1513 return true; 1514 } 1515 1516 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1517 MachineInstr &MI, MachineRegisterInfo &MRI, 1518 MachineIRBuilder &B) const { 1519 MachineFunction &MF = B.getMF(); 1520 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1521 1522 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1523 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1524 1525 // With ieee_mode disabled, the instructions have the correct behavior 1526 // already for G_FMINNUM/G_FMAXNUM 1527 if (!MFI->getMode().IEEE) 1528 return !IsIEEEOp; 1529 1530 if (IsIEEEOp) 1531 return true; 1532 1533 MachineIRBuilder HelperBuilder(MI); 1534 GISelObserverWrapper DummyObserver; 1535 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1536 HelperBuilder.setInstr(MI); 1537 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1538 } 1539 1540 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1541 MachineInstr &MI, MachineRegisterInfo &MRI, 1542 MachineIRBuilder &B) const { 1543 // TODO: Should move some of this into LegalizerHelper. 1544 1545 // TODO: Promote dynamic indexing of s16 to s32 1546 // TODO: Dynamic s64 indexing is only legal for SGPR. 1547 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 1548 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1549 return true; 1550 1551 Register Dst = MI.getOperand(0).getReg(); 1552 Register Vec = MI.getOperand(1).getReg(); 1553 1554 LLT VecTy = MRI.getType(Vec); 1555 LLT EltTy = VecTy.getElementType(); 1556 assert(EltTy == MRI.getType(Dst)); 1557 1558 B.setInstr(MI); 1559 1560 if (IdxVal.getValue() < VecTy.getNumElements()) 1561 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); 1562 else 1563 B.buildUndef(Dst); 1564 1565 MI.eraseFromParent(); 1566 return true; 1567 } 1568 1569 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1570 MachineInstr &MI, MachineRegisterInfo &MRI, 1571 MachineIRBuilder &B) const { 1572 // TODO: Should move some of this into LegalizerHelper. 1573 1574 // TODO: Promote dynamic indexing of s16 to s32 1575 // TODO: Dynamic s64 indexing is only legal for SGPR. 1576 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); 1577 if (!IdxVal) // Dynamic case will be selected to register indexing. 1578 return true; 1579 1580 Register Dst = MI.getOperand(0).getReg(); 1581 Register Vec = MI.getOperand(1).getReg(); 1582 Register Ins = MI.getOperand(2).getReg(); 1583 1584 LLT VecTy = MRI.getType(Vec); 1585 LLT EltTy = VecTy.getElementType(); 1586 assert(EltTy == MRI.getType(Ins)); 1587 1588 B.setInstr(MI); 1589 1590 if (IdxVal.getValue() < VecTy.getNumElements()) 1591 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); 1592 else 1593 B.buildUndef(Dst); 1594 1595 MI.eraseFromParent(); 1596 return true; 1597 } 1598 1599 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) { 1600 assert(Mask.size() == 2); 1601 1602 // If one half is undef, the other is trivially in the same reg. 1603 if (Mask[0] == -1 || Mask[1] == -1) 1604 return true; 1605 return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) || 1606 ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3)); 1607 } 1608 1609 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1610 MachineInstr &MI, MachineRegisterInfo &MRI, 1611 MachineIRBuilder &B) const { 1612 const LLT V2S16 = LLT::vector(2, 16); 1613 1614 Register Dst = MI.getOperand(0).getReg(); 1615 Register Src0 = MI.getOperand(1).getReg(); 1616 LLT DstTy = MRI.getType(Dst); 1617 LLT SrcTy = MRI.getType(Src0); 1618 1619 if (SrcTy == V2S16 && DstTy == V2S16 && 1620 isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1621 return true; 1622 1623 MachineIRBuilder HelperBuilder(MI); 1624 GISelObserverWrapper DummyObserver; 1625 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1626 HelperBuilder.setInstr(MI); 1627 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1628 } 1629 1630 bool AMDGPULegalizerInfo::legalizeSinCos( 1631 MachineInstr &MI, MachineRegisterInfo &MRI, 1632 MachineIRBuilder &B) const { 1633 B.setInstr(MI); 1634 1635 Register DstReg = MI.getOperand(0).getReg(); 1636 Register SrcReg = MI.getOperand(1).getReg(); 1637 LLT Ty = MRI.getType(DstReg); 1638 unsigned Flags = MI.getFlags(); 1639 1640 Register TrigVal; 1641 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1642 if (ST.hasTrigReducedRange()) { 1643 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1644 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1645 .addUse(MulVal.getReg(0)) 1646 .setMIFlags(Flags).getReg(0); 1647 } else 1648 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1649 1650 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
1651 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1652 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1653 .addUse(TrigVal) 1654 .setMIFlags(Flags); 1655 MI.eraseFromParent(); 1656 return true; 1657 } 1658 1659 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1660 Register DstReg, LLT PtrTy, 1661 MachineIRBuilder &B, const GlobalValue *GV, 1662 unsigned Offset, unsigned GAFlags) const { 1663 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1664 // to the following code sequence: 1665 // 1666 // For constant address space: 1667 // s_getpc_b64 s[0:1] 1668 // s_add_u32 s0, s0, $symbol 1669 // s_addc_u32 s1, s1, 0 1670 // 1671 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1672 // a fixup or relocation is emitted to replace $symbol with a literal 1673 // constant, which is a pc-relative offset from the encoding of the $symbol 1674 // operand to the global variable. 1675 // 1676 // For global address space: 1677 // s_getpc_b64 s[0:1] 1678 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1679 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1680 // 1681 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1682 // fixups or relocations are emitted to replace $symbol@*@lo and 1683 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1684 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1685 // operand to the global variable. 1686 // 1687 // What we want here is an offset from the value returned by s_getpc 1688 // (which is the address of the s_add_u32 instruction) to the global 1689 // variable, but since the encoding of $symbol starts 4 bytes after the start 1690 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1691 // small. This requires us to add 4 to the global variable offset in order to 1692 // compute the correct address. 1693 1694 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1695 1696 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1697 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1698 1699 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1700 .addDef(PCReg); 1701 1702 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1703 if (GAFlags == SIInstrInfo::MO_NONE) 1704 MIB.addImm(0); 1705 else 1706 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1707 1708 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1709 1710 if (PtrTy.getSizeInBits() == 32) 1711 B.buildExtract(DstReg, PCReg, 0); 1712 return true; 1713 } 1714 1715 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1716 MachineInstr &MI, MachineRegisterInfo &MRI, 1717 MachineIRBuilder &B) const { 1718 Register DstReg = MI.getOperand(0).getReg(); 1719 LLT Ty = MRI.getType(DstReg); 1720 unsigned AS = Ty.getAddressSpace(); 1721 1722 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1723 MachineFunction &MF = B.getMF(); 1724 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1725 B.setInstr(MI); 1726 1727 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1728 if (!MFI->isEntryFunction()) { 1729 const Function &Fn = MF.getFunction(); 1730 DiagnosticInfoUnsupported BadLDSDecl( 1731 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1732 Fn.getContext().diagnose(BadLDSDecl); 1733 } 1734 1735 // TODO: We could emit code to handle the initialization somewhere. 
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
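  // G_FMAD is only kept legal when denormal results for the type are flushed;
  // otherwise fall through and let LegalizerHelper::lowerFMad expand it.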
  if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(SITargetLowering::isFlatGlobalAddrSpace(
           MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

  B.setInstr(MI);
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineInstr *&Br) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  if (UseMI.getParent() != MI.getParent() ||
      UseMI.getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR
  MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
  if (Next != MI.getParent()->end()) {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
  }

  return &UseMI;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
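  // The copy is emitted in the entry block so that uses of the preloaded
  // register in any later block still see a defined virtual register.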
1912 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 1913 if (!MRI.getVRegDef(LiveIn)) { 1914 // FIXME: Should have scoped insert pt 1915 MachineBasicBlock &OrigInsBB = B.getMBB(); 1916 auto OrigInsPt = B.getInsertPt(); 1917 1918 MachineBasicBlock &EntryMBB = B.getMF().front(); 1919 EntryMBB.addLiveIn(Arg->getRegister()); 1920 B.setInsertPt(EntryMBB, EntryMBB.begin()); 1921 B.buildCopy(LiveIn, Arg->getRegister()); 1922 1923 B.setInsertPt(OrigInsBB, OrigInsPt); 1924 } 1925 1926 return true; 1927 } 1928 1929 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 1930 MachineInstr &MI, 1931 MachineRegisterInfo &MRI, 1932 MachineIRBuilder &B, 1933 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 1934 B.setInstr(MI); 1935 1936 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 1937 1938 const ArgDescriptor *Arg; 1939 const TargetRegisterClass *RC; 1940 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 1941 if (!Arg) { 1942 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 1943 return false; 1944 } 1945 1946 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 1947 MI.eraseFromParent(); 1948 return true; 1949 } 1950 1951 return false; 1952 } 1953 1954 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 1955 MachineRegisterInfo &MRI, 1956 MachineIRBuilder &B) const { 1957 B.setInstr(MI); 1958 Register Dst = MI.getOperand(0).getReg(); 1959 LLT DstTy = MRI.getType(Dst); 1960 LLT S16 = LLT::scalar(16); 1961 LLT S32 = LLT::scalar(32); 1962 LLT S64 = LLT::scalar(64); 1963 1964 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 1965 return true; 1966 1967 if (DstTy == S16) 1968 return legalizeFDIV16(MI, MRI, B); 1969 if (DstTy == S32) 1970 return legalizeFDIV32(MI, MRI, B); 1971 if (DstTy == S64) 1972 return legalizeFDIV64(MI, MRI, B); 1973 1974 return false; 1975 } 1976 1977 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 1978 MachineRegisterInfo &MRI, 1979 MachineIRBuilder &B) const { 1980 Register Res = MI.getOperand(0).getReg(); 1981 Register LHS = MI.getOperand(1).getReg(); 1982 Register RHS = MI.getOperand(2).getReg(); 1983 1984 uint16_t Flags = MI.getFlags(); 1985 1986 LLT ResTy = MRI.getType(Res); 1987 LLT S32 = LLT::scalar(32); 1988 LLT S64 = LLT::scalar(64); 1989 1990 const MachineFunction &MF = B.getMF(); 1991 bool Unsafe = 1992 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 1993 1994 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 1995 return false; 1996 1997 if (!Unsafe && ResTy == S32 && 1998 MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals) 1999 return false; 2000 2001 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2002 // 1 / x -> RCP(x) 2003 if (CLHS->isExactlyValue(1.0)) { 2004 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2005 .addUse(RHS) 2006 .setMIFlags(Flags); 2007 2008 MI.eraseFromParent(); 2009 return true; 2010 } 2011 2012 // -1 / x -> RCP( FNEG(x) ) 2013 if (CLHS->isExactlyValue(-1.0)) { 2014 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2015 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2016 .addUse(FNeg.getReg(0)) 2017 .setMIFlags(Flags); 2018 2019 MI.eraseFromParent(); 2020 return true; 2021 } 2022 } 2023 2024 // x / y -> x * (1.0 / y) 2025 if (Unsafe) { 2026 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2027 .addUse(RHS) 2028 .setMIFlags(Flags); 2029 B.buildFMul(Res, LHS, RCP, Flags); 2030 2031 MI.eraseFromParent(); 2032 return true; 2033 } 2034 2035 return false; 2036 } 2037 2038 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2039 MachineRegisterInfo &MRI, 2040 MachineIRBuilder &B) const { 2041 B.setInstr(MI); 2042 Register Res = MI.getOperand(0).getReg(); 2043 Register LHS = MI.getOperand(1).getReg(); 2044 Register RHS = MI.getOperand(2).getReg(); 2045 2046 uint16_t Flags = MI.getFlags(); 2047 2048 LLT S16 = LLT::scalar(16); 2049 LLT S32 = LLT::scalar(32); 2050 2051 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2052 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2053 2054 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2055 .addUse(RHSExt.getReg(0)) 2056 .setMIFlags(Flags); 2057 2058 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2059 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2060 2061 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2062 .addUse(RDst.getReg(0)) 2063 .addUse(RHS) 2064 .addUse(LHS) 2065 .setMIFlags(Flags); 2066 2067 MI.eraseFromParent(); 2068 return true; 2069 } 2070 2071 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2072 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2073 static void toggleSPDenormMode(bool Enable, 2074 MachineIRBuilder &B, 2075 const GCNSubtarget &ST, 2076 AMDGPU::SIModeRegisterDefaults Mode) { 2077 // Set SP denorm mode to this value. 2078 unsigned SPDenormMode = 2079 Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; 2080 2081 if (ST.hasDenormModeInst()) { 2082 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2083 unsigned DPDenormModeDefault = Mode.FP64FP16Denormals 2084 ? FP_DENORM_FLUSH_NONE 2085 : FP_DENORM_FLUSH_IN_FLUSH_OUT; 2086 2087 unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2088 B.buildInstr(AMDGPU::S_DENORM_MODE) 2089 .addImm(NewDenormModeValue); 2090 2091 } else { 2092 // Select FP32 bit field in mode register. 2093 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2094 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2095 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2096 2097 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2098 .addImm(SPDenormMode) 2099 .addImm(SPDenormModeBitField); 2100 } 2101 } 2102 2103 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2104 MachineRegisterInfo &MRI, 2105 MachineIRBuilder &B) const { 2106 B.setInstr(MI); 2107 Register Res = MI.getOperand(0).getReg(); 2108 Register LHS = MI.getOperand(1).getReg(); 2109 Register RHS = MI.getOperand(2).getReg(); 2110 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2111 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2112 2113 uint16_t Flags = MI.getFlags(); 2114 2115 LLT S32 = LLT::scalar(32); 2116 LLT S1 = LLT::scalar(1); 2117 2118 auto One = B.buildFConstant(S32, 1.0f); 2119 2120 auto DenominatorScaled = 2121 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2122 .addUse(RHS) 2123 .addUse(LHS) 2124 .addImm(1) 2125 .setMIFlags(Flags); 2126 auto NumeratorScaled = 2127 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2128 .addUse(LHS) 2129 .addUse(RHS) 2130 .addImm(0) 2131 .setMIFlags(Flags); 2132 2133 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2134 .addUse(DenominatorScaled.getReg(0)) 2135 .setMIFlags(Flags); 2136 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2137 2138 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2139 // aren't modeled as reading it. 
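  // The expansion below follows the usual div_scale/rcp refinement (a rough
  // sketch, with the FMA refinement steps elided):
  //   %den, %c0 = int_amdgcn_div_scale %rhs, %lhs, 1
  //   %num, %c1 = int_amdgcn_div_scale %lhs, %rhs, 0
  //   %rcp      = int_amdgcn_rcp %den
  //   ...         G_FMA refinement of %rcp and %num * %rcp
  //   %fmas     = int_amdgcn_div_fmas ..., %c1
  //   %res      = int_amdgcn_div_fixup %fmas, %rhs, %lhs
  // FP32 denormals are enabled around the FMA chain so intermediate values
  // are not flushed.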
  if (!Mode.FP32Denormals)
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.FP32Denormals)
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.
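    // Recompute the predicate by hand instead: compare the high 32 bits of the
    // numerator and denominator against the high halves of the two div_scale
    // results, then xor the comparisons to recover the condition div_fmas
    // expects.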
2215 2216 Scale = MRI.createGenericVirtualRegister(S1); 2217 2218 LLT S32 = LLT::scalar(32); 2219 2220 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2221 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2222 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2223 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2224 2225 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2226 Scale1Unmerge.getReg(1)); 2227 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2228 Scale0Unmerge.getReg(1)); 2229 B.buildXor(Scale, CmpNum, CmpDen); 2230 } else { 2231 Scale = DivScale1.getReg(1); 2232 } 2233 2234 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2235 .addUse(Fma4.getReg(0)) 2236 .addUse(Fma3.getReg(0)) 2237 .addUse(Mul.getReg(0)) 2238 .addUse(Scale) 2239 .setMIFlags(Flags); 2240 2241 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 2242 .addUse(Fmas.getReg(0)) 2243 .addUse(RHS) 2244 .addUse(LHS) 2245 .setMIFlags(Flags); 2246 2247 MI.eraseFromParent(); 2248 return true; 2249 } 2250 2251 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 2252 MachineRegisterInfo &MRI, 2253 MachineIRBuilder &B) const { 2254 B.setInstr(MI); 2255 Register Res = MI.getOperand(0).getReg(); 2256 Register LHS = MI.getOperand(2).getReg(); 2257 Register RHS = MI.getOperand(3).getReg(); 2258 uint16_t Flags = MI.getFlags(); 2259 2260 LLT S32 = LLT::scalar(32); 2261 LLT S1 = LLT::scalar(1); 2262 2263 auto Abs = B.buildFAbs(S32, RHS, Flags); 2264 const APFloat C0Val(1.0f); 2265 2266 auto C0 = B.buildConstant(S32, 0x6f800000); 2267 auto C1 = B.buildConstant(S32, 0x2f800000); 2268 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 2269 2270 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 2271 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 2272 2273 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 2274 2275 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2276 .addUse(Mul0.getReg(0)) 2277 .setMIFlags(Flags); 2278 2279 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 2280 2281 B.buildFMul(Res, Sel, Mul1, Flags); 2282 2283 MI.eraseFromParent(); 2284 return true; 2285 } 2286 2287 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 2288 MachineRegisterInfo &MRI, 2289 MachineIRBuilder &B) const { 2290 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2291 if (!MFI->isEntryFunction()) { 2292 return legalizePreloadedArgIntrin(MI, MRI, B, 2293 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 2294 } 2295 2296 B.setInstr(MI); 2297 2298 uint64_t Offset = 2299 ST.getTargetLowering()->getImplicitParameterOffset( 2300 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 2301 Register DstReg = MI.getOperand(0).getReg(); 2302 LLT DstTy = MRI.getType(DstReg); 2303 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 2304 2305 const ArgDescriptor *Arg; 2306 const TargetRegisterClass *RC; 2307 std::tie(Arg, RC) 2308 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2309 if (!Arg) 2310 return false; 2311 2312 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 2313 if (!loadInputValue(KernargPtrReg, B, Arg)) 2314 return false; 2315 2316 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 2317 MI.eraseFromParent(); 2318 return true; 2319 } 2320 2321 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 2322 MachineRegisterInfo &MRI, 2323 MachineIRBuilder &B, 2324 unsigned AddrSpace) const { 2325 B.setInstr(MI); 2326 
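  // A flat pointer lies in the queried segment exactly when its high 32 bits
  // match that segment's aperture base, so compare the pointer's high half
  // with the aperture register.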
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 2327 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 2328 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 2329 MI.eraseFromParent(); 2330 return true; 2331 } 2332 2333 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 2334 // offset (the offset that is included in bounds checking and swizzling, to be 2335 // split between the instruction's voffset and immoffset fields) and soffset 2336 // (the offset that is excluded from bounds checking and swizzling, to go in 2337 // the instruction's soffset field). This function takes the first kind of 2338 // offset and figures out how to split it between voffset and immoffset. 2339 std::tuple<Register, unsigned, unsigned> 2340 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 2341 Register OrigOffset) const { 2342 const unsigned MaxImm = 4095; 2343 Register BaseReg; 2344 unsigned TotalConstOffset; 2345 MachineInstr *OffsetDef; 2346 const LLT S32 = LLT::scalar(32); 2347 2348 std::tie(BaseReg, TotalConstOffset, OffsetDef) 2349 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 2350 2351 unsigned ImmOffset = TotalConstOffset; 2352 2353 // If the immediate value is too big for the immoffset field, put the value 2354 // and -4096 into the immoffset field so that the value that is copied/added 2355 // for the voffset field is a multiple of 4096, and it stands more chance 2356 // of being CSEd with the copy/add for another similar load/store. 2357 // However, do not do that rounding down to a multiple of 4096 if that is a 2358 // negative number, as it appears to be illegal to have a negative offset 2359 // in the vgpr, even if adding the immediate offset makes it positive. 2360 unsigned Overflow = ImmOffset & ~MaxImm; 2361 ImmOffset -= Overflow; 2362 if ((int32_t)Overflow < 0) { 2363 Overflow += ImmOffset; 2364 ImmOffset = 0; 2365 } 2366 2367 if (Overflow != 0) { 2368 if (!BaseReg) { 2369 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 2370 } else { 2371 auto OverflowVal = B.buildConstant(S32, Overflow); 2372 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 2373 } 2374 } 2375 2376 if (!BaseReg) 2377 BaseReg = B.buildConstant(S32, 0).getReg(0); 2378 2379 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 2380 } 2381 2382 /// Handle register layout difference for f16 images for some subtargets. 2383 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 2384 MachineRegisterInfo &MRI, 2385 Register Reg) const { 2386 if (!ST.hasUnpackedD16VMem()) 2387 return Reg; 2388 2389 const LLT S16 = LLT::scalar(16); 2390 const LLT S32 = LLT::scalar(32); 2391 LLT StoreVT = MRI.getType(Reg); 2392 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 2393 2394 auto Unmerge = B.buildUnmerge(S16, Reg); 2395 2396 SmallVector<Register, 4> WideRegs; 2397 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 2398 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 2399 2400 int NumElts = StoreVT.getNumElements(); 2401 2402 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 2403 } 2404 2405 Register AMDGPULegalizerInfo::fixStoreSourceType( 2406 MachineIRBuilder &B, Register VData, bool IsFormat) const { 2407 MachineRegisterInfo *MRI = B.getMRI(); 2408 LLT Ty = MRI->getType(VData); 2409 2410 const LLT S16 = LLT::scalar(16); 2411 2412 // Fixup illegal register types for i8 stores. 
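  // Buffer stores always operate on at least a 32-bit VGPR, so sub-dword
  // sources are any-extended here; the MMO size still controls how many bytes
  // are actually written.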
2413 if (Ty == LLT::scalar(8) || Ty == S16) { 2414 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2415 return AnyExt; 2416 } 2417 2418 if (Ty.isVector()) { 2419 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2420 if (IsFormat) 2421 return handleD16VData(B, *MRI, VData); 2422 } 2423 } 2424 2425 return VData; 2426 } 2427 2428 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 2429 MachineRegisterInfo &MRI, 2430 MachineIRBuilder &B, 2431 bool IsTyped, 2432 bool IsFormat) const { 2433 B.setInstr(MI); 2434 2435 Register VData = MI.getOperand(1).getReg(); 2436 LLT Ty = MRI.getType(VData); 2437 LLT EltTy = Ty.getScalarType(); 2438 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 2439 const LLT S32 = LLT::scalar(32); 2440 2441 VData = fixStoreSourceType(B, VData, IsFormat); 2442 Register RSrc = MI.getOperand(2).getReg(); 2443 2444 MachineMemOperand *MMO = *MI.memoperands_begin(); 2445 const int MemSize = MMO->getSize(); 2446 2447 unsigned ImmOffset; 2448 unsigned TotalOffset; 2449 2450 // The typed intrinsics add an immediate after the registers. 2451 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 2452 2453 // The struct intrinsic variants add one additional operand over raw. 2454 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 2455 Register VIndex; 2456 int OpOffset = 0; 2457 if (HasVIndex) { 2458 VIndex = MI.getOperand(3).getReg(); 2459 OpOffset = 1; 2460 } 2461 2462 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 2463 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 2464 2465 unsigned Format = 0; 2466 if (IsTyped) { 2467 Format = MI.getOperand(5 + OpOffset).getImm(); 2468 ++OpOffset; 2469 } 2470 2471 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 2472 2473 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 2474 if (TotalOffset != 0) 2475 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 2476 2477 unsigned Opc; 2478 if (IsTyped) { 2479 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 2480 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 2481 } else if (IsFormat) { 2482 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 2483 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 2484 } else { 2485 switch (MemSize) { 2486 case 1: 2487 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 2488 break; 2489 case 2: 2490 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 2491 break; 2492 default: 2493 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 2494 break; 2495 } 2496 } 2497 2498 if (!VIndex) 2499 VIndex = B.buildConstant(S32, 0).getReg(0); 2500 2501 auto MIB = B.buildInstr(Opc) 2502 .addUse(VData) // vdata 2503 .addUse(RSrc) // rsrc 2504 .addUse(VIndex) // vindex 2505 .addUse(VOffset) // voffset 2506 .addUse(SOffset) // soffset 2507 .addImm(ImmOffset); // offset(imm) 2508 2509 if (IsTyped) 2510 MIB.addImm(Format); 2511 2512 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 2513 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 2514 .addMemOperand(MMO); 2515 2516 MI.eraseFromParent(); 2517 return true; 2518 } 2519 2520 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 2521 MachineRegisterInfo &MRI, 2522 MachineIRBuilder &B, 2523 bool IsFormat, 2524 bool IsTyped) const { 2525 B.setInstr(MI); 2526 2527 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
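  // The operand layout mirrors the store case above: struct variants carry an
  // extra vindex operand and typed variants carry an extra format immediate,
  // which is what the operand-count check below distinguishes.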
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg) // vdata
    .addUse(RSrc)       // rsrc
    .addUse(VIndex)     // vindex
    .addUse(VOffset)    // voffset
    .addUse(SOffset)    // soffset
    .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The load result was widened above; narrow it back to the requested type.
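    // For example, on unpacked-D16 subtargets a <2 x s16> result comes back as
    // <2 x s32> and is repacked below; scalar sub-dword loads only need a
    // truncate.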
2625 if (IsExtLoad) 2626 B.buildTrunc(Dst, LoadDstReg); 2627 else { 2628 // Repack to original 16-bit vector result 2629 // FIXME: G_TRUNC should work, but legalization currently fails 2630 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 2631 SmallVector<Register, 4> Repack; 2632 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 2633 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 2634 B.buildMerge(Dst, Repack); 2635 } 2636 } 2637 2638 MI.eraseFromParent(); 2639 return true; 2640 } 2641 2642 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 2643 MachineIRBuilder &B, 2644 bool IsInc) const { 2645 B.setInstr(MI); 2646 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 2647 AMDGPU::G_AMDGPU_ATOMIC_DEC; 2648 B.buildInstr(Opc) 2649 .addDef(MI.getOperand(0).getReg()) 2650 .addUse(MI.getOperand(2).getReg()) 2651 .addUse(MI.getOperand(3).getReg()) 2652 .cloneMemRefs(MI); 2653 MI.eraseFromParent(); 2654 return true; 2655 } 2656 2657 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 2658 switch (IntrID) { 2659 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 2660 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 2661 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 2662 case Intrinsic::amdgcn_raw_buffer_atomic_add: 2663 case Intrinsic::amdgcn_struct_buffer_atomic_add: 2664 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 2665 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 2666 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 2667 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 2668 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 2669 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 2670 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 2671 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 2672 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 2673 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 2674 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 2675 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 2676 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 2677 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 2678 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 2679 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 2680 case Intrinsic::amdgcn_raw_buffer_atomic_and: 2681 case Intrinsic::amdgcn_struct_buffer_atomic_and: 2682 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 2683 case Intrinsic::amdgcn_raw_buffer_atomic_or: 2684 case Intrinsic::amdgcn_struct_buffer_atomic_or: 2685 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 2686 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 2687 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 2688 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 2689 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 2690 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 2691 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 2692 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 2693 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 2694 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 2695 default: 2696 llvm_unreachable("unhandled atomic opcode"); 2697 } 2698 } 2699 2700 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 2701 MachineIRBuilder &B, 2702 Intrinsic::ID IID) const { 2703 B.setInstr(MI); 2704 2705 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 2706 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 2707 2708 Register Dst = MI.getOperand(0).getReg(); 2709 Register VData = MI.getOperand(2).getReg(); 2710 2711 Register CmpVal; 2712 int OpOffset = 0; 2713 2714 if (IsCmpSwap) { 2715 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 2716 ++OpOffset; 
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

// FIXME: Needs an observer, like the custom legalizations.
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      // FIXME: Need to adjust branch targets based on unconditional branch.
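      // Rewrite the G_BRCOND user into SI_LOOP on the loop mask and drop the
      // intrinsic; the branch target is taken from the conditional branch.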
2821 Register Reg = MI.getOperand(2).getReg(); 2822 B.buildInstr(AMDGPU::SI_LOOP) 2823 .addUse(Reg) 2824 .addMBB(BrCond->getOperand(1).getMBB()); 2825 MI.eraseFromParent(); 2826 BrCond->eraseFromParent(); 2827 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 2828 return true; 2829 } 2830 2831 return false; 2832 } 2833 case Intrinsic::amdgcn_kernarg_segment_ptr: 2834 return legalizePreloadedArgIntrin( 2835 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2836 case Intrinsic::amdgcn_implicitarg_ptr: 2837 return legalizeImplicitArgPtr(MI, MRI, B); 2838 case Intrinsic::amdgcn_workitem_id_x: 2839 return legalizePreloadedArgIntrin(MI, MRI, B, 2840 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 2841 case Intrinsic::amdgcn_workitem_id_y: 2842 return legalizePreloadedArgIntrin(MI, MRI, B, 2843 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 2844 case Intrinsic::amdgcn_workitem_id_z: 2845 return legalizePreloadedArgIntrin(MI, MRI, B, 2846 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 2847 case Intrinsic::amdgcn_workgroup_id_x: 2848 return legalizePreloadedArgIntrin(MI, MRI, B, 2849 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 2850 case Intrinsic::amdgcn_workgroup_id_y: 2851 return legalizePreloadedArgIntrin(MI, MRI, B, 2852 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 2853 case Intrinsic::amdgcn_workgroup_id_z: 2854 return legalizePreloadedArgIntrin(MI, MRI, B, 2855 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 2856 case Intrinsic::amdgcn_dispatch_ptr: 2857 return legalizePreloadedArgIntrin(MI, MRI, B, 2858 AMDGPUFunctionArgInfo::DISPATCH_PTR); 2859 case Intrinsic::amdgcn_queue_ptr: 2860 return legalizePreloadedArgIntrin(MI, MRI, B, 2861 AMDGPUFunctionArgInfo::QUEUE_PTR); 2862 case Intrinsic::amdgcn_implicit_buffer_ptr: 2863 return legalizePreloadedArgIntrin( 2864 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 2865 case Intrinsic::amdgcn_dispatch_id: 2866 return legalizePreloadedArgIntrin(MI, MRI, B, 2867 AMDGPUFunctionArgInfo::DISPATCH_ID); 2868 case Intrinsic::amdgcn_fdiv_fast: 2869 return legalizeFDIVFastIntrin(MI, MRI, B); 2870 case Intrinsic::amdgcn_is_shared: 2871 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 2872 case Intrinsic::amdgcn_is_private: 2873 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 2874 case Intrinsic::amdgcn_wavefrontsize: { 2875 B.setInstr(MI); 2876 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 2877 MI.eraseFromParent(); 2878 return true; 2879 } 2880 case Intrinsic::amdgcn_raw_buffer_store: 2881 case Intrinsic::amdgcn_struct_buffer_store: 2882 return legalizeBufferStore(MI, MRI, B, false, false); 2883 case Intrinsic::amdgcn_raw_buffer_store_format: 2884 case Intrinsic::amdgcn_struct_buffer_store_format: 2885 return legalizeBufferStore(MI, MRI, B, false, true); 2886 case Intrinsic::amdgcn_raw_tbuffer_store: 2887 case Intrinsic::amdgcn_struct_tbuffer_store: 2888 return legalizeBufferStore(MI, MRI, B, true, true); 2889 case Intrinsic::amdgcn_raw_buffer_load: 2890 case Intrinsic::amdgcn_struct_buffer_load: 2891 return legalizeBufferLoad(MI, MRI, B, false, false); 2892 case Intrinsic::amdgcn_raw_buffer_load_format: 2893 case Intrinsic::amdgcn_struct_buffer_load_format: 2894 return legalizeBufferLoad(MI, MRI, B, true, false); 2895 case Intrinsic::amdgcn_raw_tbuffer_load: 2896 case Intrinsic::amdgcn_struct_tbuffer_load: 2897 return legalizeBufferLoad(MI, MRI, B, true, true); 2898 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 2899 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 2900 case Intrinsic::amdgcn_raw_buffer_atomic_add: 2901 case 
Intrinsic::amdgcn_struct_buffer_atomic_add: 2902 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 2903 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 2904 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 2905 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 2906 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 2907 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 2908 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 2909 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 2910 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 2911 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 2912 case Intrinsic::amdgcn_raw_buffer_atomic_and: 2913 case Intrinsic::amdgcn_struct_buffer_atomic_and: 2914 case Intrinsic::amdgcn_raw_buffer_atomic_or: 2915 case Intrinsic::amdgcn_struct_buffer_atomic_or: 2916 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 2917 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 2918 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 2919 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 2920 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 2921 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 2922 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 2923 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 2924 return legalizeBufferAtomic(MI, B, IntrID); 2925 case Intrinsic::amdgcn_atomic_inc: 2926 return legalizeAtomicIncDec(MI, B, true); 2927 case Intrinsic::amdgcn_atomic_dec: 2928 return legalizeAtomicIncDec(MI, B, false); 2929 default: 2930 return true; 2931 } 2932 2933 return true; 2934 } 2935