1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #if defined(_MSC_VER) || defined(__MINGW32__) 15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI 16 // from the Visual C++ cmath / math.h headers: 17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 18 #define _USE_MATH_DEFINES 19 #endif 20 21 #include "AMDGPULegalizerInfo.h" 22 23 #include "AMDGPU.h" 24 #include "AMDGPUGlobalISelUtils.h" 25 #include "AMDGPUTargetMachine.h" 26 #include "SIMachineFunctionInfo.h" 27 #include "llvm/ADT/ScopeExit.h" 28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 30 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 31 #include "llvm/CodeGen/TargetOpcodes.h" 32 #include "llvm/CodeGen/ValueTypes.h" 33 #include "llvm/IR/DerivedTypes.h" 34 #include "llvm/IR/DiagnosticInfo.h" 35 #include "llvm/IR/Type.h" 36 #include "llvm/Support/Debug.h" 37 38 #define DEBUG_TYPE "amdgpu-legalinfo" 39 40 using namespace llvm; 41 using namespace LegalizeActions; 42 using namespace LegalizeMutations; 43 using namespace LegalityPredicates; 44 using namespace MIPatternMatch; 45 46 // Round the number of elements to the next power of two elements 47 static LLT getPow2VectorType(LLT Ty) { 48 unsigned NElts = Ty.getNumElements(); 49 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 50 return 
Ty.changeNumElements(Pow2NElts); 51 } 52 53 // Round the number of bits to the next power of two bits 54 static LLT getPow2ScalarType(LLT Ty) { 55 unsigned Bits = Ty.getSizeInBits(); 56 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 57 return LLT::scalar(Pow2Bits); 58 } 59 60 static LegalityPredicate isMultiple32(unsigned TypeIdx, 61 unsigned MaxSize = 1024) { 62 return [=](const LegalityQuery &Query) { 63 const LLT Ty = Query.Types[TypeIdx]; 64 const LLT EltTy = Ty.getScalarType(); 65 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; 66 }; 67 } 68 69 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { 70 return [=](const LegalityQuery &Query) { 71 return Query.Types[TypeIdx].getSizeInBits() == Size; 72 }; 73 } 74 75 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 76 return [=](const LegalityQuery &Query) { 77 const LLT Ty = Query.Types[TypeIdx]; 78 return Ty.isVector() && 79 Ty.getNumElements() % 2 != 0 && 80 Ty.getElementType().getSizeInBits() < 32 && 81 Ty.getSizeInBits() % 32 != 0; 82 }; 83 } 84 85 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 86 return [=](const LegalityQuery &Query) { 87 const LLT Ty = Query.Types[TypeIdx]; 88 const LLT EltTy = Ty.getScalarType(); 89 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 90 }; 91 } 92 93 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 94 return [=](const LegalityQuery &Query) { 95 const LLT Ty = Query.Types[TypeIdx]; 96 const LLT EltTy = Ty.getElementType(); 97 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 98 }; 99 } 100 101 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 102 return [=](const LegalityQuery &Query) { 103 const LLT Ty = Query.Types[TypeIdx]; 104 const LLT EltTy = Ty.getElementType(); 105 unsigned Size = Ty.getSizeInBits(); 106 unsigned Pieces = (Size + 63) / 64; 107 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 108 return std::make_pair(TypeIdx, 
LLT::scalarOrVector(NewNumElts, EltTy)); 109 }; 110 } 111 112 // Increase the number of vector elements to reach the next multiple of 32-bit 113 // type. 114 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 115 return [=](const LegalityQuery &Query) { 116 const LLT Ty = Query.Types[TypeIdx]; 117 118 const LLT EltTy = Ty.getElementType(); 119 const int Size = Ty.getSizeInBits(); 120 const int EltSize = EltTy.getSizeInBits(); 121 const int NextMul32 = (Size + 31) / 32; 122 123 assert(EltSize < 32); 124 125 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 126 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 127 }; 128 } 129 130 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 131 return [=](const LegalityQuery &Query) { 132 const LLT QueryTy = Query.Types[TypeIdx]; 133 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 134 }; 135 } 136 137 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 138 return [=](const LegalityQuery &Query) { 139 const LLT QueryTy = Query.Types[TypeIdx]; 140 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 141 }; 142 } 143 144 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 145 return [=](const LegalityQuery &Query) { 146 const LLT QueryTy = Query.Types[TypeIdx]; 147 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 148 }; 149 } 150 151 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 152 // v2s16. 
153 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 154 return [=](const LegalityQuery &Query) { 155 const LLT Ty = Query.Types[TypeIdx]; 156 if (Ty.isVector()) { 157 const int EltSize = Ty.getElementType().getSizeInBits(); 158 return EltSize == 32 || EltSize == 64 || 159 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 160 EltSize == 128 || EltSize == 256; 161 } 162 163 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 164 }; 165 } 166 167 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 168 return [=](const LegalityQuery &Query) { 169 const LLT QueryTy = Query.Types[TypeIdx]; 170 return QueryTy.isVector() && QueryTy.getElementType() == Type; 171 }; 172 } 173 174 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 175 return [=](const LegalityQuery &Query) { 176 const LLT QueryTy = Query.Types[TypeIdx]; 177 if (!QueryTy.isVector()) 178 return false; 179 const LLT EltTy = QueryTy.getElementType(); 180 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 181 }; 182 } 183 184 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 185 return [=](const LegalityQuery &Query) { 186 const LLT Ty = Query.Types[TypeIdx]; 187 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 188 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 189 }; 190 } 191 192 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) { 193 return [=](const LegalityQuery &Query) { 194 return Query.Types[TypeIdx0].getSizeInBits() < 195 Query.Types[TypeIdx1].getSizeInBits(); 196 }; 197 } 198 199 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) { 200 return [=](const LegalityQuery &Query) { 201 return Query.Types[TypeIdx0].getSizeInBits() > 202 Query.Types[TypeIdx1].getSizeInBits(); 203 }; 204 } 205 206 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 207 const GCNTargetMachine &TM) 208 : ST(ST_) { 209 using namespace TargetOpcode; 210 211 auto 
GetAddrSpacePtr = [&TM](unsigned AS) { 212 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 213 }; 214 215 const LLT S1 = LLT::scalar(1); 216 const LLT S16 = LLT::scalar(16); 217 const LLT S32 = LLT::scalar(32); 218 const LLT S64 = LLT::scalar(64); 219 const LLT S128 = LLT::scalar(128); 220 const LLT S256 = LLT::scalar(256); 221 const LLT S512 = LLT::scalar(512); 222 const LLT S1024 = LLT::scalar(1024); 223 224 const LLT V2S16 = LLT::vector(2, 16); 225 const LLT V4S16 = LLT::vector(4, 16); 226 227 const LLT V2S32 = LLT::vector(2, 32); 228 const LLT V3S32 = LLT::vector(3, 32); 229 const LLT V4S32 = LLT::vector(4, 32); 230 const LLT V5S32 = LLT::vector(5, 32); 231 const LLT V6S32 = LLT::vector(6, 32); 232 const LLT V7S32 = LLT::vector(7, 32); 233 const LLT V8S32 = LLT::vector(8, 32); 234 const LLT V9S32 = LLT::vector(9, 32); 235 const LLT V10S32 = LLT::vector(10, 32); 236 const LLT V11S32 = LLT::vector(11, 32); 237 const LLT V12S32 = LLT::vector(12, 32); 238 const LLT V13S32 = LLT::vector(13, 32); 239 const LLT V14S32 = LLT::vector(14, 32); 240 const LLT V15S32 = LLT::vector(15, 32); 241 const LLT V16S32 = LLT::vector(16, 32); 242 const LLT V32S32 = LLT::vector(32, 32); 243 244 const LLT V2S64 = LLT::vector(2, 64); 245 const LLT V3S64 = LLT::vector(3, 64); 246 const LLT V4S64 = LLT::vector(4, 64); 247 const LLT V5S64 = LLT::vector(5, 64); 248 const LLT V6S64 = LLT::vector(6, 64); 249 const LLT V7S64 = LLT::vector(7, 64); 250 const LLT V8S64 = LLT::vector(8, 64); 251 const LLT V16S64 = LLT::vector(16, 64); 252 253 std::initializer_list<LLT> AllS32Vectors = 254 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 255 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 256 std::initializer_list<LLT> AllS64Vectors = 257 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 258 259 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 260 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 261 const LLT Constant32Ptr 
= GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 262 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 263 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 264 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 265 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 266 267 const LLT CodePtr = FlatPtr; 268 269 const std::initializer_list<LLT> AddrSpaces64 = { 270 GlobalPtr, ConstantPtr, FlatPtr 271 }; 272 273 const std::initializer_list<LLT> AddrSpaces32 = { 274 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 275 }; 276 277 const std::initializer_list<LLT> FPTypesBase = { 278 S32, S64 279 }; 280 281 const std::initializer_list<LLT> FPTypes16 = { 282 S32, S64, S16 283 }; 284 285 const std::initializer_list<LLT> FPTypesPK16 = { 286 S32, S64, S16, V2S16 287 }; 288 289 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 290 291 setAction({G_BRCOND, S1}, Legal); // VCC branches 292 setAction({G_BRCOND, S32}, Legal); // SCC branches 293 294 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 295 // elements for v3s16 296 getActionDefinitionsBuilder(G_PHI) 297 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 298 .legalFor(AllS32Vectors) 299 .legalFor(AllS64Vectors) 300 .legalFor(AddrSpaces64) 301 .legalFor(AddrSpaces32) 302 .clampScalar(0, S32, S256) 303 .widenScalarToNextPow2(0, 32) 304 .clampMaxNumElements(0, S32, 16) 305 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 306 .legalIf(isPointer(0)); 307 308 if (ST.hasVOP3PInsts()) { 309 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 310 .legalFor({S32, S16, V2S16}) 311 .clampScalar(0, S16, S32) 312 .clampMaxNumElements(0, S16, 2) 313 .scalarize(0) 314 .widenScalarToNextPow2(0, 32); 315 } else if (ST.has16BitInsts()) { 316 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 317 .legalFor({S32, S16}) 318 .clampScalar(0, S16, S32) 319 .scalarize(0) 320 .widenScalarToNextPow2(0, 32); 321 } else { 322 
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 323 .legalFor({S32}) 324 .clampScalar(0, S32, S32) 325 .scalarize(0); 326 } 327 328 // FIXME: Not really legal. Placeholder for custom lowering. 329 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 330 .customFor({S32, S64}) 331 .clampScalar(0, S32, S64) 332 .widenScalarToNextPow2(0, 32) 333 .scalarize(0); 334 335 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 336 .legalFor({S32}) 337 .clampScalar(0, S32, S32) 338 .scalarize(0); 339 340 // Report legal for any types we can handle anywhere. For the cases only legal 341 // on the SALU, RegBankSelect will be able to re-legalize. 342 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 343 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 344 .clampScalar(0, S32, S64) 345 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 346 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 347 .widenScalarToNextPow2(0) 348 .scalarize(0); 349 350 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 351 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 352 .legalFor({{S32, S1}, {S32, S32}}) 353 .minScalar(0, S32) 354 // TODO: .scalarize(0) 355 .lower(); 356 357 getActionDefinitionsBuilder(G_BITCAST) 358 // Don't worry about the size constraint. 
359 .legalIf(all(isRegisterType(0), isRegisterType(1))) 360 .lower(); 361 362 363 getActionDefinitionsBuilder(G_CONSTANT) 364 .legalFor({S1, S32, S64, S16, GlobalPtr, 365 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 366 .clampScalar(0, S32, S64) 367 .widenScalarToNextPow2(0) 368 .legalIf(isPointer(0)); 369 370 getActionDefinitionsBuilder(G_FCONSTANT) 371 .legalFor({S32, S64, S16}) 372 .clampScalar(0, S16, S64); 373 374 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 375 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 376 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 377 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 378 .clampScalarOrElt(0, S32, S1024) 379 .legalIf(isMultiple32(0)) 380 .widenScalarToNextPow2(0, 32) 381 .clampMaxNumElements(0, S32, 16); 382 383 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 384 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 385 .unsupportedFor({PrivatePtr}) 386 .custom(); 387 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 388 389 auto &FPOpActions = getActionDefinitionsBuilder( 390 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 391 .legalFor({S32, S64}); 392 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 393 .customFor({S32, S64}); 394 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 395 .customFor({S32, S64}); 396 397 if (ST.has16BitInsts()) { 398 if (ST.hasVOP3PInsts()) 399 FPOpActions.legalFor({S16, V2S16}); 400 else 401 FPOpActions.legalFor({S16}); 402 403 TrigActions.customFor({S16}); 404 FDIVActions.customFor({S16}); 405 } 406 407 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 408 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 409 410 if (ST.hasVOP3PInsts()) { 411 MinNumMaxNum.customFor(FPTypesPK16) 412 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 413 .clampMaxNumElements(0, S16, 2) 414 .clampScalar(0, S16, S64) 415 .scalarize(0); 416 } else if (ST.has16BitInsts()) { 417 MinNumMaxNum.customFor(FPTypes16) 418 .clampScalar(0, S16, S64) 419 .scalarize(0); 420 } else 
{ 421 MinNumMaxNum.customFor(FPTypesBase) 422 .clampScalar(0, S32, S64) 423 .scalarize(0); 424 } 425 426 if (ST.hasVOP3PInsts()) 427 FPOpActions.clampMaxNumElements(0, S16, 2); 428 429 FPOpActions 430 .scalarize(0) 431 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 432 433 TrigActions 434 .scalarize(0) 435 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 436 437 FDIVActions 438 .scalarize(0) 439 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 440 441 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 442 .legalFor(FPTypesPK16) 443 .clampMaxNumElements(0, S16, 2) 444 .scalarize(0) 445 .clampScalar(0, S16, S64); 446 447 if (ST.has16BitInsts()) { 448 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 449 .legalFor({S32, S64, S16}) 450 .scalarize(0) 451 .clampScalar(0, S16, S64); 452 } else { 453 getActionDefinitionsBuilder(G_FSQRT) 454 .legalFor({S32, S64}) 455 .scalarize(0) 456 .clampScalar(0, S32, S64); 457 458 if (ST.hasFractBug()) { 459 getActionDefinitionsBuilder(G_FFLOOR) 460 .customFor({S64}) 461 .legalFor({S32, S64}) 462 .scalarize(0) 463 .clampScalar(0, S32, S64); 464 } else { 465 getActionDefinitionsBuilder(G_FFLOOR) 466 .legalFor({S32, S64}) 467 .scalarize(0) 468 .clampScalar(0, S32, S64); 469 } 470 } 471 472 getActionDefinitionsBuilder(G_FPTRUNC) 473 .legalFor({{S32, S64}, {S16, S32}}) 474 .scalarize(0) 475 .lower(); 476 477 getActionDefinitionsBuilder(G_FPEXT) 478 .legalFor({{S64, S32}, {S32, S16}}) 479 .lowerFor({{S64, S16}}) // FIXME: Implement 480 .scalarize(0); 481 482 getActionDefinitionsBuilder(G_FSUB) 483 // Use actual fsub instruction 484 .legalFor({S32}) 485 // Must use fadd + fneg 486 .lowerFor({S64, S16, V2S16}) 487 .scalarize(0) 488 .clampScalar(0, S32, S64); 489 490 // Whether this is legal depends on the floating point mode for the function. 
491 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 492 if (ST.hasMadF16()) 493 FMad.customFor({S32, S16}); 494 else 495 FMad.customFor({S32}); 496 FMad.scalarize(0) 497 .lower(); 498 499 // TODO: Do we need to clamp maximum bitwidth? 500 getActionDefinitionsBuilder(G_TRUNC) 501 .legalIf(isScalar(0)) 502 .legalFor({{V2S16, V2S32}}) 503 .clampMaxNumElements(0, S16, 2) 504 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 505 // situations (like an invalid implicit use), we don't want to infinite loop 506 // in the legalizer. 507 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 508 .alwaysLegal(); 509 510 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 511 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 512 {S32, S1}, {S64, S1}, {S16, S1}}) 513 .scalarize(0) 514 .clampScalar(0, S32, S64) 515 .widenScalarToNextPow2(1, 32); 516 517 // TODO: Split s1->s64 during regbankselect for VALU. 518 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 519 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 520 .lowerFor({{S32, S64}}) 521 .lowerIf(typeIs(1, S1)) 522 .customFor({{S64, S64}}); 523 if (ST.has16BitInsts()) 524 IToFP.legalFor({{S16, S16}}); 525 IToFP.clampScalar(1, S32, S64) 526 .scalarize(0) 527 .widenScalarToNextPow2(1); 528 529 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 530 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 531 .customFor({{S64, S64}}); 532 if (ST.has16BitInsts()) 533 FPToI.legalFor({{S16, S16}}); 534 else 535 FPToI.minScalar(1, S32); 536 537 FPToI.minScalar(0, S32) 538 .scalarize(0) 539 .lower(); 540 541 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 542 .scalarize(0) 543 .lower(); 544 545 if (ST.has16BitInsts()) { 546 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 547 .legalFor({S16, S32, S64}) 548 .clampScalar(0, S16, S64) 549 .scalarize(0); 550 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 551 
getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 552 .legalFor({S32, S64}) 553 .clampScalar(0, S32, S64) 554 .scalarize(0); 555 } else { 556 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 557 .legalFor({S32}) 558 .customFor({S64}) 559 .clampScalar(0, S32, S64) 560 .scalarize(0); 561 } 562 563 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 564 .scalarize(0) 565 .alwaysLegal(); 566 567 auto &CmpBuilder = 568 getActionDefinitionsBuilder(G_ICMP) 569 // The compare output type differs based on the register bank of the output, 570 // so make both s1 and s32 legal. 571 // 572 // Scalar compares producing output in scc will be promoted to s32, as that 573 // is the allocatable register type that will be needed for the copy from 574 // scc. This will be promoted during RegBankSelect, and we assume something 575 // before that won't try to use s32 result types. 576 // 577 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 578 // bank. 579 .legalForCartesianProduct( 580 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 581 .legalForCartesianProduct( 582 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 583 if (ST.has16BitInsts()) { 584 CmpBuilder.legalFor({{S1, S16}}); 585 } 586 587 CmpBuilder 588 .widenScalarToNextPow2(1) 589 .clampScalar(1, S32, S64) 590 .scalarize(0) 591 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 592 593 getActionDefinitionsBuilder(G_FCMP) 594 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 595 .widenScalarToNextPow2(1) 596 .clampScalar(1, S32, S64) 597 .scalarize(0); 598 599 // FIXME: fpow has a selection pattern that should move to custom lowering. 
600 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 601 if (ST.has16BitInsts()) 602 Exp2Ops.legalFor({S32, S16}); 603 else 604 Exp2Ops.legalFor({S32}); 605 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 606 Exp2Ops.scalarize(0); 607 608 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 609 if (ST.has16BitInsts()) 610 ExpOps.customFor({{S32}, {S16}}); 611 else 612 ExpOps.customFor({S32}); 613 ExpOps.clampScalar(0, MinScalarFPTy, S32) 614 .scalarize(0); 615 616 // The 64-bit versions produce 32-bit results, but only on the SALU. 617 getActionDefinitionsBuilder(G_CTPOP) 618 .legalFor({{S32, S32}, {S32, S64}}) 619 .clampScalar(0, S32, S32) 620 .clampScalar(1, S32, S64) 621 .scalarize(0) 622 .widenScalarToNextPow2(0, 32) 623 .widenScalarToNextPow2(1, 32); 624 625 // The hardware instructions return a different result on 0 than the generic 626 // instructions expect. The hardware produces -1, but these produce the 627 // bitwidth. 628 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 629 .scalarize(0) 630 .clampScalar(0, S32, S32) 631 .clampScalar(1, S32, S64) 632 .widenScalarToNextPow2(0, 32) 633 .widenScalarToNextPow2(1, 32) 634 .lower(); 635 636 // The 64-bit versions produce 32-bit results, but only on the SALU. 637 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 638 .legalFor({{S32, S32}, {S32, S64}}) 639 .clampScalar(0, S32, S32) 640 .clampScalar(1, S32, S64) 641 .scalarize(0) 642 .widenScalarToNextPow2(0, 32) 643 .widenScalarToNextPow2(1, 32); 644 645 getActionDefinitionsBuilder(G_BITREVERSE) 646 .legalFor({S32}) 647 .clampScalar(0, S32, S32) 648 .scalarize(0); 649 650 if (ST.has16BitInsts()) { 651 getActionDefinitionsBuilder(G_BSWAP) 652 .legalFor({S16, S32, V2S16}) 653 .clampMaxNumElements(0, S16, 2) 654 // FIXME: Fixing non-power-of-2 before clamp is workaround for 655 // narrowScalar limitation. 
656 .widenScalarToNextPow2(0) 657 .clampScalar(0, S16, S32) 658 .scalarize(0); 659 660 if (ST.hasVOP3PInsts()) { 661 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 662 .legalFor({S32, S16, V2S16}) 663 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 664 .clampMaxNumElements(0, S16, 2) 665 .minScalar(0, S16) 666 .widenScalarToNextPow2(0) 667 .scalarize(0) 668 .lower(); 669 } else { 670 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 671 .legalFor({S32, S16}) 672 .widenScalarToNextPow2(0) 673 .minScalar(0, S16) 674 .scalarize(0) 675 .lower(); 676 } 677 } else { 678 // TODO: Should have same legality without v_perm_b32 679 getActionDefinitionsBuilder(G_BSWAP) 680 .legalFor({S32}) 681 .lowerIf(narrowerThan(0, 32)) 682 // FIXME: Fixing non-power-of-2 before clamp is workaround for 683 // narrowScalar limitation. 684 .widenScalarToNextPow2(0) 685 .maxScalar(0, S32) 686 .scalarize(0) 687 .lower(); 688 689 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 690 .legalFor({S32}) 691 .minScalar(0, S32) 692 .widenScalarToNextPow2(0) 693 .scalarize(0) 694 .lower(); 695 } 696 697 getActionDefinitionsBuilder(G_INTTOPTR) 698 // List the common cases 699 .legalForCartesianProduct(AddrSpaces64, {S64}) 700 .legalForCartesianProduct(AddrSpaces32, {S32}) 701 .scalarize(0) 702 // Accept any address space as long as the size matches 703 .legalIf(sameSize(0, 1)) 704 .widenScalarIf(smallerThan(1, 0), 705 [](const LegalityQuery &Query) { 706 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 707 }) 708 .narrowScalarIf(greaterThan(1, 0), 709 [](const LegalityQuery &Query) { 710 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 711 }); 712 713 getActionDefinitionsBuilder(G_PTRTOINT) 714 // List the common cases 715 .legalForCartesianProduct(AddrSpaces64, {S64}) 716 .legalForCartesianProduct(AddrSpaces32, {S32}) 717 .scalarize(0) 718 // Accept any address space as long as the size matches 719 
.legalIf(sameSize(0, 1)) 720 .widenScalarIf(smallerThan(0, 1), 721 [](const LegalityQuery &Query) { 722 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 723 }) 724 .narrowScalarIf( 725 greaterThan(0, 1), 726 [](const LegalityQuery &Query) { 727 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 728 }); 729 730 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 731 .scalarize(0) 732 .custom(); 733 734 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 735 // handle some operations by just promoting the register during 736 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 737 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 738 switch (AS) { 739 // FIXME: Private element size. 740 case AMDGPUAS::PRIVATE_ADDRESS: 741 return 32; 742 // FIXME: Check subtarget 743 case AMDGPUAS::LOCAL_ADDRESS: 744 return ST.useDS128() ? 128 : 64; 745 746 // Treat constant and global as identical. SMRD loads are sometimes usable 747 // for global loads (ideally constant address space should be eliminated) 748 // depending on the context. Legality cannot be context dependent, but 749 // RegBankSelect can split the load as necessary depending on the pointer 750 // register bank/uniformity and if the memory is invariant or not written in 751 // a kernel. 752 case AMDGPUAS::CONSTANT_ADDRESS: 753 case AMDGPUAS::GLOBAL_ADDRESS: 754 return IsLoad ? 512 : 128; 755 default: 756 return 128; 757 } 758 }; 759 760 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 761 bool IsLoad) -> bool { 762 const LLT DstTy = Query.Types[0]; 763 764 // Split vector extloads. 
765 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 766 unsigned Align = Query.MMODescrs[0].AlignInBits; 767 768 if (MemSize < DstTy.getSizeInBits()) 769 MemSize = std::max(MemSize, Align); 770 771 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 772 return true; 773 774 const LLT PtrTy = Query.Types[1]; 775 unsigned AS = PtrTy.getAddressSpace(); 776 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 777 return true; 778 779 // Catch weird sized loads that don't evenly divide into the access sizes 780 // TODO: May be able to widen depending on alignment etc. 781 unsigned NumRegs = (MemSize + 31) / 32; 782 if (NumRegs == 3) { 783 if (!ST.hasDwordx3LoadStores()) 784 return true; 785 } else { 786 // If the alignment allows, these should have been widened. 787 if (!isPowerOf2_32(NumRegs)) 788 return true; 789 } 790 791 if (Align < MemSize) { 792 const SITargetLowering *TLI = ST.getTargetLowering(); 793 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 794 } 795 796 return false; 797 }; 798 799 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool { 800 unsigned Size = Query.Types[0].getSizeInBits(); 801 if (isPowerOf2_32(Size)) 802 return false; 803 804 if (Size == 96 && ST.hasDwordx3LoadStores()) 805 return false; 806 807 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 808 if (Size >= maxSizeForAddrSpace(AddrSpace, true)) 809 return false; 810 811 unsigned Align = Query.MMODescrs[0].AlignInBits; 812 unsigned RoundedSize = NextPowerOf2(Size); 813 return (Align >= RoundedSize); 814 }; 815 816 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 817 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 818 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 819 820 // TODO: Refine based on subtargets which support unaligned access or 128-bit 821 // LDS 822 // TODO: Unsupported flat for SI. 
823 824 for (unsigned Op : {G_LOAD, G_STORE}) { 825 const bool IsStore = Op == G_STORE; 826 827 auto &Actions = getActionDefinitionsBuilder(Op); 828 // Whitelist the common cases. 829 // TODO: Loads to s16 on gfx9 830 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 831 {V2S32, GlobalPtr, 64, GlobalAlign32}, 832 {V4S32, GlobalPtr, 128, GlobalAlign32}, 833 {S128, GlobalPtr, 128, GlobalAlign32}, 834 {S64, GlobalPtr, 64, GlobalAlign32}, 835 {V2S64, GlobalPtr, 128, GlobalAlign32}, 836 {V2S16, GlobalPtr, 32, GlobalAlign32}, 837 {S32, GlobalPtr, 8, GlobalAlign8}, 838 {S32, GlobalPtr, 16, GlobalAlign16}, 839 840 {S32, LocalPtr, 32, 32}, 841 {S64, LocalPtr, 64, 32}, 842 {V2S32, LocalPtr, 64, 32}, 843 {S32, LocalPtr, 8, 8}, 844 {S32, LocalPtr, 16, 16}, 845 {V2S16, LocalPtr, 32, 32}, 846 847 {S32, PrivatePtr, 32, 32}, 848 {S32, PrivatePtr, 8, 8}, 849 {S32, PrivatePtr, 16, 16}, 850 {V2S16, PrivatePtr, 32, 32}, 851 852 {S32, FlatPtr, 32, GlobalAlign32}, 853 {S32, FlatPtr, 16, GlobalAlign16}, 854 {S32, FlatPtr, 8, GlobalAlign8}, 855 {V2S16, FlatPtr, 32, GlobalAlign32}, 856 857 {S32, ConstantPtr, 32, GlobalAlign32}, 858 {V2S32, ConstantPtr, 64, GlobalAlign32}, 859 {V4S32, ConstantPtr, 128, GlobalAlign32}, 860 {S64, ConstantPtr, 64, GlobalAlign32}, 861 {S128, ConstantPtr, 128, GlobalAlign32}, 862 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 863 Actions 864 .customIf(typeIs(1, Constant32Ptr)) 865 // Widen suitably aligned loads by loading extra elements. 
866 .moreElementsIf([=](const LegalityQuery &Query) { 867 const LLT Ty = Query.Types[0]; 868 return Op == G_LOAD && Ty.isVector() && 869 shouldWidenLoadResult(Query); 870 }, moreElementsToNextPow2(0)) 871 .widenScalarIf([=](const LegalityQuery &Query) { 872 const LLT Ty = Query.Types[0]; 873 return Op == G_LOAD && !Ty.isVector() && 874 shouldWidenLoadResult(Query); 875 }, widenScalarOrEltToNextPow2(0)) 876 .narrowScalarIf( 877 [=](const LegalityQuery &Query) -> bool { 878 return !Query.Types[0].isVector() && 879 needToSplitMemOp(Query, Op == G_LOAD); 880 }, 881 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 882 const LLT DstTy = Query.Types[0]; 883 const LLT PtrTy = Query.Types[1]; 884 885 const unsigned DstSize = DstTy.getSizeInBits(); 886 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 887 888 // Split extloads. 889 if (DstSize > MemSize) 890 return std::make_pair(0, LLT::scalar(MemSize)); 891 892 if (!isPowerOf2_32(DstSize)) { 893 // We're probably decomposing an odd sized store. Try to split 894 // to the widest type. TODO: Account for alignment. As-is it 895 // should be OK, since the new parts will be further legalized. 896 unsigned FloorSize = PowerOf2Floor(DstSize); 897 return std::make_pair(0, LLT::scalar(FloorSize)); 898 } 899 900 if (DstSize > 32 && (DstSize % 32 != 0)) { 901 // FIXME: Need a way to specify non-extload of larger size if 902 // suitably aligned. 
903 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 904 } 905 906 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 907 Op == G_LOAD); 908 if (MemSize > MaxSize) 909 return std::make_pair(0, LLT::scalar(MaxSize)); 910 911 unsigned Align = Query.MMODescrs[0].AlignInBits; 912 return std::make_pair(0, LLT::scalar(Align)); 913 }) 914 .fewerElementsIf( 915 [=](const LegalityQuery &Query) -> bool { 916 return Query.Types[0].isVector() && 917 needToSplitMemOp(Query, Op == G_LOAD); 918 }, 919 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 920 const LLT DstTy = Query.Types[0]; 921 const LLT PtrTy = Query.Types[1]; 922 923 LLT EltTy = DstTy.getElementType(); 924 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 925 Op == G_LOAD); 926 927 // FIXME: Handle widened to power of 2 results better. This ends 928 // up scalarizing. 929 // FIXME: 3 element stores scalarized on SI 930 931 // Split if it's too large for the address space. 932 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 933 unsigned NumElts = DstTy.getNumElements(); 934 unsigned EltSize = EltTy.getSizeInBits(); 935 936 if (MaxSize % EltSize == 0) { 937 return std::make_pair( 938 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 939 } 940 941 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 942 943 // FIXME: Refine when odd breakdowns handled 944 // The scalars will need to be re-legalized. 945 if (NumPieces == 1 || NumPieces >= NumElts || 946 NumElts % NumPieces != 0) 947 return std::make_pair(0, EltTy); 948 949 return std::make_pair(0, 950 LLT::vector(NumElts / NumPieces, EltTy)); 951 } 952 953 // FIXME: We could probably handle weird extending loads better. 
954 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 955 if (DstTy.getSizeInBits() > MemSize) 956 return std::make_pair(0, EltTy); 957 958 unsigned EltSize = EltTy.getSizeInBits(); 959 unsigned DstSize = DstTy.getSizeInBits(); 960 if (!isPowerOf2_32(DstSize)) { 961 // We're probably decomposing an odd sized store. Try to split 962 // to the widest type. TODO: Account for alignment. As-is it 963 // should be OK, since the new parts will be further legalized. 964 unsigned FloorSize = PowerOf2Floor(DstSize); 965 return std::make_pair( 966 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 967 } 968 969 // Need to split because of alignment. 970 unsigned Align = Query.MMODescrs[0].AlignInBits; 971 if (EltSize > Align && 972 (EltSize / Align < DstTy.getNumElements())) { 973 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 974 } 975 976 // May need relegalization for the scalars. 977 return std::make_pair(0, EltTy); 978 }) 979 .minScalar(0, S32); 980 981 if (IsStore) 982 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 983 984 // TODO: Need a bitcast lower option? 985 Actions 986 .legalIf([=](const LegalityQuery &Query) { 987 const LLT Ty0 = Query.Types[0]; 988 unsigned Size = Ty0.getSizeInBits(); 989 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 990 unsigned Align = Query.MMODescrs[0].AlignInBits; 991 992 // FIXME: Widening store from alignment not valid. 993 if (MemSize < Size) 994 MemSize = std::max(MemSize, Align); 995 996 // No extending vector loads. 
997 if (Size > MemSize && Ty0.isVector()) 998 return false; 999 1000 switch (MemSize) { 1001 case 8: 1002 case 16: 1003 return Size == 32; 1004 case 32: 1005 case 64: 1006 case 128: 1007 return true; 1008 case 96: 1009 return ST.hasDwordx3LoadStores(); 1010 case 256: 1011 case 512: 1012 return true; 1013 default: 1014 return false; 1015 } 1016 }) 1017 .widenScalarToNextPow2(0) 1018 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1019 } 1020 1021 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1022 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1023 {S32, GlobalPtr, 16, 2 * 8}, 1024 {S32, LocalPtr, 8, 8}, 1025 {S32, LocalPtr, 16, 16}, 1026 {S32, PrivatePtr, 8, 8}, 1027 {S32, PrivatePtr, 16, 16}, 1028 {S32, ConstantPtr, 8, 8}, 1029 {S32, ConstantPtr, 16, 2 * 8}}); 1030 if (ST.hasFlatAddressSpace()) { 1031 ExtLoads.legalForTypesWithMemDesc( 1032 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1033 } 1034 1035 ExtLoads.clampScalar(0, S32, S32) 1036 .widenScalarToNextPow2(0) 1037 .unsupportedIfMemSizeNotPow2() 1038 .lower(); 1039 1040 auto &Atomics = getActionDefinitionsBuilder( 1041 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1042 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1043 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1044 G_ATOMICRMW_UMIN}) 1045 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1046 {S64, GlobalPtr}, {S64, LocalPtr}}); 1047 if (ST.hasFlatAddressSpace()) { 1048 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1049 } 1050 1051 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1052 .legalFor({{S32, LocalPtr}}); 1053 1054 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1055 // demarshalling 1056 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1057 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1058 {S32, FlatPtr}, {S64, FlatPtr}}) 1059 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1060 {S32, RegionPtr}, {S64, RegionPtr}}); 1061 // TODO: Pointer types, any 
32-bit or 64-bit vector 1062 1063 // Condition should be s32 for scalar, s1 for vector. 1064 getActionDefinitionsBuilder(G_SELECT) 1065 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1066 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1067 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1068 .clampScalar(0, S16, S64) 1069 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1070 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1071 .scalarize(1) 1072 .clampMaxNumElements(0, S32, 2) 1073 .clampMaxNumElements(0, LocalPtr, 2) 1074 .clampMaxNumElements(0, PrivatePtr, 2) 1075 .scalarize(0) 1076 .widenScalarToNextPow2(0) 1077 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1078 1079 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1080 // be more flexible with the shift amount type. 1081 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1082 .legalFor({{S32, S32}, {S64, S32}}); 1083 if (ST.has16BitInsts()) { 1084 if (ST.hasVOP3PInsts()) { 1085 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 1086 .clampMaxNumElements(0, S16, 2); 1087 } else 1088 Shifts.legalFor({{S16, S32}, {S16, S16}}); 1089 1090 // TODO: Support 16-bit shift amounts 1091 Shifts.clampScalar(1, S32, S32); 1092 Shifts.clampScalar(0, S16, S64); 1093 Shifts.widenScalarToNextPow2(0, 16); 1094 } else { 1095 // Make sure we legalize the shift amount type first, as the general 1096 // expansion for the shifted type will produce much worse code if it hasn't 1097 // been truncated already. 1098 Shifts.clampScalar(1, S32, S32); 1099 Shifts.clampScalar(0, S32, S64); 1100 Shifts.widenScalarToNextPow2(0, 32); 1101 } 1102 Shifts.scalarize(0); 1103 1104 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1105 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1106 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 
0 : 1; 1107 unsigned IdxTypeIdx = 2; 1108 1109 getActionDefinitionsBuilder(Op) 1110 .customIf([=](const LegalityQuery &Query) { 1111 const LLT EltTy = Query.Types[EltTypeIdx]; 1112 const LLT VecTy = Query.Types[VecTypeIdx]; 1113 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1114 return (EltTy.getSizeInBits() == 16 || 1115 EltTy.getSizeInBits() % 32 == 0) && 1116 VecTy.getSizeInBits() % 32 == 0 && 1117 VecTy.getSizeInBits() <= 1024 && 1118 IdxTy.getSizeInBits() == 32; 1119 }) 1120 .clampScalar(EltTypeIdx, S32, S64) 1121 .clampScalar(VecTypeIdx, S32, S64) 1122 .clampScalar(IdxTypeIdx, S32, S32); 1123 } 1124 1125 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1126 .unsupportedIf([=](const LegalityQuery &Query) { 1127 const LLT &EltTy = Query.Types[1].getElementType(); 1128 return Query.Types[0] != EltTy; 1129 }); 1130 1131 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1132 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1133 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1134 1135 // FIXME: Doesn't handle extract of illegal sizes. 1136 getActionDefinitionsBuilder(Op) 1137 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1138 // FIXME: Multiples of 16 should not be legal. 
1139 .legalIf([=](const LegalityQuery &Query) { 1140 const LLT BigTy = Query.Types[BigTyIdx]; 1141 const LLT LitTy = Query.Types[LitTyIdx]; 1142 return (BigTy.getSizeInBits() % 32 == 0) && 1143 (LitTy.getSizeInBits() % 16 == 0); 1144 }) 1145 .widenScalarIf( 1146 [=](const LegalityQuery &Query) { 1147 const LLT BigTy = Query.Types[BigTyIdx]; 1148 return (BigTy.getScalarSizeInBits() < 16); 1149 }, 1150 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1151 .widenScalarIf( 1152 [=](const LegalityQuery &Query) { 1153 const LLT LitTy = Query.Types[LitTyIdx]; 1154 return (LitTy.getScalarSizeInBits() < 16); 1155 }, 1156 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1157 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1158 .widenScalarToNextPow2(BigTyIdx, 32); 1159 1160 } 1161 1162 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1163 .legalForCartesianProduct(AllS32Vectors, {S32}) 1164 .legalForCartesianProduct(AllS64Vectors, {S64}) 1165 .clampNumElements(0, V16S32, V32S32) 1166 .clampNumElements(0, V2S64, V16S64) 1167 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1168 1169 if (ST.hasScalarPackInsts()) { 1170 BuildVector 1171 // FIXME: Should probably widen s1 vectors straight to s32 1172 .minScalarOrElt(0, S16) 1173 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1174 .minScalar(1, S32); 1175 1176 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1177 .legalFor({V2S16, S32}) 1178 .lower(); 1179 BuildVector.minScalarOrElt(0, S32); 1180 } else { 1181 BuildVector.customFor({V2S16, S16}); 1182 BuildVector.minScalarOrElt(0, S32); 1183 1184 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1185 .customFor({V2S16, S32}) 1186 .lower(); 1187 } 1188 1189 BuildVector.legalIf(isRegisterType(0)); 1190 1191 // FIXME: Clamp maximum size 1192 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1193 .legalIf(isRegisterType(0)); 1194 1195 // TODO: Don't fully scalarize v2s16 pieces? 
Or combine out thosse 1196 // pre-legalize. 1197 if (ST.hasVOP3PInsts()) { 1198 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1199 .customFor({V2S16, V2S16}) 1200 .lower(); 1201 } else 1202 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1203 1204 // Merge/Unmerge 1205 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1206 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1207 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1208 1209 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1210 const LLT Ty = Query.Types[TypeIdx]; 1211 if (Ty.isVector()) { 1212 const LLT &EltTy = Ty.getElementType(); 1213 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1214 return true; 1215 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1216 return true; 1217 } 1218 return false; 1219 }; 1220 1221 auto &Builder = getActionDefinitionsBuilder(Op) 1222 // Try to widen to s16 first for small types. 1223 // TODO: Only do this on targets with legal s16 shifts 1224 .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1225 1226 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1227 .lowerFor({{S16, V2S16}}) 1228 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1229 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1230 elementTypeIs(1, S16)), 1231 changeTo(1, V2S16)) 1232 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1233 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1234 // valid. 
1235 .clampScalar(LitTyIdx, S32, S512) 1236 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1237 // Break up vectors with weird elements into scalars 1238 .fewerElementsIf( 1239 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1240 scalarize(0)) 1241 .fewerElementsIf( 1242 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1243 scalarize(1)) 1244 .clampScalar(BigTyIdx, S32, S1024); 1245 1246 if (Op == G_MERGE_VALUES) { 1247 Builder.widenScalarIf( 1248 // TODO: Use 16-bit shifts if legal for 8-bit values? 1249 [=](const LegalityQuery &Query) { 1250 const LLT Ty = Query.Types[LitTyIdx]; 1251 return Ty.getSizeInBits() < 32; 1252 }, 1253 changeTo(LitTyIdx, S32)); 1254 } 1255 1256 Builder.widenScalarIf( 1257 [=](const LegalityQuery &Query) { 1258 const LLT Ty = Query.Types[BigTyIdx]; 1259 return !isPowerOf2_32(Ty.getSizeInBits()) && 1260 Ty.getSizeInBits() % 16 != 0; 1261 }, 1262 [=](const LegalityQuery &Query) { 1263 // Pick the next power of 2, or a multiple of 64 over 128. 1264 // Whichever is smaller. 1265 const LLT &Ty = Query.Types[BigTyIdx]; 1266 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1267 if (NewSizeInBits >= 256) { 1268 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1269 if (RoundedTo < NewSizeInBits) 1270 NewSizeInBits = RoundedTo; 1271 } 1272 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1273 }) 1274 .legalIf([=](const LegalityQuery &Query) { 1275 const LLT &BigTy = Query.Types[BigTyIdx]; 1276 const LLT &LitTy = Query.Types[LitTyIdx]; 1277 1278 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1279 return false; 1280 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1281 return false; 1282 1283 return BigTy.getSizeInBits() % 16 == 0 && 1284 LitTy.getSizeInBits() % 16 == 0 && 1285 BigTy.getSizeInBits() <= 1024; 1286 }) 1287 // Any vectors left are the wrong size. Scalarize them. 
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  // Operations that are always expanded by the generic lowering.
  getActionDefinitionsBuilder({
    // TODO: Verify V_BFI_B32 is generated from expanded bit ops
    G_FCOPYSIGN,

    G_ATOMIC_CMPXCHG_WITH_SUCCESS,
    G_READ_REGISTER,
    G_WRITE_REGISTER,

    G_SADDO, G_SSUBO,

    // TODO: Implement
    G_FMINIMUM, G_FMAXIMUM,
    G_FSHL
  }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

// Entry point for opcodes marked Custom in the rule tables above. Dispatches
// to the per-opcode legalize* helpers. Returns true if the instruction was
// legalized (or is fine as-is), false to report a legalization failure.
bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    // log(x) = log2(x) / log2(e)
    return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    // log10(x) = log2(x) * (ln(2) / ln(10))
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    // Not an opcode this target custom-legalizes.
    return false;
  }

  llvm_unreachable("expected switch to return");
}

// Materialize a 32-bit value holding the aperture (the high 32 bits of the
// segment base) for the given segment address space. AS must be LOCAL_ADDRESS
// or PRIVATE_ADDRESS. Returns an invalid Register on failure.
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    // Pack the hwreg descriptor: register ID, field offset, and field width
    // (encoded as width-minus-one, per the WIDTH_M1 naming).
    unsigned Encoding =
      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    // Shift the extracted field back into position to form the 32-bit
    // aperture value. NOTE(review): assumes S_GETREG_B32 places the field at
    // bit 0 — confirm against the ISA documentation.
    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  // No aperture registers: read the aperture out of the dispatch queue
  // descriptor instead. This requires the queue pointer input argument.
  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  // 4-byte load from the queue descriptor: invariant, dereferenceable
  // constant memory.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

// Custom legalization for G_ADDRSPACE_CAST. Handles no-op casts (as
// bitcasts), casts to/from the 32-bit constant address space, and
// flat <-> local/private casts (which must translate null pointers between
// the two representations).
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    // Same representation in both spaces: rewrite in place to a bitcast.
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating
    // an extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    // A flat null pointer must become the segment null value, not a
    // truncated flat address.
    auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  // local/private -> flat: combine the 32-bit segment offset with the
  // segment's aperture to form the 64-bit flat address.
  auto SegmentNull =
    B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
    B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
    B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  // A segment null pointer must map to the flat null value.
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

// Custom lowering for 64-bit G_FRINT using the classic 2^52 trick: adding
// and subtracting copysign(2^52, x) rounds x to an integer because f64 has
// 52 fraction bits. Values whose magnitude exceeds 2^52 - 1 are already
// integral and are passed through unchanged.
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  // C1 = 2^52, C2 = largest f64 below 2^52.
  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  // |x| > C2 means x is already an integer; keep the original value.
  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

// Custom lowering for 64-bit G_FCEIL in terms of G_INTRINSIC_TRUNC.
bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  // Add is 1.0 when we must round up, otherwise 0.0.
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
1631 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1632 return true; 1633 } 1634 1635 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1636 MachineIRBuilder &B) { 1637 const unsigned FractBits = 52; 1638 const unsigned ExpBits = 11; 1639 LLT S32 = LLT::scalar(32); 1640 1641 auto Const0 = B.buildConstant(S32, FractBits - 32); 1642 auto Const1 = B.buildConstant(S32, ExpBits); 1643 1644 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1645 .addUse(Const0.getReg(0)) 1646 .addUse(Const1.getReg(0)); 1647 1648 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1649 } 1650 1651 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1652 MachineInstr &MI, MachineRegisterInfo &MRI, 1653 MachineIRBuilder &B) const { 1654 B.setInstr(MI); 1655 1656 const LLT S1 = LLT::scalar(1); 1657 const LLT S32 = LLT::scalar(32); 1658 const LLT S64 = LLT::scalar(64); 1659 1660 Register Src = MI.getOperand(1).getReg(); 1661 assert(MRI.getType(Src) == S64); 1662 1663 // TODO: Should this use extract since the low half is unused? 1664 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1665 Register Hi = Unmerge.getReg(1); 1666 1667 // Extract the upper half, since this is where we will find the sign and 1668 // exponent. 1669 auto Exp = extractF64Exponent(Hi, B); 1670 1671 const unsigned FractBits = 52; 1672 1673 // Extract the sign bit. 1674 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1675 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1676 1677 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1678 1679 const auto Zero32 = B.buildConstant(S32, 0); 1680 1681 // Extend back to 64-bits. 
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  // Mask off the fraction bits that fall below the binary point:
  // Tmp0 = Src & ~(FractMask >> Exp).
  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  // Exponent < 0: |x| < 1, so the result is a signed zero (just the sign
  // bit). Exponent > 51: no fraction bits remain, so x is already integral.
  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

// Custom lowering for 64-bit G_SITOFP/G_UITOFP with an s64 result: convert
// the two 32-bit halves separately and combine them as
// ldexp(fp(hi), 32) + uint-to-fp(lo). Only the high half carries the sign.
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
1731 bool AMDGPULegalizerInfo::legalizeFPTOI( 1732 MachineInstr &MI, MachineRegisterInfo &MRI, 1733 MachineIRBuilder &B, bool Signed) const { 1734 B.setInstr(MI); 1735 1736 Register Dst = MI.getOperand(0).getReg(); 1737 Register Src = MI.getOperand(1).getReg(); 1738 1739 const LLT S64 = LLT::scalar(64); 1740 const LLT S32 = LLT::scalar(32); 1741 1742 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1743 1744 unsigned Flags = MI.getFlags(); 1745 1746 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1747 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1748 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1749 1750 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1751 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1752 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1753 1754 auto Hi = Signed ? 1755 B.buildFPTOSI(S32, FloorMul) : 1756 B.buildFPTOUI(S32, FloorMul); 1757 auto Lo = B.buildFPTOUI(S32, Fma); 1758 1759 B.buildMerge(Dst, { Lo, Hi }); 1760 MI.eraseFromParent(); 1761 1762 return true; 1763 } 1764 1765 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1766 MachineInstr &MI, MachineRegisterInfo &MRI, 1767 MachineIRBuilder &B) const { 1768 MachineFunction &MF = B.getMF(); 1769 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1770 1771 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1772 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1773 1774 // With ieee_mode disabled, the instructions have the correct behavior 1775 // already for G_FMINNUM/G_FMAXNUM 1776 if (!MFI->getMode().IEEE) 1777 return !IsIEEEOp; 1778 1779 if (IsIEEEOp) 1780 return true; 1781 1782 MachineIRBuilder HelperBuilder(MI); 1783 GISelObserverWrapper DummyObserver; 1784 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1785 HelperBuilder.setInstr(MI); 1786 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1787 } 1788 1789 bool 
// Custom-legalize G_EXTRACT_VECTOR_ELT with a constant index into a
// bit-offset G_EXTRACT. An out-of-range constant index produces undef, and
// a dynamic index is left in place to be selected to register indexing.
AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
    MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal->Value < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

// Custom-legalize G_INSERT_VECTOR_ELT with a constant index into a
// bit-offset G_INSERT, mirroring legalizeExtractVectorElt above.
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
    MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  // Out-of-range constant index produces undef; otherwise insert at the
  // element's bit offset.
  if (IdxVal->Value < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

// Keep v2s16 shuffles whose mask is representable by VOP3P instructions;
// everything else goes through the generic shuffle lowering.
bool AMDGPULegalizerInfo::legalizeShuffleVector(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT V2S16 = LLT::vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src0);

  if (SrcTy == V2S16 && DstTy == V2S16 &&
      AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
}

// Lower G_FSIN/G_FCOS to the amdgcn sin/cos intrinsics. The hardware
// operates on revolutions rather than radians, so the input is pre-scaled
// by 1/(2*pi); targets with reduced trig range also need an explicit fract.
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty,
SrcReg, OneOver2Pi, Flags).getReg(0);

  // Pick sin vs. cos from the opcode being legalized.
  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

// Emit a SI_PC_ADD_REL_OFFSET-based pc-relative address for \p GV into
// \p DstReg. \p GAFlags selects the relocation kind (MO_NONE for a plain
// 32-bit constant-address fixup). For 32-bit pointer types the 64-bit
// result is truncated via G_EXTRACT. Always returns true.
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  // The pseudo always produces a 64-bit value; only use a scratch vreg when
  // the requested pointer type is narrower and needs a truncating extract.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

// Custom legalization for G_GLOBAL_VALUE: LDS/region globals become absolute
// offsets (or are diagnosed), other address spaces get pc-relative fixups,
// relocations, or a GOT load depending on what the target lowering requests.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      // Replace the global with its statically-assigned LDS offset.
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Neither fixup nor pc-relative relocation applies: load the address from
  // the GOT instead.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

// Custom legalization for loads: rewrite the pointer operand with an
// addrspacecast to the 64-bit constant address space. The load itself is
// kept; only its address operand changes (reported through the observer).
bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

// G_FMAD is kept legal when the relevant denormal mode is flushed (s32 with
// FP32 denormals off, s16 with FP64/FP16 denormals off); otherwise it is
// lowered generically by LegalizerHelper.
bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

// Lower G_ATOMIC_CMPXCHG on flat/global pointers to the target
// G_AMDGPU_ATOMIC_CMPXCHG pseudo, which takes the new and compare values
// packed into a 2-element vector (new value in element 0).
bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(SITargetLowering::isFlatGlobalAddrSpace(
           MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

  B.setInstr(MI);
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Lower a log of arbitrary base as log2(x) * (1 / log2(base)); the caller
// supplies the precomputed reciprocal in \p Log2BaseInverted.
bool AMDGPULegalizerInfo::legalizeFlog(
  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();
  B.setInstr(MI);

  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);

  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  MI.eraseFromParent();
  return true;
}

// Lower G_FEXP as exp2(x * log2(e)).
bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  B.setInstr(MI);

  auto K = B.buildFConstant(Ty, numbers::log2e);
  auto Mul = B.buildFMul(Ty, Src, K, Flags);
  B.buildFExp2(Dst, Mul, Flags);
  MI.eraseFromParent();
  return true;
}

// Lower G_FPOW as exp2(log2(x) * y) using the legacy (DX9-rules) multiply.
// Only s32 and s16 are handled; s16 goes through f32 because there is no
// f16 fmul_legacy.
bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  B.setInstr(MI);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  if (Ty == S32) {
    auto Log = B.buildFLog2(S32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Log.getReg(0))
      .addUse(Src1)
      .setMIFlags(Flags);
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == S16) {
    // There's no f16 fmul_legacy, so we need to convert for it.
    auto Log = B.buildFLog2(S16, Src0, Flags);
    auto Ext0 = B.buildFPExt(S32, Log, Flags);
    auto Ext1 = B.buildFPExt(S32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Ext0.getReg(0))
      .addUse(Ext1.getReg(0))
      .setMIFlags(Flags);

    B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
  } else
    return false;

  MI.eraseFromParent();
  return true;
}

// Find a source register, ignoring any possible source modifiers.
// Look through at most one G_FNEG followed by at most one G_FABS (in either
// combination) and return the unmodified source register.
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
  Register ModSrc = OrigSrc;
  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
    ModSrc = SrcFNeg->getOperand(1).getReg();
    if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
      ModSrc = SrcFAbs->getOperand(1).getReg();
  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
    ModSrc = SrcFAbs->getOperand(1).getReg();
  return ModSrc;
}

// Custom lowering for s64 G_FFLOOR on subtargets with the V_FRACT bug:
// floor(x) = x - fract(x), with fract clamped and NaN-corrected per the
// workaround described below.
bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);
  Register Dst = MI.getOperand(0).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
         "this should not have been custom lowered");

  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
  // efficient way to implement it is using V_FRACT_F64. The workaround for the
  // V_FRACT bug is:
  //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
  //
  // Convert floor(x) to (x - fract(x))

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
    .addUse(OrigSrc)
    .setMIFlags(Flags);

  // Give source modifier matching some assistance before obscuring a foldable
  // pattern.

  // TODO: We can avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  // 0x3fefffffffffffff == 0.99999999999999989 (largest double < 1.0).
  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(S64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  // Unless nnan is known, pass NaN inputs through unchanged (ordered-compare
  // select picks the raw source when it is NaN).
  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}

// Turn an illegal packed v2s16 build vector into bit operations.
// TODO: This should probably be a bitcast action in LegalizerHelper.
bool AMDGPULegalizerInfo::legalizeBuildVector(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  const LLT S32 = LLT::scalar(32);
  assert(MRI.getType(Dst) == LLT::vector(2, 16));

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  assert(MRI.getType(Src0) == LLT::scalar(16));

  B.setInstr(MI);
  // Merge the two halves into an s32, then reinterpret as v2s16.
  auto Merge = B.buildMerge(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
// Verify that the single non-debug use of \p MI's condition output is a
// G_BRCOND in the same block, optionally followed by a G_BR (returned via
// \p Br). Returns the G_BRCOND, or null if the usage pattern is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineInstr *&Br) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  if (UseMI.getParent() != MI.getParent() ||
      UseMI.getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR
  MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
  if (Next != MI.getParent()->end()) {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
  }

  return &UseMI;
}

// Ensure \p LiveIn has a defining copy from physical register \p PhyReg in
// the entry block; emits the copy (and marks the live-in) only if no def
// exists yet. Returns \p LiveIn.
Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
                                               MachineRegisterInfo &MRI,
                                               Register LiveIn,
                                               Register PhyReg) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Insert the live-in copy, if required, by defining destination virtual
  // register.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(PhyReg);
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, PhyReg);

    // Restore the caller's insertion point.
    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return LiveIn;
}

// Get (creating if needed) the virtual register tracking live-in physical
// register \p PhyReg, optionally also inserting the entry-block copy.
Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register PhyReg, LLT Ty,
                                                bool InsertLiveInCopy) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Get or create virtual live-in register
  Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
  if (!LiveIn) {
    LiveIn = MRI.createGenericVirtualRegister(Ty);
    MRI.addLiveIn(PhyReg, LiveIn);
  }

  // When the actual true copy required is from virtual register to physical
  // register (to be inserted later), live-in copy insertion from physical
  // register to virtual register is not required
  if (!InsertLiveInCopy)
    return LiveIn;

  return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
}

// Look up the preloaded-argument descriptor for \p ArgType from the function
// info; returns null (with a debug message) when the argument register was
// not set up.
const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
    MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return nullptr;
  }
  return Arg;
}

// Copy a preloaded input argument described by \p Arg into \p DstReg,
// applying the descriptor's shift/mask when the value is packed into a
// larger register. Returns false for unhandled (non-register) descriptors.
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  Register SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    // Shift the field down to bit 0 before masking.
    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else {
    B.buildCopy(DstReg, LiveIn);
  }

  return true;
}

// Replace an intrinsic that reads a preloaded argument register with a copy
// of that argument into the intrinsic's result.
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
  if (!Arg)
    return false;

  if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
    return false;

  MI.eraseFromParent();
  return true;
}

// Dispatch G_FDIV legalization: try the fast/unsafe expansion first, then
// fall back to the precise per-width lowerings.
bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

// Approximate ceil(2^32 / Src) via V_RCP_IFLAG_F32: convert to float, take
// the reciprocal, scale by 2^32 (0x4f800000), and convert back to u32.
static Register
buildDivRCP(MachineIRBuilder &B, Register Src) {
  const LLT S32 = LLT::scalar(32);

  auto Cvt0 = B.buildUITOFP(S32, Src);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
  auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
  auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
  return B.buildFPTOUI(S32, Mul).getReg(0);
}

// Emit the 32-bit unsigned div/rem expansion based on the approximate
// reciprocal, writing the quotient (or remainder when \p IsRem) to
// \p DstReg. Error in the reciprocal is corrected with compare/select
// adjustments of +/-1 on the quotient (or +/-Den on the remainder).
void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register Num,
                                                  Register Den,
                                                  bool IsRem) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
  auto RCP = buildDivRCP(B, Den);

  // RCP_LO = mul(RCP, Den)
  auto RCP_LO = B.buildMul(S32, RCP, Den);

  // RCP_HI = mulhu (RCP, Den)
  auto RCP_HI = B.buildUMulH(S32, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  auto Zero = B.buildConstant(S32, 0);
  auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
  auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);

  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  auto RCP_A_E = B.buildAdd(S32, RCP, E);

  // RCP_S_E = RCP - E
  auto RCP_S_E = B.buildSub(S32, RCP, E);

  // Tmp0 = (RCP_HI == 0 ?
RCP_A_E : RCP_S_E)
  auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  auto Quotient = B.buildUMulH(S32, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);

  // Remainder_GE_Den = Remainder >= Den
  auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);

  // Remainder_GE_Zero = Num >= Num_S_Remainder;
  auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
                                       Num, Num_S_Remainder);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  auto One = B.buildConstant(S32, 1);
  auto Quotient_A_One = B.buildAdd(S32, Quotient, One);

  // Quotient_S_One = Quotient - 1
  auto Quotient_S_One = B.buildSub(S32, Quotient, One);

  // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
  auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);

  // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
  if (IsRem) {
    Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);

    // Calculate Rem result:
    auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);

    // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
    auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);

    // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
    B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
  } else {
    B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
  }
}

// Custom lowering entry point for 32-bit G_UDIV/G_UREM.
bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
  MI.eraseFromParent();
  return true;
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32,
BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

// Custom lowering for 64-bit G_UDIV/G_UREM: two Newton-Raphson refinement
// steps of the approximate reciprocal (carried out on 32-bit halves with
// explicit carry chains), then up to two conditional correction steps on
// the quotient/remainder.
bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);

  const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register Numer = MI.getOperand(1).getReg();
  Register Denom = MI.getOperand(2).getReg();
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First refinement step: Rcp += mulhu(Rcp, -Denom * Rcp).
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  // Second refinement step on the improved estimate.
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Initial quotient estimate and remainder: Sub1 = Numer - Denom * MulHi3.
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // C3 = -1 if the remainder is still >= Denom (first correction needed).
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // C6 = -1 if a second correction step is needed.
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (IsDiv) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(MI.getOperand(0),
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
  } else {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(MI.getOperand(0),
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
  }

  MI.eraseFromParent();
  return true;
}

// Dispatch unsigned div/rem custom lowering by result width.
bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty == LLT::scalar(32))
    return legalizeUDIV_UREM32(MI, MRI, B);
  if (Ty == LLT::scalar(64))
    return legalizeUDIV_UREM64(MI, MRI, B);
  return false;
}

bool
AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI, 2712 MachineRegisterInfo &MRI, 2713 MachineIRBuilder &B) const { 2714 B.setInstr(MI); 2715 const LLT S32 = LLT::scalar(32); 2716 2717 const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM; 2718 Register DstReg = MI.getOperand(0).getReg(); 2719 Register LHS = MI.getOperand(1).getReg(); 2720 Register RHS = MI.getOperand(2).getReg(); 2721 2722 auto ThirtyOne = B.buildConstant(S32, 31); 2723 auto LHSign = B.buildAShr(S32, LHS, ThirtyOne); 2724 auto RHSign = B.buildAShr(S32, LHS, ThirtyOne); 2725 2726 LHS = B.buildAdd(S32, LHS, LHSign).getReg(0); 2727 RHS = B.buildAdd(S32, RHS, RHSign).getReg(0); 2728 2729 LHS = B.buildXor(S32, LHS, LHSign).getReg(0); 2730 RHS = B.buildXor(S32, RHS, RHSign).getReg(0); 2731 2732 Register UDivRem = MRI.createGenericVirtualRegister(S32); 2733 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem); 2734 2735 if (IsRem) { 2736 auto RSign = LHSign; // Remainder sign is the same as LHS 2737 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0); 2738 B.buildSub(DstReg, UDivRem, RSign); 2739 } else { 2740 auto DSign = B.buildXor(S32, LHSign, RHSign); 2741 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0); 2742 B.buildSub(DstReg, UDivRem, DSign); 2743 } 2744 2745 MI.eraseFromParent(); 2746 return true; 2747 } 2748 2749 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2750 MachineRegisterInfo &MRI, 2751 MachineIRBuilder &B) const { 2752 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2753 return legalizeSDIV_SREM32(MI, MRI, B); 2754 return false; 2755 } 2756 2757 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2758 MachineRegisterInfo &MRI, 2759 MachineIRBuilder &B) const { 2760 Register Res = MI.getOperand(0).getReg(); 2761 Register LHS = MI.getOperand(1).getReg(); 2762 Register RHS = MI.getOperand(2).getReg(); 2763 2764 uint16_t Flags = MI.getFlags(); 2765 2766 LLT ResTy = MRI.getType(Res); 2767 LLT S32 = LLT::scalar(32); 2768 LLT S64 = 
LLT::scalar(64);

  const MachineFunction &MF = B.getMF();
  bool Unsafe =
      MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);

  // f64 is only handled under global unsafe-fp-math.
  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
    return false;

  // rcp is not accurate enough when FP32 denormals must be preserved.
  if (!Unsafe && ResTy == S32 &&
      MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  if (Unsafe) {
    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
      .addUse(RHS)
      .setMIFlags(Flags);
    B.buildFMul(Res, LHS, RCP, Flags);

    MI.eraseFromParent();
    return true;
  }

  return false;
}

// Precise f16 division: extend to f32, multiply by rcp of the extended
// denominator, truncate, and fix up with amdgcn.div.fixup.
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))
    .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
      Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}

// Precise f32 division using the div_scale/rcp/div_fixup sequence.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
        .addUse(RHS)
        .addUse(LHS)
        .addImm(1)
        .setMIFlags(Flags);
  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
        .addUse(LHS)
        .addUse(RHS)
        .addImm(0)
        .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
      .addUse(DenominatorScaled.getReg(0))
      .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
2918 if (!Mode.allFP32Denormals()) 2919 toggleSPDenormMode(true, B, ST, Mode); 2920 2921 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2922 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2923 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2924 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2925 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2926 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2927 2928 if (!Mode.allFP32Denormals()) 2929 toggleSPDenormMode(false, B, ST, Mode); 2930 2931 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2932 .addUse(Fma4.getReg(0)) 2933 .addUse(Fma1.getReg(0)) 2934 .addUse(Fma3.getReg(0)) 2935 .addUse(NumeratorScaled.getReg(1)) 2936 .setMIFlags(Flags); 2937 2938 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2939 .addUse(Fmas.getReg(0)) 2940 .addUse(RHS) 2941 .addUse(LHS) 2942 .setMIFlags(Flags); 2943 2944 MI.eraseFromParent(); 2945 return true; 2946 } 2947 2948 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2949 MachineRegisterInfo &MRI, 2950 MachineIRBuilder &B) const { 2951 B.setInstr(MI); 2952 Register Res = MI.getOperand(0).getReg(); 2953 Register LHS = MI.getOperand(1).getReg(); 2954 Register RHS = MI.getOperand(2).getReg(); 2955 2956 uint16_t Flags = MI.getFlags(); 2957 2958 LLT S64 = LLT::scalar(64); 2959 LLT S1 = LLT::scalar(1); 2960 2961 auto One = B.buildFConstant(S64, 1.0); 2962 2963 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2964 .addUse(LHS) 2965 .addUse(RHS) 2966 .addImm(1) 2967 .setMIFlags(Flags); 2968 2969 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2970 2971 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2972 .addUse(DivScale0.getReg(0)) 2973 .setMIFlags(Flags); 2974 2975 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2976 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 2977 
auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 2978 2979 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2980 .addUse(LHS) 2981 .addUse(RHS) 2982 .addImm(0) 2983 .setMIFlags(Flags); 2984 2985 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 2986 auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags); 2987 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 2988 2989 Register Scale; 2990 if (!ST.hasUsableDivScaleConditionOutput()) { 2991 // Workaround a hardware bug on SI where the condition output from div_scale 2992 // is not usable. 2993 2994 LLT S32 = LLT::scalar(32); 2995 2996 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2997 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2998 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2999 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3000 3001 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3002 Scale1Unmerge.getReg(1)); 3003 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3004 Scale0Unmerge.getReg(1)); 3005 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3006 } else { 3007 Scale = DivScale1.getReg(1); 3008 } 3009 3010 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3011 .addUse(Fma4.getReg(0)) 3012 .addUse(Fma3.getReg(0)) 3013 .addUse(Mul.getReg(0)) 3014 .addUse(Scale) 3015 .setMIFlags(Flags); 3016 3017 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3018 .addUse(Fmas.getReg(0)) 3019 .addUse(RHS) 3020 .addUse(LHS) 3021 .setMIFlags(Flags); 3022 3023 MI.eraseFromParent(); 3024 return true; 3025 } 3026 3027 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3028 MachineRegisterInfo &MRI, 3029 MachineIRBuilder &B) const { 3030 B.setInstr(MI); 3031 Register Res = MI.getOperand(0).getReg(); 3032 Register LHS = MI.getOperand(2).getReg(); 3033 Register RHS = MI.getOperand(3).getReg(); 3034 uint16_t Flags = MI.getFlags(); 3035 3036 
LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as f32 bit patterns: if the
  // denominator's magnitude exceeds 2^96, pre-scale it by 2^-32 so the rcp
  // stays in range, then multiply the same scale back into the result.
  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

// Lower amdgcn.implicit.arg.ptr. In an entry function the implicit arguments
// live at a fixed offset past the kernarg segment pointer; in a callable
// function the pointer is a preloaded argument instead.
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

// Lower amdgcn.is.shared / amdgcn.is.private: a flat pointer is in the given
// segment iff its high 32 bits equal that segment's aperture base.
bool
AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::tuple<Register, unsigned, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  // Largest offset that fits in the instruction's 12-bit immediate field.
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned TotalConstOffset;
  MachineInstr *OffsetDef;
  const LLT S32 = LLT::scalar(32);

  std::tie(BaseReg, TotalConstOffset, OffsetDef)
    = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);

  unsigned ImmOffset = TotalConstOffset;

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // The instruction always requires a voffset register, even when zero.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
}

/// Handle register layout difference for f16 images for some subtargets.
/// On subtargets with unpacked D16 VMem, each 16-bit element must occupy its
/// own 32-bit register; repack <N x s16> into <N x s32> via any-extension.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  // getNumOperands() - 1 skips the single source operand of the unmerge.
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

// Widen or repack a buffer-store source value into a register layout the
// selected store pseudo accepts. Returns the (possibly new) source register.
Register AMDGPULegalizerInfo::fixStoreSourceType(
  MachineIRBuilder &B, Register VData, bool IsFormat) const {
  MachineRegisterInfo *MRI = B.getMRI();
  LLT Ty = MRI->getType(VData);

  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    return AnyExt;
  }

  if (Ty.isVector()) {
    // Small f16 vectors may need the unpacked layout on some subtargets.
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        return handleD16VData(B, *MRI, VData);
    }
  }

  return VData;
}

// Legalize the raw/struct buffer store family of intrinsics into the
// corresponding G_AMDGPU_BUFFER_STORE_* pseudo. IsTyped selects the tbuffer
// variants (which carry a format immediate); IsFormat selects the
// format-store variants.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  B.setInstr(MI);

  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  VData = fixStoreSourceType(B, VData, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;
  unsigned TotalOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  // Fold any constant part of the offset into the immediate field and adjust
  // the memory operand accordingly.
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  // The pseudo always takes a vindex operand; use 0 for the raw forms.
  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

// Legalize the raw/struct buffer load family of intrinsics into the
// corresponding G_AMDGPU_BUFFER_LOAD_* pseudo, widening or repacking the
// result as required by the subtarget's D16 register layout.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  B.setInstr(MI);

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ?
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  // Sub-dword scalar loads are widened to 32 bits; D16 vector results may
  // need the unpacked 32-bit-per-element layout.
  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  // The pseudo always takes a vindex operand; use 0 for the raw forms.
  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    // Emit the fixup code after the load we just created.
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened for an extending load; truncate back to the
    // original narrow type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

// Legalize amdgcn.atomic.inc / amdgcn.atomic.dec into the target atomic
// pseudo, preserving the memory operands of the original instruction.
bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  B.setInstr(MI);
  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
  B.buildInstr(Opc)
    .addDef(MI.getOperand(0).getReg())
    .addUse(MI.getOperand(2).getReg())
    .addUse(MI.getOperand(3).getReg())
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return true;
}

// Map a raw/struct buffer atomic intrinsic ID to its buffer-atomic pseudo.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}

// Legalize the raw/struct buffer atomic intrinsics into the corresponding
// G_AMDGPU_BUFFER_ATOMIC_* pseudo. cmpswap carries an extra compare operand,
// and the struct variants carry an extra vindex operand.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  B.setInstr(MI);

  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Fold any constant part of the offset into the immediate field and adjust
  // the memory operand accordingly.
  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  // The pseudo always takes a vindex operand; use 0 for the raw forms.
  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
/// vector with s16 typed elements.
static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                        SmallVectorImpl<Register> &PackedAddrs,
                                        int AddrIdx, int DimIdx, int NumVAddrs,
                                        int NumGradients) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
    Register AddrReg = MI.getOperand(I).getReg();

    if (I < DimIdx) {
      // Address operands before the coordinates are already dword sized;
      // reinterpret them as a packed <2 x s16>.
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
          ((NumGradients / 2) % 2 == 1 &&
           (I == DimIdx + (NumGradients / 2) - 1 ||
            I == DimIdx + NumGradients - 1))) {
        PackedAddrs.push_back(
          B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
            .getReg(0));
      } else {
        // Pack this operand with the next one and skip it.
        PackedAddrs.push_back(
          B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
            .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  SmallVector<Register, 8> AddrRegs(NumVAddrs);
  for (int I = 0; I != NumVAddrs; ++I) {
    AddrRegs[I] = MI.getOperand(DimIdx + I).getReg();
    // All components must already be dword typed before packing.
    assert(B.getMRI()->getType(AddrRegs[I]) == LLT::scalar(32));
  }

  // Build one vector from all components, place it in the first address slot,
  // and null out the now-unused remaining address operands.
  auto VAddr = B.buildBuildVector(LLT::vector(NumVAddrs, 32), AddrRegs);
  MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  for (int I = 1; I != NumVAddrs; ++I)
    MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
}

/// Return number of address arguments, and the number of gradients
static std::pair<int, int>
getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode) {
  const AMDGPU::MIMGDimInfo *DimInfo
    = AMDGPU::getMIMGDimInfo(ImageDimIntr->Dim);

  // Gradients/coordinates only contribute when the opcode uses them; one more
  // operand if the opcode takes a LOD, clamp, or mip argument.
  int NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
  int NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
  int NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
  int NumVAddr = BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
  return {NumVAddr, NumGradients};
}

/// Return the operand index of the dmask immediate in a non-atomic image
/// intrinsic (it follows the intrinsic ID, and the stored data for stores).
static int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
                       int NumDefs) {
  assert(!BaseOpcode->Atomic);
  return NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
}

/// Return first address operand index in an image intrinsic.
static int getImageVAddrIdxBegin(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
                                 int NumDefs) {
  // Atomics carry one or two data operands (two for cmpswap) instead of a
  // dmask immediate.
  if (BaseOpcode->Atomic)
    return NumDefs + 1 + (BaseOpcode->AtomicX2 ? 2 : 1);
  return getDMaskIdx(BaseOpcode, NumDefs) + 1;
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
3624 /// 3625 /// Depending on the subtarget, load/store with 16-bit element data need to be 3626 /// rewritten to use the low half of 32-bit registers, or directly use a packed 3627 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 3628 /// registers. 3629 /// 3630 /// We don't want to directly select image instructions just yet, but also want 3631 /// to exposes all register repacking to the legalizer/combiners. We also don't 3632 /// want a selected instrution entering RegBankSelect. In order to avoid 3633 /// defining a multitude of intermediate image instructions, directly hack on 3634 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 3635 /// now unnecessary arguments with $noreg. 3636 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3637 MachineInstr &MI, MachineIRBuilder &B, 3638 GISelChangeObserver &Observer, 3639 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3640 B.setInstr(MI); 3641 3642 const int NumDefs = MI.getNumExplicitDefs(); 3643 bool IsTFE = NumDefs == 2; 3644 // We are only processing the operands of d16 image operations on subtargets 3645 // that use the unpacked register layout, or need to repack the TFE result. 3646 3647 // TODO: Do we need to guard against already legalized intrinsics? 3648 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3649 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3650 3651 MachineRegisterInfo *MRI = B.getMRI(); 3652 const LLT S32 = LLT::scalar(32); 3653 const LLT S16 = LLT::scalar(16); 3654 const LLT V2S16 = LLT::vector(2, 16); 3655 3656 // Index of first address argument 3657 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 3658 3659 // Check for 16 bit addresses and pack if true. 
  // NOTE(review): continuation of legalizeImageIntrinsic — the function header
  // and the setup of AddrIdx/BaseOpcode/NumDefs/IsTFE is above this chunk.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  const bool IsA16 = AddrTy == S16;

  // Count of address operands and of gradient operands within them.
  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  // Atomics carry no dmask operand, so the index is invalid (-1) for them.
  const int DMaskIdx = BaseOpcode->Atomic ? -1 :
    getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      // Gather4 always returns 4 components regardless of dmask.
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  // Guarantee changedInstr() fires on every exit path below.
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFC is on, and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator
  // that MIMG addresses should be placed contiguously when it is possible to
  // do so, so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  const bool UseNSA = NumVAddrs >= 3 &&
                      ST.hasFeature(AMDGPU::FeatureNSAEncoding);

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16) {
    // FIXME: this feature is missing from gfx10. When that is fixed, this check
    // should be introduced.
    if (!ST.hasR128A16() && !ST.hasGFX10A16())
      return false;

    if (NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      // Pack pairs of 16-bit address components into 32-bit dwords.
      packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
                                  NumGradients);

      // Without NSA the addresses must form one contiguous vector register.
      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      // Rewrite the address operands in place; surplus slots are cleared to
      // NoRegister since packing reduced the operand count.
      const int NumPacked = PackedRegs.size();
      for (int I = 0; I != NumVAddrs; ++I) {
        assert(MI.getOperand(AddrIdx + I).getReg() != AMDGPU::NoRegister);

        if (I < NumPacked)
          MI.getOperand(AddrIdx + I).setReg(PackedRegs[I]);
        else
          MI.getOperand(AddrIdx + I).setReg(AMDGPU::NoRegister);
      }
    }
  } else if (!UseNSA && NumVAddrs > 1) {
    convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
  }

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    // Only D16 (16-bit element) store data needs repacking.
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Register RepackedReg = handleD16VData(B, *MRI, VData);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    // Unpacked D16: each 16-bit component occupies a full dword.
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    // Packed (or non-D16): round the payload up to a whole number of dwords.
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to use load s32 elements. We would only need
    // one cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  // Append NumElts copies of an undef of type Ty to ResultRegs.
  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::vector(3, 16);
  if (Ty == V3S16) {
    // Build <6 x s16> from padded pieces, then unmerge into the v3s16 result
    // plus a discarded v3s16 remainder.
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

/// Legalize llvm.amdgcn.s.buffer.load by rewriting it to the
/// G_AMDGPU_S_BUFFER_LOAD pseudo, attaching a memory operand, and widening
/// non-power-of-2 result sizes (continues past this chunk boundary).
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  MachineInstr &MI, MachineIRBuilder &B,
  GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const unsigned MemAlign = 4;
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant, MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
3958 if (!isPowerOf2_32(Size)) { 3959 LegalizerHelper Helper(MF, *this, Observer, B); 3960 B.setInstr(MI); 3961 3962 if (Ty.isVector()) 3963 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 3964 else 3965 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 3966 } 3967 3968 Observer.changedInstr(MI); 3969 return true; 3970 } 3971 3972 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 3973 MachineRegisterInfo &MRI, 3974 MachineIRBuilder &B) const { 3975 B.setInstr(MI); 3976 3977 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 3978 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 3979 !ST.isTrapHandlerEnabled()) { 3980 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 3981 } else { 3982 // Pass queue pointer to trap handler as input, and insert trap instruction 3983 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 3984 const ArgDescriptor *Arg = 3985 getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR); 3986 if (!Arg) 3987 return false; 3988 MachineRegisterInfo &MRI = *B.getMRI(); 3989 Register SGPR01(AMDGPU::SGPR0_SGPR1); 3990 Register LiveIn = getLiveInRegister( 3991 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), 3992 /*InsertLiveInCopy=*/false); 3993 if (!loadInputValue(LiveIn, B, Arg)) 3994 return false; 3995 B.buildCopy(SGPR01, LiveIn); 3996 B.buildInstr(AMDGPU::S_TRAP) 3997 .addImm(GCNSubtarget::TrapIDLLVMTrap) 3998 .addReg(SGPR01, RegState::Implicit); 3999 } 4000 4001 MI.eraseFromParent(); 4002 return true; 4003 } 4004 4005 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4006 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4007 B.setInstr(MI); 4008 4009 // Is non-HSA path or trap-handler disabled? 
  // then, report a warning
  // accordingly
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    // No handler available: warn instead of failing, and drop the debugtrap.
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

/// Top-level dispatcher for legalizing AMDGPU intrinsics that need custom
/// lowering: control-flow pseudos, preloaded-argument reads, buffer/image
/// operations, and traps.
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      // Branch to the explicit unconditional successor if one follows,
      // otherwise to the conditional branch's own target.
      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      // Retarget the trailing unconditional branch at the old cond target.
      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrTarget);

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      B.setInstr(MI);
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    // Wavefront size is a subtarget constant; fold it immediately.
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    // Image intrinsics are table-driven; everything else is already legal.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}