//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
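// Note that for vectors this also accepts 128- and 256-bit element types, and
// the vector case does not itself enforce the 1024-bit total size limit.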
153 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 154 return [=](const LegalityQuery &Query) { 155 const LLT Ty = Query.Types[TypeIdx]; 156 if (Ty.isVector()) { 157 const int EltSize = Ty.getElementType().getSizeInBits(); 158 return EltSize == 32 || EltSize == 64 || 159 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 160 EltSize == 128 || EltSize == 256; 161 } 162 163 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 164 }; 165 } 166 167 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 168 return [=](const LegalityQuery &Query) { 169 const LLT QueryTy = Query.Types[TypeIdx]; 170 return QueryTy.isVector() && QueryTy.getElementType() == Type; 171 }; 172 } 173 174 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 175 return [=](const LegalityQuery &Query) { 176 const LLT QueryTy = Query.Types[TypeIdx]; 177 if (!QueryTy.isVector()) 178 return false; 179 const LLT EltTy = QueryTy.getElementType(); 180 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 181 }; 182 } 183 184 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 185 return [=](const LegalityQuery &Query) { 186 const LLT Ty = Query.Types[TypeIdx]; 187 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 188 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 189 }; 190 } 191 192 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) { 193 return [=](const LegalityQuery &Query) { 194 return Query.Types[TypeIdx0].getSizeInBits() < 195 Query.Types[TypeIdx1].getSizeInBits(); 196 }; 197 } 198 199 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) { 200 return [=](const LegalityQuery &Query) { 201 return Query.Types[TypeIdx0].getSizeInBits() > 202 Query.Types[TypeIdx1].getSizeInBits(); 203 }; 204 } 205 206 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 207 const GCNTargetMachine &TM) 208 : ST(ST_) { 209 using namespace TargetOpcode; 210 211 auto GetAddrSpacePtr = [&TM](unsigned AS) { 212 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 213 }; 214 215 const LLT S1 = LLT::scalar(1); 216 const LLT S16 = LLT::scalar(16); 217 const LLT S32 = LLT::scalar(32); 218 const LLT S64 = LLT::scalar(64); 219 const LLT S128 = LLT::scalar(128); 220 const LLT S256 = LLT::scalar(256); 221 const LLT S512 = LLT::scalar(512); 222 const LLT S1024 = LLT::scalar(1024); 223 224 const LLT V2S16 = LLT::vector(2, 16); 225 const LLT V4S16 = LLT::vector(4, 16); 226 227 const LLT V2S32 = LLT::vector(2, 32); 228 const LLT V3S32 = LLT::vector(3, 32); 229 const LLT V4S32 = LLT::vector(4, 32); 230 const LLT V5S32 = LLT::vector(5, 32); 231 const LLT V6S32 = LLT::vector(6, 32); 232 const LLT V7S32 = LLT::vector(7, 32); 233 const LLT V8S32 = LLT::vector(8, 32); 234 const LLT V9S32 = LLT::vector(9, 32); 235 const LLT V10S32 = LLT::vector(10, 32); 236 const LLT V11S32 = LLT::vector(11, 32); 237 const LLT V12S32 = LLT::vector(12, 32); 238 const LLT V13S32 = LLT::vector(13, 32); 239 const LLT V14S32 = LLT::vector(14, 32); 240 const LLT V15S32 = LLT::vector(15, 32); 241 const LLT V16S32 = LLT::vector(16, 32); 242 const LLT V32S32 = LLT::vector(32, 32); 243 244 const LLT V2S64 = LLT::vector(2, 64); 245 const LLT V3S64 = LLT::vector(3, 64); 246 const LLT V4S64 = LLT::vector(4, 64); 247 const LLT V5S64 = LLT::vector(5, 64); 248 const LLT V6S64 = LLT::vector(6, 64); 249 const LLT V7S64 = LLT::vector(7, 64); 250 const LLT V8S64 = LLT::vector(8, 64); 251 const LLT V16S64 = LLT::vector(16, 64); 252 253 std::initializer_list<LLT> 
AllS32Vectors = 254 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 255 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 256 std::initializer_list<LLT> AllS64Vectors = 257 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 258 259 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 260 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 261 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 262 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 263 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 264 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 265 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 266 267 const LLT CodePtr = FlatPtr; 268 269 const std::initializer_list<LLT> AddrSpaces64 = { 270 GlobalPtr, ConstantPtr, FlatPtr 271 }; 272 273 const std::initializer_list<LLT> AddrSpaces32 = { 274 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 275 }; 276 277 const std::initializer_list<LLT> FPTypesBase = { 278 S32, S64 279 }; 280 281 const std::initializer_list<LLT> FPTypes16 = { 282 S32, S64, S16 283 }; 284 285 const std::initializer_list<LLT> FPTypesPK16 = { 286 S32, S64, S16, V2S16 287 }; 288 289 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 290 291 setAction({G_BRCOND, S1}, Legal); // VCC branches 292 setAction({G_BRCOND, S32}, Legal); // SCC branches 293 294 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 295 // elements for v3s16 296 getActionDefinitionsBuilder(G_PHI) 297 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 298 .legalFor(AllS32Vectors) 299 .legalFor(AllS64Vectors) 300 .legalFor(AddrSpaces64) 301 .legalFor(AddrSpaces32) 302 .clampScalar(0, S32, S256) 303 .widenScalarToNextPow2(0, 32) 304 .clampMaxNumElements(0, S32, 16) 305 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 306 .legalIf(isPointer(0)); 307 308 if (ST.hasVOP3PInsts()) { 309 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 310 .legalFor({S32, S16, V2S16}) 311 .clampScalar(0, S16, S32) 312 .clampMaxNumElements(0, S16, 2) 313 .scalarize(0) 314 .widenScalarToNextPow2(0, 32); 315 } else if (ST.has16BitInsts()) { 316 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 317 .legalFor({S32, S16}) 318 .clampScalar(0, S16, S32) 319 .scalarize(0) 320 .widenScalarToNextPow2(0, 32); 321 } else { 322 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 323 .legalFor({S32}) 324 .clampScalar(0, S32, S32) 325 .scalarize(0); 326 } 327 328 // FIXME: Not really legal. Placeholder for custom lowering. 329 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 330 .customFor({S32, S64}) 331 .clampScalar(0, S32, S64) 332 .widenScalarToNextPow2(0, 32) 333 .scalarize(0); 334 335 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 336 .legalFor({S32}) 337 .clampScalar(0, S32, S32) 338 .scalarize(0); 339 340 // Report legal for any types we can handle anywhere. For the cases only legal 341 // on the SALU, RegBankSelect will be able to re-legalize. 
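  // Scalars are clamped to the s32-s64 range; vectors wider than 64 bits are
  // split into 64-bit pieces, and anything still illegal is scalarized.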
342 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 343 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 344 .clampScalar(0, S32, S64) 345 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 346 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 347 .widenScalarToNextPow2(0) 348 .scalarize(0); 349 350 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 351 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 352 .legalFor({{S32, S1}, {S32, S32}}) 353 .minScalar(0, S32) 354 // TODO: .scalarize(0) 355 .lower(); 356 357 getActionDefinitionsBuilder(G_BITCAST) 358 // Don't worry about the size constraint. 359 .legalIf(all(isRegisterType(0), isRegisterType(1))) 360 .lower(); 361 362 363 getActionDefinitionsBuilder(G_CONSTANT) 364 .legalFor({S1, S32, S64, S16, GlobalPtr, 365 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 366 .clampScalar(0, S32, S64) 367 .widenScalarToNextPow2(0) 368 .legalIf(isPointer(0)); 369 370 getActionDefinitionsBuilder(G_FCONSTANT) 371 .legalFor({S32, S64, S16}) 372 .clampScalar(0, S16, S64); 373 374 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 375 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 376 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 377 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 378 .clampScalarOrElt(0, S32, S1024) 379 .legalIf(isMultiple32(0)) 380 .widenScalarToNextPow2(0, 32) 381 .clampMaxNumElements(0, S32, 16); 382 383 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 384 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 385 .unsupportedFor({PrivatePtr}) 386 .custom(); 387 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 388 389 auto &FPOpActions = getActionDefinitionsBuilder( 390 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 391 .legalFor({S32, S64}); 392 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 393 .customFor({S32, S64}); 394 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 395 .customFor({S32, S64}); 396 397 if (ST.has16BitInsts()) { 398 if (ST.hasVOP3PInsts()) 399 FPOpActions.legalFor({S16, V2S16}); 400 else 401 FPOpActions.legalFor({S16}); 402 403 TrigActions.customFor({S16}); 404 FDIVActions.customFor({S16}); 405 } 406 407 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 408 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 409 410 if (ST.hasVOP3PInsts()) { 411 MinNumMaxNum.customFor(FPTypesPK16) 412 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 413 .clampMaxNumElements(0, S16, 2) 414 .clampScalar(0, S16, S64) 415 .scalarize(0); 416 } else if (ST.has16BitInsts()) { 417 MinNumMaxNum.customFor(FPTypes16) 418 .clampScalar(0, S16, S64) 419 .scalarize(0); 420 } else { 421 MinNumMaxNum.customFor(FPTypesBase) 422 .clampScalar(0, S32, S64) 423 .scalarize(0); 424 } 425 426 if (ST.hasVOP3PInsts()) 427 FPOpActions.clampMaxNumElements(0, S16, 2); 428 429 FPOpActions 430 .scalarize(0) 431 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 432 433 TrigActions 434 .scalarize(0) 435 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 436 437 FDIVActions 438 .scalarize(0) 439 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 440 441 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 442 .legalFor(FPTypesPK16) 443 .clampMaxNumElements(0, S16, 2) 444 .scalarize(0) 445 .clampScalar(0, S16, S64); 446 447 if (ST.has16BitInsts()) { 448 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 449 .legalFor({S32, S64, S16}) 450 .scalarize(0) 451 .clampScalar(0, S16, S64); 452 } else { 453 getActionDefinitionsBuilder(G_FSQRT) 454 .legalFor({S32, S64}) 455 .scalarize(0) 456 .clampScalar(0, S32, S64); 457 458 if (ST.hasFractBug()) { 459 getActionDefinitionsBuilder(G_FFLOOR) 460 .customFor({S64}) 461 .legalFor({S32, S64}) 462 .scalarize(0) 463 .clampScalar(0, S32, S64); 464 } else { 465 getActionDefinitionsBuilder(G_FFLOOR) 466 .legalFor({S32, S64}) 467 .scalarize(0) 468 .clampScalar(0, S32, S64); 469 } 470 } 471 472 getActionDefinitionsBuilder(G_FPTRUNC) 473 .legalFor({{S32, S64}, {S16, S32}}) 474 .scalarize(0) 475 .lower(); 476 477 getActionDefinitionsBuilder(G_FPEXT) 478 .legalFor({{S64, S32}, {S32, S16}}) 479 .lowerFor({{S64, S16}}) // FIXME: Implement 480 .scalarize(0); 481 482 getActionDefinitionsBuilder(G_FSUB) 483 // Use actual fsub instruction 484 .legalFor({S32}) 485 // Must use fadd + fneg 486 .lowerFor({S64, S16, V2S16}) 487 .scalarize(0) 488 .clampScalar(0, S32, S64); 489 490 // Whether this is legal depends on the floating point mode for the function. 491 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 492 if (ST.hasMadF16()) 493 FMad.customFor({S32, S16}); 494 else 495 FMad.customFor({S32}); 496 FMad.scalarize(0) 497 .lower(); 498 499 // TODO: Do we need to clamp maximum bitwidth? 500 getActionDefinitionsBuilder(G_TRUNC) 501 .legalIf(isScalar(0)) 502 .legalFor({{V2S16, V2S32}}) 503 .clampMaxNumElements(0, S16, 2) 504 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 505 // situations (like an invalid implicit use), we don't want to infinite loop 506 // in the legalizer. 507 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 508 .alwaysLegal(); 509 510 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 511 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 512 {S32, S1}, {S64, S1}, {S16, S1}}) 513 .scalarize(0) 514 .clampScalar(0, S32, S64) 515 .widenScalarToNextPow2(1, 32); 516 517 // TODO: Split s1->s64 during regbankselect for VALU. 
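  // s64 integer sources converting to f64 use the custom expansion in
  // legalizeITOFP; boolean (s1) sources are lowered generically.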
518 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 519 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 520 .lowerFor({{S32, S64}}) 521 .lowerIf(typeIs(1, S1)) 522 .customFor({{S64, S64}}); 523 if (ST.has16BitInsts()) 524 IToFP.legalFor({{S16, S16}}); 525 IToFP.clampScalar(1, S32, S64) 526 .scalarize(0) 527 .widenScalarToNextPow2(1); 528 529 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 530 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 531 .customFor({{S64, S64}}); 532 if (ST.has16BitInsts()) 533 FPToI.legalFor({{S16, S16}}); 534 else 535 FPToI.minScalar(1, S32); 536 537 FPToI.minScalar(0, S32) 538 .scalarize(0) 539 .lower(); 540 541 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 542 .scalarize(0) 543 .lower(); 544 545 if (ST.has16BitInsts()) { 546 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 547 .legalFor({S16, S32, S64}) 548 .clampScalar(0, S16, S64) 549 .scalarize(0); 550 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 551 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 552 .legalFor({S32, S64}) 553 .clampScalar(0, S32, S64) 554 .scalarize(0); 555 } else { 556 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 557 .legalFor({S32}) 558 .customFor({S64}) 559 .clampScalar(0, S32, S64) 560 .scalarize(0); 561 } 562 563 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 564 .scalarize(0) 565 .alwaysLegal(); 566 567 auto &CmpBuilder = 568 getActionDefinitionsBuilder(G_ICMP) 569 // The compare output type differs based on the register bank of the output, 570 // so make both s1 and s32 legal. 571 // 572 // Scalar compares producing output in scc will be promoted to s32, as that 573 // is the allocatable register type that will be needed for the copy from 574 // scc. This will be promoted during RegBankSelect, and we assume something 575 // before that won't try to use s32 result types. 576 // 577 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 578 // bank. 579 .legalForCartesianProduct( 580 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 581 .legalForCartesianProduct( 582 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 583 if (ST.has16BitInsts()) { 584 CmpBuilder.legalFor({{S1, S16}}); 585 } 586 587 CmpBuilder 588 .widenScalarToNextPow2(1) 589 .clampScalar(1, S32, S64) 590 .scalarize(0) 591 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 592 593 getActionDefinitionsBuilder(G_FCMP) 594 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 595 .widenScalarToNextPow2(1) 596 .clampScalar(1, S32, S64) 597 .scalarize(0); 598 599 // FIXME: fpow has a selection pattern that should move to custom lowering. 600 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 601 if (ST.has16BitInsts()) 602 Exp2Ops.legalFor({S32, S16}); 603 else 604 Exp2Ops.legalFor({S32}); 605 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 606 Exp2Ops.scalarize(0); 607 608 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 609 if (ST.has16BitInsts()) 610 ExpOps.customFor({{S32}, {S16}}); 611 else 612 ExpOps.customFor({S32}); 613 ExpOps.clampScalar(0, MinScalarFPTy, S32) 614 .scalarize(0); 615 616 // The 64-bit versions produce 32-bit results, but only on the SALU. 
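  // The result is therefore clamped to s32 while the source may stay 64-bit.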
617 getActionDefinitionsBuilder(G_CTPOP) 618 .legalFor({{S32, S32}, {S32, S64}}) 619 .clampScalar(0, S32, S32) 620 .clampScalar(1, S32, S64) 621 .scalarize(0) 622 .widenScalarToNextPow2(0, 32) 623 .widenScalarToNextPow2(1, 32); 624 625 // The hardware instructions return a different result on 0 than the generic 626 // instructions expect. The hardware produces -1, but these produce the 627 // bitwidth. 628 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 629 .scalarize(0) 630 .clampScalar(0, S32, S32) 631 .clampScalar(1, S32, S64) 632 .widenScalarToNextPow2(0, 32) 633 .widenScalarToNextPow2(1, 32) 634 .lower(); 635 636 // The 64-bit versions produce 32-bit results, but only on the SALU. 637 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 638 .legalFor({{S32, S32}, {S32, S64}}) 639 .clampScalar(0, S32, S32) 640 .clampScalar(1, S32, S64) 641 .scalarize(0) 642 .widenScalarToNextPow2(0, 32) 643 .widenScalarToNextPow2(1, 32); 644 645 getActionDefinitionsBuilder(G_BITREVERSE) 646 .legalFor({S32}) 647 .clampScalar(0, S32, S32) 648 .scalarize(0); 649 650 if (ST.has16BitInsts()) { 651 getActionDefinitionsBuilder(G_BSWAP) 652 .legalFor({S16, S32, V2S16}) 653 .clampMaxNumElements(0, S16, 2) 654 // FIXME: Fixing non-power-of-2 before clamp is workaround for 655 // narrowScalar limitation. 656 .widenScalarToNextPow2(0) 657 .clampScalar(0, S16, S32) 658 .scalarize(0); 659 660 if (ST.hasVOP3PInsts()) { 661 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 662 .legalFor({S32, S16, V2S16}) 663 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 664 .clampMaxNumElements(0, S16, 2) 665 .minScalar(0, S16) 666 .widenScalarToNextPow2(0) 667 .scalarize(0) 668 .lower(); 669 } else { 670 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 671 .legalFor({S32, S16}) 672 .widenScalarToNextPow2(0) 673 .minScalar(0, S16) 674 .scalarize(0) 675 .lower(); 676 } 677 } else { 678 // TODO: Should have same legality without v_perm_b32 679 getActionDefinitionsBuilder(G_BSWAP) 680 .legalFor({S32}) 681 .lowerIf(narrowerThan(0, 32)) 682 // FIXME: Fixing non-power-of-2 before clamp is workaround for 683 // narrowScalar limitation. 
684 .widenScalarToNextPow2(0) 685 .maxScalar(0, S32) 686 .scalarize(0) 687 .lower(); 688 689 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 690 .legalFor({S32}) 691 .minScalar(0, S32) 692 .widenScalarToNextPow2(0) 693 .scalarize(0) 694 .lower(); 695 } 696 697 getActionDefinitionsBuilder(G_INTTOPTR) 698 // List the common cases 699 .legalForCartesianProduct(AddrSpaces64, {S64}) 700 .legalForCartesianProduct(AddrSpaces32, {S32}) 701 .scalarize(0) 702 // Accept any address space as long as the size matches 703 .legalIf(sameSize(0, 1)) 704 .widenScalarIf(smallerThan(1, 0), 705 [](const LegalityQuery &Query) { 706 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 707 }) 708 .narrowScalarIf(greaterThan(1, 0), 709 [](const LegalityQuery &Query) { 710 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 711 }); 712 713 getActionDefinitionsBuilder(G_PTRTOINT) 714 // List the common cases 715 .legalForCartesianProduct(AddrSpaces64, {S64}) 716 .legalForCartesianProduct(AddrSpaces32, {S32}) 717 .scalarize(0) 718 // Accept any address space as long as the size matches 719 .legalIf(sameSize(0, 1)) 720 .widenScalarIf(smallerThan(0, 1), 721 [](const LegalityQuery &Query) { 722 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 723 }) 724 .narrowScalarIf( 725 greaterThan(0, 1), 726 [](const LegalityQuery &Query) { 727 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 728 }); 729 730 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 731 .scalarize(0) 732 .custom(); 733 734 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 735 // handle some operations by just promoting the register during 736 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 737 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 738 switch (AS) { 739 // FIXME: Private element size. 740 case AMDGPUAS::PRIVATE_ADDRESS: 741 return 32; 742 // FIXME: Check subtarget 743 case AMDGPUAS::LOCAL_ADDRESS: 744 return ST.useDS128() ? 128 : 64; 745 746 // Treat constant and global as identical. SMRD loads are sometimes usable 747 // for global loads (ideally constant address space should be eliminated) 748 // depending on the context. Legality cannot be context dependent, but 749 // RegBankSelect can split the load as necessary depending on the pointer 750 // register bank/uniformity and if the memory is invariant or not written in 751 // a kernel. 752 case AMDGPUAS::CONSTANT_ADDRESS: 753 case AMDGPUAS::GLOBAL_ADDRESS: 754 return IsLoad ? 512 : 128; 755 default: 756 return 128; 757 } 758 }; 759 760 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 761 bool IsLoad) -> bool { 762 const LLT DstTy = Query.Types[0]; 763 764 // Split vector extloads. 765 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 766 unsigned Align = Query.MMODescrs[0].AlignInBits; 767 768 if (MemSize < DstTy.getSizeInBits()) 769 MemSize = std::max(MemSize, Align); 770 771 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 772 return true; 773 774 const LLT PtrTy = Query.Types[1]; 775 unsigned AS = PtrTy.getAddressSpace(); 776 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 777 return true; 778 779 // Catch weird sized loads that don't evenly divide into the access sizes 780 // TODO: May be able to widen depending on alignment etc. 
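    // A 3-dword (96-bit) access is only kept whole if the subtarget has
    // dwordx3 load/stores; other non-power-of-2 register counts force a split.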
781 unsigned NumRegs = (MemSize + 31) / 32; 782 if (NumRegs == 3) { 783 if (!ST.hasDwordx3LoadStores()) 784 return true; 785 } else { 786 // If the alignment allows, these should have been widened. 787 if (!isPowerOf2_32(NumRegs)) 788 return true; 789 } 790 791 if (Align < MemSize) { 792 const SITargetLowering *TLI = ST.getTargetLowering(); 793 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 794 } 795 796 return false; 797 }; 798 799 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool { 800 unsigned Size = Query.Types[0].getSizeInBits(); 801 if (isPowerOf2_32(Size)) 802 return false; 803 804 if (Size == 96 && ST.hasDwordx3LoadStores()) 805 return false; 806 807 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 808 if (Size >= maxSizeForAddrSpace(AddrSpace, true)) 809 return false; 810 811 unsigned Align = Query.MMODescrs[0].AlignInBits; 812 unsigned RoundedSize = NextPowerOf2(Size); 813 return (Align >= RoundedSize); 814 }; 815 816 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 817 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 818 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 819 820 // TODO: Refine based on subtargets which support unaligned access or 128-bit 821 // LDS 822 // TODO: Unsupported flat for SI. 823 824 for (unsigned Op : {G_LOAD, G_STORE}) { 825 const bool IsStore = Op == G_STORE; 826 827 auto &Actions = getActionDefinitionsBuilder(Op); 828 // Whitelist the common cases. 829 // TODO: Loads to s16 on gfx9 830 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 831 {V2S32, GlobalPtr, 64, GlobalAlign32}, 832 {V4S32, GlobalPtr, 128, GlobalAlign32}, 833 {S128, GlobalPtr, 128, GlobalAlign32}, 834 {S64, GlobalPtr, 64, GlobalAlign32}, 835 {V2S64, GlobalPtr, 128, GlobalAlign32}, 836 {V2S16, GlobalPtr, 32, GlobalAlign32}, 837 {S32, GlobalPtr, 8, GlobalAlign8}, 838 {S32, GlobalPtr, 16, GlobalAlign16}, 839 840 {S32, LocalPtr, 32, 32}, 841 {S64, LocalPtr, 64, 32}, 842 {V2S32, LocalPtr, 64, 32}, 843 {S32, LocalPtr, 8, 8}, 844 {S32, LocalPtr, 16, 16}, 845 {V2S16, LocalPtr, 32, 32}, 846 847 {S32, PrivatePtr, 32, 32}, 848 {S32, PrivatePtr, 8, 8}, 849 {S32, PrivatePtr, 16, 16}, 850 {V2S16, PrivatePtr, 32, 32}, 851 852 {S32, FlatPtr, 32, GlobalAlign32}, 853 {S32, FlatPtr, 16, GlobalAlign16}, 854 {S32, FlatPtr, 8, GlobalAlign8}, 855 {V2S16, FlatPtr, 32, GlobalAlign32}, 856 857 {S32, ConstantPtr, 32, GlobalAlign32}, 858 {V2S32, ConstantPtr, 64, GlobalAlign32}, 859 {V4S32, ConstantPtr, 128, GlobalAlign32}, 860 {S64, ConstantPtr, 64, GlobalAlign32}, 861 {S128, ConstantPtr, 128, GlobalAlign32}, 862 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 863 Actions 864 .customIf(typeIs(1, Constant32Ptr)) 865 // Widen suitably aligned loads by loading extra elements. 
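      // Vector results widen by adding elements; scalar results widen to the
      // next power-of-2 size.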
866 .moreElementsIf([=](const LegalityQuery &Query) { 867 const LLT Ty = Query.Types[0]; 868 return Op == G_LOAD && Ty.isVector() && 869 shouldWidenLoadResult(Query); 870 }, moreElementsToNextPow2(0)) 871 .widenScalarIf([=](const LegalityQuery &Query) { 872 const LLT Ty = Query.Types[0]; 873 return Op == G_LOAD && !Ty.isVector() && 874 shouldWidenLoadResult(Query); 875 }, widenScalarOrEltToNextPow2(0)) 876 .narrowScalarIf( 877 [=](const LegalityQuery &Query) -> bool { 878 return !Query.Types[0].isVector() && 879 needToSplitMemOp(Query, Op == G_LOAD); 880 }, 881 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 882 const LLT DstTy = Query.Types[0]; 883 const LLT PtrTy = Query.Types[1]; 884 885 const unsigned DstSize = DstTy.getSizeInBits(); 886 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 887 888 // Split extloads. 889 if (DstSize > MemSize) 890 return std::make_pair(0, LLT::scalar(MemSize)); 891 892 if (!isPowerOf2_32(DstSize)) { 893 // We're probably decomposing an odd sized store. Try to split 894 // to the widest type. TODO: Account for alignment. As-is it 895 // should be OK, since the new parts will be further legalized. 896 unsigned FloorSize = PowerOf2Floor(DstSize); 897 return std::make_pair(0, LLT::scalar(FloorSize)); 898 } 899 900 if (DstSize > 32 && (DstSize % 32 != 0)) { 901 // FIXME: Need a way to specify non-extload of larger size if 902 // suitably aligned. 903 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 904 } 905 906 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 907 Op == G_LOAD); 908 if (MemSize > MaxSize) 909 return std::make_pair(0, LLT::scalar(MaxSize)); 910 911 unsigned Align = Query.MMODescrs[0].AlignInBits; 912 return std::make_pair(0, LLT::scalar(Align)); 913 }) 914 .fewerElementsIf( 915 [=](const LegalityQuery &Query) -> bool { 916 return Query.Types[0].isVector() && 917 needToSplitMemOp(Query, Op == G_LOAD); 918 }, 919 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 920 const LLT DstTy = Query.Types[0]; 921 const LLT PtrTy = Query.Types[1]; 922 923 LLT EltTy = DstTy.getElementType(); 924 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 925 Op == G_LOAD); 926 927 // FIXME: Handle widened to power of 2 results better. This ends 928 // up scalarizing. 929 // FIXME: 3 element stores scalarized on SI 930 931 // Split if it's too large for the address space. 932 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 933 unsigned NumElts = DstTy.getNumElements(); 934 unsigned EltSize = EltTy.getSizeInBits(); 935 936 if (MaxSize % EltSize == 0) { 937 return std::make_pair( 938 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 939 } 940 941 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 942 943 // FIXME: Refine when odd breakdowns handled 944 // The scalars will need to be re-legalized. 945 if (NumPieces == 1 || NumPieces >= NumElts || 946 NumElts % NumPieces != 0) 947 return std::make_pair(0, EltTy); 948 949 return std::make_pair(0, 950 LLT::vector(NumElts / NumPieces, EltTy)); 951 } 952 953 // FIXME: We could probably handle weird extending loads better. 954 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 955 if (DstTy.getSizeInBits() > MemSize) 956 return std::make_pair(0, EltTy); 957 958 unsigned EltSize = EltTy.getSizeInBits(); 959 unsigned DstSize = DstTy.getSizeInBits(); 960 if (!isPowerOf2_32(DstSize)) { 961 // We're probably decomposing an odd sized store. Try to split 962 // to the widest type. TODO: Account for alignment. 
As-is it 963 // should be OK, since the new parts will be further legalized. 964 unsigned FloorSize = PowerOf2Floor(DstSize); 965 return std::make_pair( 966 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 967 } 968 969 // Need to split because of alignment. 970 unsigned Align = Query.MMODescrs[0].AlignInBits; 971 if (EltSize > Align && 972 (EltSize / Align < DstTy.getNumElements())) { 973 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 974 } 975 976 // May need relegalization for the scalars. 977 return std::make_pair(0, EltTy); 978 }) 979 .minScalar(0, S32); 980 981 if (IsStore) 982 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 983 984 // TODO: Need a bitcast lower option? 985 Actions 986 .legalIf([=](const LegalityQuery &Query) { 987 const LLT Ty0 = Query.Types[0]; 988 unsigned Size = Ty0.getSizeInBits(); 989 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 990 unsigned Align = Query.MMODescrs[0].AlignInBits; 991 992 // FIXME: Widening store from alignment not valid. 993 if (MemSize < Size) 994 MemSize = std::max(MemSize, Align); 995 996 // No extending vector loads. 997 if (Size > MemSize && Ty0.isVector()) 998 return false; 999 1000 switch (MemSize) { 1001 case 8: 1002 case 16: 1003 return Size == 32; 1004 case 32: 1005 case 64: 1006 case 128: 1007 return true; 1008 case 96: 1009 return ST.hasDwordx3LoadStores(); 1010 case 256: 1011 case 512: 1012 return true; 1013 default: 1014 return false; 1015 } 1016 }) 1017 .widenScalarToNextPow2(0) 1018 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1019 } 1020 1021 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1022 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1023 {S32, GlobalPtr, 16, 2 * 8}, 1024 {S32, LocalPtr, 8, 8}, 1025 {S32, LocalPtr, 16, 16}, 1026 {S32, PrivatePtr, 8, 8}, 1027 {S32, PrivatePtr, 16, 16}, 1028 {S32, ConstantPtr, 8, 8}, 1029 {S32, ConstantPtr, 16, 2 * 8}}); 1030 if (ST.hasFlatAddressSpace()) { 1031 ExtLoads.legalForTypesWithMemDesc( 1032 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1033 } 1034 1035 ExtLoads.clampScalar(0, S32, S32) 1036 .widenScalarToNextPow2(0) 1037 .unsupportedIfMemSizeNotPow2() 1038 .lower(); 1039 1040 auto &Atomics = getActionDefinitionsBuilder( 1041 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1042 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1043 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1044 G_ATOMICRMW_UMIN}) 1045 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1046 {S64, GlobalPtr}, {S64, LocalPtr}}); 1047 if (ST.hasFlatAddressSpace()) { 1048 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1049 } 1050 1051 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1052 .legalFor({{S32, LocalPtr}}); 1053 1054 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1055 // demarshalling 1056 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1057 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1058 {S32, FlatPtr}, {S64, FlatPtr}}) 1059 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1060 {S32, RegionPtr}, {S64, RegionPtr}}); 1061 // TODO: Pointer types, any 32-bit or 64-bit vector 1062 1063 // Condition should be s32 for scalar, s1 for vector. 
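  // This mirrors the G_ICMP rules above: an SCC condition is selected as s32,
  // a VCC condition as s1.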
1064 getActionDefinitionsBuilder(G_SELECT) 1065 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1066 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1067 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1068 .clampScalar(0, S16, S64) 1069 .scalarize(1) 1070 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1071 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1072 .clampMaxNumElements(0, S32, 2) 1073 .clampMaxNumElements(0, LocalPtr, 2) 1074 .clampMaxNumElements(0, PrivatePtr, 2) 1075 .scalarize(0) 1076 .widenScalarToNextPow2(0) 1077 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1078 1079 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1080 // be more flexible with the shift amount type. 1081 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1082 .legalFor({{S32, S32}, {S64, S32}}); 1083 if (ST.has16BitInsts()) { 1084 if (ST.hasVOP3PInsts()) { 1085 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1086 .clampMaxNumElements(0, S16, 2); 1087 } else 1088 Shifts.legalFor({{S16, S16}}); 1089 1090 // TODO: Support 16-bit shift amounts for all types 1091 Shifts.widenScalarIf( 1092 [=](const LegalityQuery &Query) { 1093 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1094 // 32-bit amount. 1095 const LLT ValTy = Query.Types[0]; 1096 const LLT AmountTy = Query.Types[1]; 1097 return ValTy.getSizeInBits() <= 16 && 1098 AmountTy.getSizeInBits() < 16; 1099 }, changeTo(1, S16)); 1100 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1101 Shifts.clampScalar(1, S32, S32); 1102 Shifts.clampScalar(0, S16, S64); 1103 Shifts.widenScalarToNextPow2(0, 16); 1104 } else { 1105 // Make sure we legalize the shift amount type first, as the general 1106 // expansion for the shifted type will produce much worse code if it hasn't 1107 // been truncated already. 1108 Shifts.clampScalar(1, S32, S32); 1109 Shifts.clampScalar(0, S32, S64); 1110 Shifts.widenScalarToNextPow2(0, 32); 1111 } 1112 Shifts.scalarize(0); 1113 1114 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1115 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1116 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1117 unsigned IdxTypeIdx = 2; 1118 1119 getActionDefinitionsBuilder(Op) 1120 .customIf([=](const LegalityQuery &Query) { 1121 const LLT EltTy = Query.Types[EltTypeIdx]; 1122 const LLT VecTy = Query.Types[VecTypeIdx]; 1123 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1124 return (EltTy.getSizeInBits() == 16 || 1125 EltTy.getSizeInBits() % 32 == 0) && 1126 VecTy.getSizeInBits() % 32 == 0 && 1127 VecTy.getSizeInBits() <= 1024 && 1128 IdxTy.getSizeInBits() == 32; 1129 }) 1130 .clampScalar(EltTypeIdx, S32, S64) 1131 .clampScalar(VecTypeIdx, S32, S64) 1132 .clampScalar(IdxTypeIdx, S32, S32); 1133 } 1134 1135 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1136 .unsupportedIf([=](const LegalityQuery &Query) { 1137 const LLT &EltTy = Query.Types[1].getElementType(); 1138 return Query.Types[0] != EltTy; 1139 }); 1140 1141 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1142 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1143 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1144 1145 // FIXME: Doesn't handle extract of illegal sizes. 1146 getActionDefinitionsBuilder(Op) 1147 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1148 // FIXME: Multiples of 16 should not be legal. 
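      // Currently any big type that is a multiple of 32 bits paired with a
      // little type that is a multiple of 16 bits is accepted.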
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)

      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1318 SextInReg.lowerFor({{S32}, {S64}}); 1319 } 1320 1321 SextInReg 1322 .scalarize(0) 1323 .clampScalar(0, S32, S64) 1324 .lower(); 1325 1326 getActionDefinitionsBuilder(G_FSHR) 1327 .legalFor({{S32, S32}}) 1328 .scalarize(0) 1329 .lower(); 1330 1331 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1332 .legalFor({S64}); 1333 1334 getActionDefinitionsBuilder({ 1335 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1336 G_FCOPYSIGN, 1337 1338 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1339 G_READ_REGISTER, 1340 G_WRITE_REGISTER, 1341 1342 G_SADDO, G_SSUBO, 1343 1344 // TODO: Implement 1345 G_FMINIMUM, G_FMAXIMUM, 1346 G_FSHL 1347 }).lower(); 1348 1349 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1350 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1351 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1352 .unsupported(); 1353 1354 computeTables(); 1355 verify(*ST.getInstrInfo()); 1356 } 1357 1358 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1359 MachineRegisterInfo &MRI, 1360 MachineIRBuilder &B, 1361 GISelChangeObserver &Observer) const { 1362 switch (MI.getOpcode()) { 1363 case TargetOpcode::G_ADDRSPACE_CAST: 1364 return legalizeAddrSpaceCast(MI, MRI, B); 1365 case TargetOpcode::G_FRINT: 1366 return legalizeFrint(MI, MRI, B); 1367 case TargetOpcode::G_FCEIL: 1368 return legalizeFceil(MI, MRI, B); 1369 case TargetOpcode::G_INTRINSIC_TRUNC: 1370 return legalizeIntrinsicTrunc(MI, MRI, B); 1371 case TargetOpcode::G_SITOFP: 1372 return legalizeITOFP(MI, MRI, B, true); 1373 case TargetOpcode::G_UITOFP: 1374 return legalizeITOFP(MI, MRI, B, false); 1375 case TargetOpcode::G_FPTOSI: 1376 return legalizeFPTOI(MI, MRI, B, true); 1377 case TargetOpcode::G_FPTOUI: 1378 return legalizeFPTOI(MI, MRI, B, false); 1379 case TargetOpcode::G_FMINNUM: 1380 case TargetOpcode::G_FMAXNUM: 1381 case TargetOpcode::G_FMINNUM_IEEE: 1382 case TargetOpcode::G_FMAXNUM_IEEE: 1383 return legalizeMinNumMaxNum(MI, MRI, B); 1384 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1385 return legalizeExtractVectorElt(MI, MRI, B); 1386 case TargetOpcode::G_INSERT_VECTOR_ELT: 1387 return legalizeInsertVectorElt(MI, MRI, B); 1388 case TargetOpcode::G_SHUFFLE_VECTOR: 1389 return legalizeShuffleVector(MI, MRI, B); 1390 case TargetOpcode::G_FSIN: 1391 case TargetOpcode::G_FCOS: 1392 return legalizeSinCos(MI, MRI, B); 1393 case TargetOpcode::G_GLOBAL_VALUE: 1394 return legalizeGlobalValue(MI, MRI, B); 1395 case TargetOpcode::G_LOAD: 1396 return legalizeLoad(MI, MRI, B, Observer); 1397 case TargetOpcode::G_FMAD: 1398 return legalizeFMad(MI, MRI, B); 1399 case TargetOpcode::G_FDIV: 1400 return legalizeFDIV(MI, MRI, B); 1401 case TargetOpcode::G_UDIV: 1402 case TargetOpcode::G_UREM: 1403 return legalizeUDIV_UREM(MI, MRI, B); 1404 case TargetOpcode::G_SDIV: 1405 case TargetOpcode::G_SREM: 1406 return legalizeSDIV_SREM(MI, MRI, B); 1407 case TargetOpcode::G_ATOMIC_CMPXCHG: 1408 return legalizeAtomicCmpXChg(MI, MRI, B); 1409 case TargetOpcode::G_FLOG: 1410 return legalizeFlog(MI, B, 1.0f / numbers::log2ef); 1411 case TargetOpcode::G_FLOG10: 1412 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1413 case TargetOpcode::G_FEXP: 1414 return legalizeFExp(MI, B); 1415 case TargetOpcode::G_FPOW: 1416 return legalizeFPow(MI, B); 1417 case TargetOpcode::G_FFLOOR: 1418 return legalizeFFloor(MI, MRI, B); 1419 case TargetOpcode::G_BUILD_VECTOR: 1420 return legalizeBuildVector(MI, MRI, B); 1421 default: 1422 return false; 1423 } 1424 1425 llvm_unreachable("expected switch to return"); 1426 } 1427 1428 Register 
AMDGPULegalizerInfo::getSegmentAperture( 1429 unsigned AS, 1430 MachineRegisterInfo &MRI, 1431 MachineIRBuilder &B) const { 1432 MachineFunction &MF = B.getMF(); 1433 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1434 const LLT S32 = LLT::scalar(32); 1435 1436 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1437 1438 if (ST.hasApertureRegs()) { 1439 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1440 // getreg. 1441 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1442 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1443 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1444 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1445 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1446 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1447 unsigned Encoding = 1448 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1449 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1450 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1451 1452 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1453 1454 B.buildInstr(AMDGPU::S_GETREG_B32) 1455 .addDef(GetReg) 1456 .addImm(Encoding); 1457 MRI.setType(GetReg, S32); 1458 1459 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1460 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1461 } 1462 1463 Register QueuePtr = MRI.createGenericVirtualRegister( 1464 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1465 1466 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1467 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1468 return Register(); 1469 1470 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1471 // private_segment_aperture_base_hi. 1472 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1473 1474 // TODO: can we be smarter about machine pointer info? 1475 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1476 MachineMemOperand *MMO = MF.getMachineMemOperand( 1477 PtrInfo, 1478 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1479 MachineMemOperand::MOInvariant, 1480 4, commonAlignment(Align(64), StructOffset)); 1481 1482 Register LoadAddr; 1483 1484 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1485 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1486 } 1487 1488 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1489 MachineInstr &MI, MachineRegisterInfo &MRI, 1490 MachineIRBuilder &B) const { 1491 MachineFunction &MF = B.getMF(); 1492 1493 B.setInstr(MI); 1494 1495 const LLT S32 = LLT::scalar(32); 1496 Register Dst = MI.getOperand(0).getReg(); 1497 Register Src = MI.getOperand(1).getReg(); 1498 1499 LLT DstTy = MRI.getType(Dst); 1500 LLT SrcTy = MRI.getType(Src); 1501 unsigned DestAS = DstTy.getAddressSpace(); 1502 unsigned SrcAS = SrcTy.getAddressSpace(); 1503 1504 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1505 // vector element. 1506 assert(!DstTy.isVector()); 1507 1508 const AMDGPUTargetMachine &TM 1509 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1510 1511 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1512 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1513 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1514 return true; 1515 } 1516 1517 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1518 // Truncate. 
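    // A cast to the 32-bit constant address space keeps only the low 32 bits
    // of the source pointer.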
1519 B.buildExtract(Dst, Src, 0); 1520 MI.eraseFromParent(); 1521 return true; 1522 } 1523 1524 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1525 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1526 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1527 1528 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1529 // another. Merge operands are required to be the same type, but creating an 1530 // extra ptrtoint would be kind of pointless. 1531 auto HighAddr = B.buildConstant( 1532 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1533 B.buildMerge(Dst, {Src, HighAddr}); 1534 MI.eraseFromParent(); 1535 return true; 1536 } 1537 1538 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1539 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1540 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1541 unsigned NullVal = TM.getNullPointerValue(DestAS); 1542 1543 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1544 auto FlatNull = B.buildConstant(SrcTy, 0); 1545 1546 // Extract low 32-bits of the pointer. 1547 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1548 1549 auto CmpRes = 1550 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1551 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1552 1553 MI.eraseFromParent(); 1554 return true; 1555 } 1556 1557 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1558 return false; 1559 1560 if (!ST.hasFlatAddressSpace()) 1561 return false; 1562 1563 auto SegmentNull = 1564 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1565 auto FlatNull = 1566 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1567 1568 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1569 if (!ApertureReg.isValid()) 1570 return false; 1571 1572 auto CmpRes = 1573 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1574 1575 // Coerce the type of the low half of the result so we can use merge_values. 1576 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1577 1578 // TODO: Should we allow mismatched types but matching sizes in merges to 1579 // avoid the ptrtoint? 1580 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1581 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1582 1583 MI.eraseFromParent(); 1584 return true; 1585 } 1586 1587 bool AMDGPULegalizerInfo::legalizeFrint( 1588 MachineInstr &MI, MachineRegisterInfo &MRI, 1589 MachineIRBuilder &B) const { 1590 B.setInstr(MI); 1591 1592 Register Src = MI.getOperand(1).getReg(); 1593 LLT Ty = MRI.getType(Src); 1594 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1595 1596 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1597 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1598 1599 auto C1 = B.buildFConstant(Ty, C1Val); 1600 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1601 1602 // TODO: Should this propagate fast-math-flags? 
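  // Adding and then subtracting 2^52 (copysigned from the source) rounds the
  // value to an integer; sources too large for the trick are already integral
  // and are passed through by the final select.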
1603 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1604 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1605 1606 auto C2 = B.buildFConstant(Ty, C2Val); 1607 auto Fabs = B.buildFAbs(Ty, Src); 1608 1609 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1610 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1611 return true; 1612 } 1613 1614 bool AMDGPULegalizerInfo::legalizeFceil( 1615 MachineInstr &MI, MachineRegisterInfo &MRI, 1616 MachineIRBuilder &B) const { 1617 B.setInstr(MI); 1618 1619 const LLT S1 = LLT::scalar(1); 1620 const LLT S64 = LLT::scalar(64); 1621 1622 Register Src = MI.getOperand(1).getReg(); 1623 assert(MRI.getType(Src) == S64); 1624 1625 // result = trunc(src) 1626 // if (src > 0.0 && src != result) 1627 // result += 1.0 1628 1629 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1630 1631 const auto Zero = B.buildFConstant(S64, 0.0); 1632 const auto One = B.buildFConstant(S64, 1.0); 1633 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1634 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1635 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1636 auto Add = B.buildSelect(S64, And, One, Zero); 1637 1638 // TODO: Should this propagate fast-math-flags? 1639 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1640 return true; 1641 } 1642 1643 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1644 MachineIRBuilder &B) { 1645 const unsigned FractBits = 52; 1646 const unsigned ExpBits = 11; 1647 LLT S32 = LLT::scalar(32); 1648 1649 auto Const0 = B.buildConstant(S32, FractBits - 32); 1650 auto Const1 = B.buildConstant(S32, ExpBits); 1651 1652 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1653 .addUse(Const0.getReg(0)) 1654 .addUse(Const1.getReg(0)); 1655 1656 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1657 } 1658 1659 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1660 MachineInstr &MI, MachineRegisterInfo &MRI, 1661 MachineIRBuilder &B) const { 1662 B.setInstr(MI); 1663 1664 const LLT S1 = LLT::scalar(1); 1665 const LLT S32 = LLT::scalar(32); 1666 const LLT S64 = LLT::scalar(64); 1667 1668 Register Src = MI.getOperand(1).getReg(); 1669 assert(MRI.getType(Src) == S64); 1670 1671 // TODO: Should this use extract since the low half is unused? 1672 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1673 Register Hi = Unmerge.getReg(1); 1674 1675 // Extract the upper half, since this is where we will find the sign and 1676 // exponent. 1677 auto Exp = extractF64Exponent(Hi, B); 1678 1679 const unsigned FractBits = 52; 1680 1681 // Extract the sign bit. 1682 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1683 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1684 1685 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1686 1687 const auto Zero32 = B.buildConstant(S32, 0); 1688 1689 // Extend back to 64-bits. 
1690 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1691 1692 auto Shr = B.buildAShr(S64, FractMask, Exp); 1693 auto Not = B.buildNot(S64, Shr); 1694 auto Tmp0 = B.buildAnd(S64, Src, Not); 1695 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1696 1697 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1698 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1699 1700 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1701 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1702 return true; 1703 } 1704 1705 bool AMDGPULegalizerInfo::legalizeITOFP( 1706 MachineInstr &MI, MachineRegisterInfo &MRI, 1707 MachineIRBuilder &B, bool Signed) const { 1708 B.setInstr(MI); 1709 1710 Register Dst = MI.getOperand(0).getReg(); 1711 Register Src = MI.getOperand(1).getReg(); 1712 1713 const LLT S64 = LLT::scalar(64); 1714 const LLT S32 = LLT::scalar(32); 1715 1716 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1717 1718 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1719 1720 auto CvtHi = Signed ? 1721 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1722 B.buildUITOFP(S64, Unmerge.getReg(1)); 1723 1724 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1725 1726 auto ThirtyTwo = B.buildConstant(S32, 32); 1727 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1728 .addUse(CvtHi.getReg(0)) 1729 .addUse(ThirtyTwo.getReg(0)); 1730 1731 // TODO: Should this propagate fast-math-flags? 1732 B.buildFAdd(Dst, LdExp, CvtLo); 1733 MI.eraseFromParent(); 1734 return true; 1735 } 1736 1737 // TODO: Copied from DAG implementation. Verify logic and document how this 1738 // actually works. 1739 bool AMDGPULegalizerInfo::legalizeFPTOI( 1740 MachineInstr &MI, MachineRegisterInfo &MRI, 1741 MachineIRBuilder &B, bool Signed) const { 1742 B.setInstr(MI); 1743 1744 Register Dst = MI.getOperand(0).getReg(); 1745 Register Src = MI.getOperand(1).getReg(); 1746 1747 const LLT S64 = LLT::scalar(64); 1748 const LLT S32 = LLT::scalar(32); 1749 1750 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1751 1752 unsigned Flags = MI.getFlags(); 1753 1754 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1755 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1756 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1757 1758 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1759 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1760 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1761 1762 auto Hi = Signed ? 
1763 B.buildFPTOSI(S32, FloorMul) : 1764 B.buildFPTOUI(S32, FloorMul); 1765 auto Lo = B.buildFPTOUI(S32, Fma); 1766 1767 B.buildMerge(Dst, { Lo, Hi }); 1768 MI.eraseFromParent(); 1769 1770 return true; 1771 } 1772 1773 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1774 MachineInstr &MI, MachineRegisterInfo &MRI, 1775 MachineIRBuilder &B) const { 1776 MachineFunction &MF = B.getMF(); 1777 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1778 1779 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1780 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1781 1782 // With ieee_mode disabled, the instructions have the correct behavior 1783 // already for G_FMINNUM/G_FMAXNUM 1784 if (!MFI->getMode().IEEE) 1785 return !IsIEEEOp; 1786 1787 if (IsIEEEOp) 1788 return true; 1789 1790 MachineIRBuilder HelperBuilder(MI); 1791 GISelObserverWrapper DummyObserver; 1792 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1793 HelperBuilder.setInstr(MI); 1794 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1795 } 1796 1797 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1798 MachineInstr &MI, MachineRegisterInfo &MRI, 1799 MachineIRBuilder &B) const { 1800 // TODO: Should move some of this into LegalizerHelper. 1801 1802 // TODO: Promote dynamic indexing of s16 to s32 1803 1804 // FIXME: Artifact combiner probably should have replaced the truncated 1805 // constant before this, so we shouldn't need 1806 // getConstantVRegValWithLookThrough. 1807 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1808 MI.getOperand(2).getReg(), MRI); 1809 if (!IdxVal) // Dynamic case will be selected to register indexing. 1810 return true; 1811 1812 Register Dst = MI.getOperand(0).getReg(); 1813 Register Vec = MI.getOperand(1).getReg(); 1814 1815 LLT VecTy = MRI.getType(Vec); 1816 LLT EltTy = VecTy.getElementType(); 1817 assert(EltTy == MRI.getType(Dst)); 1818 1819 B.setInstr(MI); 1820 1821 if (IdxVal->Value < VecTy.getNumElements()) 1822 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1823 else 1824 B.buildUndef(Dst); 1825 1826 MI.eraseFromParent(); 1827 return true; 1828 } 1829 1830 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1831 MachineInstr &MI, MachineRegisterInfo &MRI, 1832 MachineIRBuilder &B) const { 1833 // TODO: Should move some of this into LegalizerHelper. 1834 1835 // TODO: Promote dynamic indexing of s16 to s32 1836 1837 // FIXME: Artifact combiner probably should have replaced the truncated 1838 // constant before this, so we shouldn't need 1839 // getConstantVRegValWithLookThrough. 1840 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1841 MI.getOperand(3).getReg(), MRI); 1842 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1843 return true; 1844 1845 Register Dst = MI.getOperand(0).getReg(); 1846 Register Vec = MI.getOperand(1).getReg(); 1847 Register Ins = MI.getOperand(2).getReg(); 1848 1849 LLT VecTy = MRI.getType(Vec); 1850 LLT EltTy = VecTy.getElementType(); 1851 assert(EltTy == MRI.getType(Ins)); 1852 1853 B.setInstr(MI); 1854 1855 if (IdxVal->Value < VecTy.getNumElements()) 1856 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1857 else 1858 B.buildUndef(Dst); 1859 1860 MI.eraseFromParent(); 1861 return true; 1862 } 1863 1864 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1865 MachineInstr &MI, MachineRegisterInfo &MRI, 1866 MachineIRBuilder &B) const { 1867 const LLT V2S16 = LLT::vector(2, 16); 1868 1869 Register Dst = MI.getOperand(0).getReg(); 1870 Register Src0 = MI.getOperand(1).getReg(); 1871 LLT DstTy = MRI.getType(Dst); 1872 LLT SrcTy = MRI.getType(Src0); 1873 1874 if (SrcTy == V2S16 && DstTy == V2S16 && 1875 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1876 return true; 1877 1878 MachineIRBuilder HelperBuilder(MI); 1879 GISelObserverWrapper DummyObserver; 1880 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1881 HelperBuilder.setInstr(MI); 1882 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1883 } 1884 1885 bool AMDGPULegalizerInfo::legalizeSinCos( 1886 MachineInstr &MI, MachineRegisterInfo &MRI, 1887 MachineIRBuilder &B) const { 1888 B.setInstr(MI); 1889 1890 Register DstReg = MI.getOperand(0).getReg(); 1891 Register SrcReg = MI.getOperand(1).getReg(); 1892 LLT Ty = MRI.getType(DstReg); 1893 unsigned Flags = MI.getFlags(); 1894 1895 Register TrigVal; 1896 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1897 if (ST.hasTrigReducedRange()) { 1898 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1899 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1900 .addUse(MulVal.getReg(0)) 1901 .setMIFlags(Flags).getReg(0); 1902 } else 1903 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1904 1905 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1906 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1907 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1908 .addUse(TrigVal) 1909 .setMIFlags(Flags); 1910 MI.eraseFromParent(); 1911 return true; 1912 } 1913 1914 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1915 Register DstReg, LLT PtrTy, 1916 MachineIRBuilder &B, const GlobalValue *GV, 1917 unsigned Offset, unsigned GAFlags) const { 1918 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1919 // to the following code sequence: 1920 // 1921 // For constant address space: 1922 // s_getpc_b64 s[0:1] 1923 // s_add_u32 s0, s0, $symbol 1924 // s_addc_u32 s1, s1, 0 1925 // 1926 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1927 // a fixup or relocation is emitted to replace $symbol with a literal 1928 // constant, which is a pc-relative offset from the encoding of the $symbol 1929 // operand to the global variable. 
1930 // 1931 // For global address space: 1932 // s_getpc_b64 s[0:1] 1933 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1934 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1935 // 1936 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1937 // fixups or relocations are emitted to replace $symbol@*@lo and 1938 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1939 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1940 // operand to the global variable. 1941 // 1942 // What we want here is an offset from the value returned by s_getpc 1943 // (which is the address of the s_add_u32 instruction) to the global 1944 // variable, but since the encoding of $symbol starts 4 bytes after the start 1945 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1946 // small. This requires us to add 4 to the global variable offset in order to 1947 // compute the correct address. 1948 1949 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1950 1951 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1952 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1953 1954 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1955 .addDef(PCReg); 1956 1957 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1958 if (GAFlags == SIInstrInfo::MO_NONE) 1959 MIB.addImm(0); 1960 else 1961 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1962 1963 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1964 1965 if (PtrTy.getSizeInBits() == 32) 1966 B.buildExtract(DstReg, PCReg, 0); 1967 return true; 1968 } 1969 1970 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1971 MachineInstr &MI, MachineRegisterInfo &MRI, 1972 MachineIRBuilder &B) const { 1973 Register DstReg = MI.getOperand(0).getReg(); 1974 LLT Ty = MRI.getType(DstReg); 1975 unsigned AS = Ty.getAddressSpace(); 1976 1977 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1978 MachineFunction &MF = B.getMF(); 1979 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1980 B.setInstr(MI); 1981 1982 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1983 if (!MFI->isEntryFunction()) { 1984 const Function &Fn = MF.getFunction(); 1985 DiagnosticInfoUnsupported BadLDSDecl( 1986 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 1987 DS_Warning); 1988 Fn.getContext().diagnose(BadLDSDecl); 1989 1990 // We currently don't have a way to correctly allocate LDS objects that 1991 // aren't directly associated with a kernel. We do force inlining of 1992 // functions that use local objects. However, if these dead functions are 1993 // not eliminated, we don't want a compile time error. Just emit a warning 1994 // and a trap, since there should be no callable path here. 1995 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 1996 B.buildUndef(DstReg); 1997 MI.eraseFromParent(); 1998 return true; 1999 } 2000 2001 // TODO: We could emit code to handle the initialization somewhere. 
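    // Until then, only LDS globals without a meaningful initializer are
    // handled here: they are either left in place as an absolute-addressed
    // G_GLOBAL_VALUE or replaced with a G_CONSTANT holding the object's byte
    // offset within the kernel's LDS allocation. Anything with a real
    // initializer is diagnosed below.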
2002 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2003 const SITargetLowering *TLI = ST.getTargetLowering(); 2004 if (!TLI->shouldUseLDSConstAddress(GV)) { 2005 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2006 return true; // Leave in place; 2007 } 2008 2009 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 2010 MI.eraseFromParent(); 2011 return true; 2012 } 2013 2014 const Function &Fn = MF.getFunction(); 2015 DiagnosticInfoUnsupported BadInit( 2016 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2017 Fn.getContext().diagnose(BadInit); 2018 return true; 2019 } 2020 2021 const SITargetLowering *TLI = ST.getTargetLowering(); 2022 2023 if (TLI->shouldEmitFixup(GV)) { 2024 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2025 MI.eraseFromParent(); 2026 return true; 2027 } 2028 2029 if (TLI->shouldEmitPCReloc(GV)) { 2030 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2031 MI.eraseFromParent(); 2032 return true; 2033 } 2034 2035 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2036 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2037 2038 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2039 MachinePointerInfo::getGOT(MF), 2040 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2041 MachineMemOperand::MOInvariant, 2042 8 /*Size*/, Align(8)); 2043 2044 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2045 2046 if (Ty.getSizeInBits() == 32) { 2047 // Truncate if this is a 32-bit constant adrdess. 2048 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2049 B.buildExtract(DstReg, Load, 0); 2050 } else 2051 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2052 2053 MI.eraseFromParent(); 2054 return true; 2055 } 2056 2057 bool AMDGPULegalizerInfo::legalizeLoad( 2058 MachineInstr &MI, MachineRegisterInfo &MRI, 2059 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2060 B.setInstr(MI); 2061 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2062 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2063 Observer.changingInstr(MI); 2064 MI.getOperand(1).setReg(Cast.getReg(0)); 2065 Observer.changedInstr(MI); 2066 return true; 2067 } 2068 2069 bool AMDGPULegalizerInfo::legalizeFMad( 2070 MachineInstr &MI, MachineRegisterInfo &MRI, 2071 MachineIRBuilder &B) const { 2072 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2073 assert(Ty.isScalar()); 2074 2075 MachineFunction &MF = B.getMF(); 2076 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2077 2078 // TODO: Always legal with future ftz flag. 2079 // FIXME: Do we need just output? 
2080 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2081 return true; 2082 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2083 return true; 2084 2085 MachineIRBuilder HelperBuilder(MI); 2086 GISelObserverWrapper DummyObserver; 2087 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2088 HelperBuilder.setInstr(MI); 2089 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2090 } 2091 2092 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2093 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2094 Register DstReg = MI.getOperand(0).getReg(); 2095 Register PtrReg = MI.getOperand(1).getReg(); 2096 Register CmpVal = MI.getOperand(2).getReg(); 2097 Register NewVal = MI.getOperand(3).getReg(); 2098 2099 assert(SITargetLowering::isFlatGlobalAddrSpace( 2100 MRI.getType(PtrReg).getAddressSpace()) && 2101 "this should not have been custom lowered"); 2102 2103 LLT ValTy = MRI.getType(CmpVal); 2104 LLT VecTy = LLT::vector(2, ValTy); 2105 2106 B.setInstr(MI); 2107 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2108 2109 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2110 .addDef(DstReg) 2111 .addUse(PtrReg) 2112 .addUse(PackedVal) 2113 .setMemRefs(MI.memoperands()); 2114 2115 MI.eraseFromParent(); 2116 return true; 2117 } 2118 2119 bool AMDGPULegalizerInfo::legalizeFlog( 2120 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2121 Register Dst = MI.getOperand(0).getReg(); 2122 Register Src = MI.getOperand(1).getReg(); 2123 LLT Ty = B.getMRI()->getType(Dst); 2124 unsigned Flags = MI.getFlags(); 2125 B.setInstr(MI); 2126 2127 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2128 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2129 2130 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2131 MI.eraseFromParent(); 2132 return true; 2133 } 2134 2135 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2136 MachineIRBuilder &B) const { 2137 Register Dst = MI.getOperand(0).getReg(); 2138 Register Src = MI.getOperand(1).getReg(); 2139 unsigned Flags = MI.getFlags(); 2140 LLT Ty = B.getMRI()->getType(Dst); 2141 B.setInstr(MI); 2142 2143 auto K = B.buildFConstant(Ty, numbers::log2e); 2144 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2145 B.buildFExp2(Dst, Mul, Flags); 2146 MI.eraseFromParent(); 2147 return true; 2148 } 2149 2150 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2151 MachineIRBuilder &B) const { 2152 Register Dst = MI.getOperand(0).getReg(); 2153 Register Src0 = MI.getOperand(1).getReg(); 2154 Register Src1 = MI.getOperand(2).getReg(); 2155 unsigned Flags = MI.getFlags(); 2156 LLT Ty = B.getMRI()->getType(Dst); 2157 B.setInstr(MI); 2158 const LLT S16 = LLT::scalar(16); 2159 const LLT S32 = LLT::scalar(32); 2160 2161 if (Ty == S32) { 2162 auto Log = B.buildFLog2(S32, Src0, Flags); 2163 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2164 .addUse(Log.getReg(0)) 2165 .addUse(Src1) 2166 .setMIFlags(Flags); 2167 B.buildFExp2(Dst, Mul, Flags); 2168 } else if (Ty == S16) { 2169 // There's no f16 fmul_legacy, so we need to convert for it. 
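    // Both branches use the identity pow(x, y) = exp2(y * log2(x)); this f16
    // path simply widens the multiply to f32 so fmul_legacy can be used, then
    // truncates the product back to f16 before the exp2.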
2170 auto Log = B.buildFLog2(S16, Src0, Flags); 2171 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2172 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2173 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2174 .addUse(Ext0.getReg(0)) 2175 .addUse(Ext1.getReg(0)) 2176 .setMIFlags(Flags); 2177 2178 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2179 } else 2180 return false; 2181 2182 MI.eraseFromParent(); 2183 return true; 2184 } 2185 2186 // Find a source register, ignoring any possible source modifiers. 2187 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2188 Register ModSrc = OrigSrc; 2189 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2190 ModSrc = SrcFNeg->getOperand(1).getReg(); 2191 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2192 ModSrc = SrcFAbs->getOperand(1).getReg(); 2193 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2194 ModSrc = SrcFAbs->getOperand(1).getReg(); 2195 return ModSrc; 2196 } 2197 2198 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2199 MachineRegisterInfo &MRI, 2200 MachineIRBuilder &B) const { 2201 B.setInstr(MI); 2202 2203 const LLT S1 = LLT::scalar(1); 2204 const LLT S64 = LLT::scalar(64); 2205 Register Dst = MI.getOperand(0).getReg(); 2206 Register OrigSrc = MI.getOperand(1).getReg(); 2207 unsigned Flags = MI.getFlags(); 2208 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2209 "this should not have been custom lowered"); 2210 2211 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2212 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2213 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2214 // V_FRACT bug is: 2215 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2216 // 2217 // Convert floor(x) to (x - fract(x)) 2218 2219 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2220 .addUse(OrigSrc) 2221 .setMIFlags(Flags); 2222 2223 // Give source modifier matching some assistance before obscuring a foldable 2224 // pattern. 2225 2226 // TODO: We can avoid the neg on the fract? The input sign to fract 2227 // shouldn't matter? 2228 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2229 2230 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2231 2232 Register Min = MRI.createGenericVirtualRegister(S64); 2233 2234 // We don't need to concern ourselves with the snan handling difference, so 2235 // use the one which will directly select. 2236 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2237 if (MFI->getMode().IEEE) 2238 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2239 else 2240 B.buildFMinNum(Min, Fract, Const, Flags); 2241 2242 Register CorrectedFract = Min; 2243 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2244 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2245 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2246 } 2247 2248 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2249 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2250 2251 MI.eraseFromParent(); 2252 return true; 2253 } 2254 2255 // Turn an illegal packed v2s16 build vector into bit operations. 2256 // TODO: This should probably be a bitcast action in LegalizerHelper. 
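// For example (illustrative MIR, register names invented):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %m:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)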
2257 bool AMDGPULegalizerInfo::legalizeBuildVector( 2258 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2259 Register Dst = MI.getOperand(0).getReg(); 2260 const LLT S32 = LLT::scalar(32); 2261 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2262 2263 Register Src0 = MI.getOperand(1).getReg(); 2264 Register Src1 = MI.getOperand(2).getReg(); 2265 assert(MRI.getType(Src0) == LLT::scalar(16)); 2266 2267 B.setInstr(MI); 2268 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2269 B.buildBitcast(Dst, Merge); 2270 2271 MI.eraseFromParent(); 2272 return true; 2273 } 2274 2275 // Return the use branch instruction, otherwise null if the usage is invalid. 2276 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2277 MachineRegisterInfo &MRI, 2278 MachineInstr *&Br) { 2279 Register CondDef = MI.getOperand(0).getReg(); 2280 if (!MRI.hasOneNonDBGUse(CondDef)) 2281 return nullptr; 2282 2283 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2284 if (UseMI.getParent() != MI.getParent() || 2285 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2286 return nullptr; 2287 2288 // Make sure the cond br is followed by a G_BR 2289 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2290 if (Next != MI.getParent()->end()) { 2291 if (Next->getOpcode() != AMDGPU::G_BR) 2292 return nullptr; 2293 Br = &*Next; 2294 } 2295 2296 return &UseMI; 2297 } 2298 2299 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2300 MachineRegisterInfo &MRI, 2301 Register LiveIn, 2302 Register PhyReg) const { 2303 assert(PhyReg.isPhysical() && "Physical register expected"); 2304 2305 // Insert the live-in copy, if required, by defining destination virtual 2306 // register. 2307 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 2308 if (!MRI.getVRegDef(LiveIn)) { 2309 // FIXME: Should have scoped insert pt 2310 MachineBasicBlock &OrigInsBB = B.getMBB(); 2311 auto OrigInsPt = B.getInsertPt(); 2312 2313 MachineBasicBlock &EntryMBB = B.getMF().front(); 2314 EntryMBB.addLiveIn(PhyReg); 2315 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2316 B.buildCopy(LiveIn, PhyReg); 2317 2318 B.setInsertPt(OrigInsBB, OrigInsPt); 2319 } 2320 2321 return LiveIn; 2322 } 2323 2324 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, 2325 MachineRegisterInfo &MRI, 2326 Register PhyReg, LLT Ty, 2327 bool InsertLiveInCopy) const { 2328 assert(PhyReg.isPhysical() && "Physical register expected"); 2329 2330 // Get or create virtual live-in regester 2331 Register LiveIn = MRI.getLiveInVirtReg(PhyReg); 2332 if (!LiveIn) { 2333 LiveIn = MRI.createGenericVirtualRegister(Ty); 2334 MRI.addLiveIn(PhyReg, LiveIn); 2335 } 2336 2337 // When the actual true copy required is from virtual register to physical 2338 // register (to be inserted later), live-in copy insertion from physical 2339 // to register virtual register is not required 2340 if (!InsertLiveInCopy) 2341 return LiveIn; 2342 2343 return insertLiveInCopy(B, MRI, LiveIn, PhyReg); 2344 } 2345 2346 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor( 2347 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2348 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2349 const ArgDescriptor *Arg; 2350 const TargetRegisterClass *RC; 2351 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2352 if (!Arg) { 2353 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2354 return nullptr; 2355 } 2356 return Arg; 2357 } 2358 2359 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, 
MachineIRBuilder &B, 2360 const ArgDescriptor *Arg) const { 2361 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2362 return false; // TODO: Handle these 2363 2364 Register SrcReg = Arg->getRegister(); 2365 assert(SrcReg.isPhysical() && "Physical register expected"); 2366 assert(DstReg.isVirtual() && "Virtual register expected"); 2367 2368 MachineRegisterInfo &MRI = *B.getMRI(); 2369 2370 LLT Ty = MRI.getType(DstReg); 2371 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty); 2372 2373 if (Arg->isMasked()) { 2374 // TODO: Should we try to emit this once in the entry block? 2375 const LLT S32 = LLT::scalar(32); 2376 const unsigned Mask = Arg->getMask(); 2377 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2378 2379 Register AndMaskSrc = LiveIn; 2380 2381 if (Shift != 0) { 2382 auto ShiftAmt = B.buildConstant(S32, Shift); 2383 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2384 } 2385 2386 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2387 } else { 2388 B.buildCopy(DstReg, LiveIn); 2389 } 2390 2391 return true; 2392 } 2393 2394 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2395 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2396 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2397 B.setInstr(MI); 2398 2399 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2400 if (!Arg) 2401 return false; 2402 2403 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2404 return false; 2405 2406 MI.eraseFromParent(); 2407 return true; 2408 } 2409 2410 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2411 MachineRegisterInfo &MRI, 2412 MachineIRBuilder &B) const { 2413 B.setInstr(MI); 2414 Register Dst = MI.getOperand(0).getReg(); 2415 LLT DstTy = MRI.getType(Dst); 2416 LLT S16 = LLT::scalar(16); 2417 LLT S32 = LLT::scalar(32); 2418 LLT S64 = LLT::scalar(64); 2419 2420 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2421 return true; 2422 2423 if (DstTy == S16) 2424 return legalizeFDIV16(MI, MRI, B); 2425 if (DstTy == S32) 2426 return legalizeFDIV32(MI, MRI, B); 2427 if (DstTy == S64) 2428 return legalizeFDIV64(MI, MRI, B); 2429 2430 return false; 2431 } 2432 2433 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2434 const LLT S32 = LLT::scalar(32); 2435 2436 auto Cvt0 = B.buildUITOFP(S32, Src); 2437 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2438 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2439 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2440 return B.buildFPTOUI(S32, Mul).getReg(0); 2441 } 2442 2443 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2444 Register DstReg, 2445 Register Num, 2446 Register Den, 2447 bool IsRem) const { 2448 const LLT S1 = LLT::scalar(1); 2449 const LLT S32 = LLT::scalar(32); 2450 2451 // RCP = URECIP(Den) = 2^32 / Den + e 2452 // e is rounding error. 2453 auto RCP = buildDivRCP(B, Den); 2454 2455 // RCP_LO = mul(RCP, Den) 2456 auto RCP_LO = B.buildMul(S32, RCP, Den); 2457 2458 // RCP_HI = mulhu (RCP, Den) */ 2459 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2460 2461 // NEG_RCP_LO = -RCP_LO 2462 auto Zero = B.buildConstant(S32, 0); 2463 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2464 2465 // ABS_RCP_LO = (RCP_HI == 0 ? 
NEG_RCP_LO : RCP_LO)
2466   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2467   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2468
2469   // Calculate the rounding error from the URECIP instruction
2470   // E = mulhu(ABS_RCP_LO, RCP)
2471   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2472
2473   // RCP_A_E = RCP + E
2474   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2475
2476   // RCP_S_E = RCP - E
2477   auto RCP_S_E = B.buildSub(S32, RCP, E);
2478
2479   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2480   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2481
2482   // Quotient = mulhu(Tmp0, Num)
2483   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2484
2485   // Num_S_Remainder = Quotient * Den
2486   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2487
2488   // Remainder = Num - Num_S_Remainder
2489   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2490
2491   // Remainder_GE_Den = Remainder >= Den
2492   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2493
2494   // Remainder_GE_Zero = Num >= Num_S_Remainder
2495   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2496                                        Num, Num_S_Remainder);
2497
2498   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2499   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2500
2501   // Calculate Division result:
2502
2503   // Quotient_A_One = Quotient + 1
2504   auto One = B.buildConstant(S32, 1);
2505   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2506
2507   // Quotient_S_One = Quotient - 1
2508   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2509
2510   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2511   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2512
2513   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2514   if (IsRem) {
2515     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2516
2517     // Calculate Rem result:
2518     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2519
2520     // Remainder_A_Den = Remainder + Den
2521     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2522
2523     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2524     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2525
2526     // Rem = (Remainder_GE_Zero ?
Rem : Remainder_A_Den) 2527 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den); 2528 } else { 2529 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One); 2530 } 2531 } 2532 2533 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2534 MachineRegisterInfo &MRI, 2535 MachineIRBuilder &B) const { 2536 B.setInstr(MI); 2537 const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM; 2538 Register DstReg = MI.getOperand(0).getReg(); 2539 Register Num = MI.getOperand(1).getReg(); 2540 Register Den = MI.getOperand(2).getReg(); 2541 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem); 2542 MI.eraseFromParent(); 2543 return true; 2544 } 2545 2546 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2547 // 2548 // Return lo, hi of result 2549 // 2550 // %cvt.lo = G_UITOFP Val.lo 2551 // %cvt.hi = G_UITOFP Val.hi 2552 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2553 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2554 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2555 // %mul2 = G_FMUL %mul1, 2**(-32) 2556 // %trunc = G_INTRINSIC_TRUNC %mul2 2557 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2558 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2559 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2560 Register Val) { 2561 const LLT S32 = LLT::scalar(32); 2562 auto Unmerge = B.buildUnmerge(S32, Val); 2563 2564 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2565 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2566 2567 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2568 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2569 2570 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2571 auto Mul1 = 2572 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2573 2574 // 2**(-32) 2575 auto Mul2 = 2576 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2577 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2578 2579 // -(2**32) 2580 auto Mad2 = B.buildFMAD(S32, Trunc, 2581 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2582 2583 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2584 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2585 2586 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2587 } 2588 2589 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI, 2590 MachineRegisterInfo &MRI, 2591 MachineIRBuilder &B) const { 2592 B.setInstr(MI); 2593 2594 const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV; 2595 const LLT S32 = LLT::scalar(32); 2596 const LLT S64 = LLT::scalar(64); 2597 const LLT S1 = LLT::scalar(1); 2598 Register Numer = MI.getOperand(1).getReg(); 2599 Register Denom = MI.getOperand(2).getReg(); 2600 Register RcpLo, RcpHi; 2601 2602 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2603 2604 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2605 2606 auto Zero64 = B.buildConstant(S64, 0); 2607 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2608 2609 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2610 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2611 2612 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2613 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2614 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2615 2616 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2617 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2618 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2619 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2620 2621 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2622 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2623 auto UnmergeMulHi2 = 
B.buildUnmerge(S32, MulHi2); 2624 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2625 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2626 2627 auto Zero32 = B.buildConstant(S32, 0); 2628 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2629 auto Add2_HiC = 2630 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2631 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2632 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2633 2634 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2635 Register NumerLo = UnmergeNumer.getReg(0); 2636 Register NumerHi = UnmergeNumer.getReg(1); 2637 2638 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2639 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2640 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2641 Register Mul3_Lo = UnmergeMul3.getReg(0); 2642 Register Mul3_Hi = UnmergeMul3.getReg(1); 2643 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2644 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2645 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2646 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2647 2648 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2649 Register DenomLo = UnmergeDenom.getReg(0); 2650 Register DenomHi = UnmergeDenom.getReg(1); 2651 2652 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2653 auto C1 = B.buildSExt(S32, CmpHi); 2654 2655 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2656 auto C2 = B.buildSExt(S32, CmpLo); 2657 2658 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2659 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2660 2661 // TODO: Here and below portions of the code can be enclosed into if/endif. 2662 // Currently control flow is unconditional and we have 4 selects after 2663 // potential endif to substitute PHIs. 2664 2665 // if C3 != 0 ... 
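  // Both arms are computed unconditionally: Sub2/Add3 below are the values the
  // "if" body would produce (subtract the denominator once more / bump the
  // quotient by one), and the selects at the bottom pick between them and the
  // fall-through values in place of PHIs.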
2666   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2667   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2668   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2669   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2670
2671   auto One64 = B.buildConstant(S64, 1);
2672   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2673
2674   auto C4 =
2675       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2676   auto C5 =
2677       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2678   auto C6 = B.buildSelect(
2679       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2680
2681   // if (C6 != 0)
2682   auto Add4 = B.buildAdd(S64, Add3, One64);
2683   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2684
2685   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2686   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2687   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2688
2689   // endif C6
2690   // endif C3
2691
2692   if (IsDiv) {
2693     auto Sel1 = B.buildSelect(
2694         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2695     B.buildSelect(MI.getOperand(0),
2696                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2697   } else {
2698     auto Sel2 = B.buildSelect(
2699         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2700     B.buildSelect(MI.getOperand(0),
2701                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2702   }
2703
2704   MI.eraseFromParent();
2705   return true;
2706 }
2707
2708 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2709                                             MachineRegisterInfo &MRI,
2710                                             MachineIRBuilder &B) const {
2711   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2712   if (Ty == LLT::scalar(32))
2713     return legalizeUDIV_UREM32(MI, MRI, B);
2714   if (Ty == LLT::scalar(64))
2715     return legalizeUDIV_UREM64(MI, MRI, B);
2716   return false;
2717 }
2718
2719 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2720                                               MachineRegisterInfo &MRI,
2721                                               MachineIRBuilder &B) const {
2722   B.setInstr(MI);
2723   const LLT S32 = LLT::scalar(32);
2724
2725   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2726   Register DstReg = MI.getOperand(0).getReg();
2727   Register LHS = MI.getOperand(1).getReg();
2728   Register RHS = MI.getOperand(2).getReg();
2729
2730   auto ThirtyOne = B.buildConstant(S32, 31);
2731   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2732   auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2733
2734   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2735   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2736
2737   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2738   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2739
2740   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2741   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2742
2743   if (IsRem) {
2744     auto RSign = LHSign; // Remainder sign is the same as LHS
2745     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2746     B.buildSub(DstReg, UDivRem, RSign);
2747   } else {
2748     auto DSign = B.buildXor(S32, LHSign, RHSign);
2749     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2750     B.buildSub(DstReg, UDivRem, DSign);
2751   }
2752
2753   MI.eraseFromParent();
2754   return true;
2755 }
2756
2757 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2758                                             MachineRegisterInfo &MRI,
2759                                             MachineIRBuilder &B) const {
2760   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2761     return legalizeSDIV_SREM32(MI, MRI, B);
2762   return false;
2763 }
2764
2765 bool
AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2766 MachineRegisterInfo &MRI, 2767 MachineIRBuilder &B) const { 2768 Register Res = MI.getOperand(0).getReg(); 2769 Register LHS = MI.getOperand(1).getReg(); 2770 Register RHS = MI.getOperand(2).getReg(); 2771 2772 uint16_t Flags = MI.getFlags(); 2773 2774 LLT ResTy = MRI.getType(Res); 2775 LLT S32 = LLT::scalar(32); 2776 LLT S64 = LLT::scalar(64); 2777 2778 const MachineFunction &MF = B.getMF(); 2779 bool Unsafe = 2780 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2781 2782 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2783 return false; 2784 2785 if (!Unsafe && ResTy == S32 && 2786 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2787 return false; 2788 2789 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2790 // 1 / x -> RCP(x) 2791 if (CLHS->isExactlyValue(1.0)) { 2792 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2793 .addUse(RHS) 2794 .setMIFlags(Flags); 2795 2796 MI.eraseFromParent(); 2797 return true; 2798 } 2799 2800 // -1 / x -> RCP( FNEG(x) ) 2801 if (CLHS->isExactlyValue(-1.0)) { 2802 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2803 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2804 .addUse(FNeg.getReg(0)) 2805 .setMIFlags(Flags); 2806 2807 MI.eraseFromParent(); 2808 return true; 2809 } 2810 } 2811 2812 // x / y -> x * (1.0 / y) 2813 if (Unsafe) { 2814 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2815 .addUse(RHS) 2816 .setMIFlags(Flags); 2817 B.buildFMul(Res, LHS, RCP, Flags); 2818 2819 MI.eraseFromParent(); 2820 return true; 2821 } 2822 2823 return false; 2824 } 2825 2826 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2827 MachineRegisterInfo &MRI, 2828 MachineIRBuilder &B) const { 2829 B.setInstr(MI); 2830 Register Res = MI.getOperand(0).getReg(); 2831 Register LHS = MI.getOperand(1).getReg(); 2832 Register RHS = MI.getOperand(2).getReg(); 2833 2834 uint16_t Flags = MI.getFlags(); 2835 2836 LLT S16 = LLT::scalar(16); 2837 LLT S32 = LLT::scalar(32); 2838 2839 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2840 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2841 2842 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2843 .addUse(RHSExt.getReg(0)) 2844 .setMIFlags(Flags); 2845 2846 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2847 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2848 2849 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2850 .addUse(RDst.getReg(0)) 2851 .addUse(RHS) 2852 .addUse(LHS) 2853 .setMIFlags(Flags); 2854 2855 MI.eraseFromParent(); 2856 return true; 2857 } 2858 2859 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2860 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2861 static void toggleSPDenormMode(bool Enable, 2862 MachineIRBuilder &B, 2863 const GCNSubtarget &ST, 2864 AMDGPU::SIModeRegisterDefaults Mode) { 2865 // Set SP denorm mode to this value. 2866 unsigned SPDenormMode = 2867 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2868 2869 if (ST.hasDenormModeInst()) { 2870 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2871 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2872 2873 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2874 B.buildInstr(AMDGPU::S_DENORM_MODE) 2875 .addImm(NewDenormModeValue); 2876 2877 } else { 2878 // Select FP32 bit field in mode register. 
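    // The immediate encodes (hwreg id, bit offset, width - 1) for s_setreg: it
    // selects a 2-bit field starting at bit 4 of the MODE register, i.e. the
    // FP_DENORM bits that control single-precision denormal flushing.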
2879 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2880 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2881 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2882 2883 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2884 .addImm(SPDenormMode) 2885 .addImm(SPDenormModeBitField); 2886 } 2887 } 2888 2889 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2890 MachineRegisterInfo &MRI, 2891 MachineIRBuilder &B) const { 2892 B.setInstr(MI); 2893 Register Res = MI.getOperand(0).getReg(); 2894 Register LHS = MI.getOperand(1).getReg(); 2895 Register RHS = MI.getOperand(2).getReg(); 2896 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2897 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2898 2899 uint16_t Flags = MI.getFlags(); 2900 2901 LLT S32 = LLT::scalar(32); 2902 LLT S1 = LLT::scalar(1); 2903 2904 auto One = B.buildFConstant(S32, 1.0f); 2905 2906 auto DenominatorScaled = 2907 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2908 .addUse(LHS) 2909 .addUse(RHS) 2910 .addImm(0) 2911 .setMIFlags(Flags); 2912 auto NumeratorScaled = 2913 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2914 .addUse(LHS) 2915 .addUse(RHS) 2916 .addImm(1) 2917 .setMIFlags(Flags); 2918 2919 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2920 .addUse(DenominatorScaled.getReg(0)) 2921 .setMIFlags(Flags); 2922 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2923 2924 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2925 // aren't modeled as reading it. 2926 if (!Mode.allFP32Denormals()) 2927 toggleSPDenormMode(true, B, ST, Mode); 2928 2929 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2930 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2931 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2932 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2933 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2934 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2935 2936 if (!Mode.allFP32Denormals()) 2937 toggleSPDenormMode(false, B, ST, Mode); 2938 2939 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2940 .addUse(Fma4.getReg(0)) 2941 .addUse(Fma1.getReg(0)) 2942 .addUse(Fma3.getReg(0)) 2943 .addUse(NumeratorScaled.getReg(1)) 2944 .setMIFlags(Flags); 2945 2946 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2947 .addUse(Fmas.getReg(0)) 2948 .addUse(RHS) 2949 .addUse(LHS) 2950 .setMIFlags(Flags); 2951 2952 MI.eraseFromParent(); 2953 return true; 2954 } 2955 2956 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2957 MachineRegisterInfo &MRI, 2958 MachineIRBuilder &B) const { 2959 B.setInstr(MI); 2960 Register Res = MI.getOperand(0).getReg(); 2961 Register LHS = MI.getOperand(1).getReg(); 2962 Register RHS = MI.getOperand(2).getReg(); 2963 2964 uint16_t Flags = MI.getFlags(); 2965 2966 LLT S64 = LLT::scalar(64); 2967 LLT S1 = LLT::scalar(1); 2968 2969 auto One = B.buildFConstant(S64, 1.0); 2970 2971 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2972 .addUse(LHS) 2973 .addUse(RHS) 2974 .addImm(0) 2975 .setMIFlags(Flags); 2976 2977 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2978 2979 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2980 .addUse(DivScale0.getReg(0)) 2981 .setMIFlags(Flags); 2982 2983 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2984 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, 
Rcp, Flags);
2985   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2986
2987   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2988                        .addUse(LHS)
2989                        .addUse(RHS)
2990                        .addImm(1)
2991                        .setMIFlags(Flags);
2992
2993   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2994   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2995   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2996
2997   Register Scale;
2998   if (!ST.hasUsableDivScaleConditionOutput()) {
2999     // Workaround a hardware bug on SI where the condition output from div_scale
3000     // is not usable.
3001
3002     LLT S32 = LLT::scalar(32);
3003
3004     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3005     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3006     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3007     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3008
3009     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3010                               Scale1Unmerge.getReg(1));
3011     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3012                               Scale0Unmerge.getReg(1));
3013     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3014   } else {
3015     Scale = DivScale1.getReg(1);
3016   }
3017
3018   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3019                   .addUse(Fma4.getReg(0))
3020                   .addUse(Fma3.getReg(0))
3021                   .addUse(Mul.getReg(0))
3022                   .addUse(Scale)
3023                   .setMIFlags(Flags);
3024
3025   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3026       .addUse(Fmas.getReg(0))
3027       .addUse(RHS)
3028       .addUse(LHS)
3029       .setMIFlags(Flags);
3030
3031   MI.eraseFromParent();
3032   return true;
3033 }
3034
3035 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3036                                                  MachineRegisterInfo &MRI,
3037                                                  MachineIRBuilder &B) const {
3038   B.setInstr(MI);
3039   Register Res = MI.getOperand(0).getReg();
3040   Register LHS = MI.getOperand(2).getReg();
3041   Register RHS = MI.getOperand(3).getReg();
3042   uint16_t Flags = MI.getFlags();
3043
3044   LLT S32 = LLT::scalar(32);
3045   LLT S1 = LLT::scalar(1);
3046
3047   auto Abs = B.buildFAbs(S32, RHS, Flags);
3048   const APFloat C0Val(1.0f);
3049
3050   auto C0 = B.buildConstant(S32, 0x6f800000);
3051   auto C1 = B.buildConstant(S32, 0x2f800000);
3052   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3053
3054   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3055   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3056
3057   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3058
3059   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3060                  .addUse(Mul0.getReg(0))
3061                  .setMIFlags(Flags);
3062
3063   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3064
3065   B.buildFMul(Res, Sel, Mul1, Flags);
3066
3067   MI.eraseFromParent();
3068   return true;
3069 }
3070
3071 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3072                                                  MachineRegisterInfo &MRI,
3073                                                  MachineIRBuilder &B) const {
3074   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3075   if (!MFI->isEntryFunction()) {
3076     return legalizePreloadedArgIntrin(MI, MRI, B,
3077                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3078   }
3079
3080   B.setInstr(MI);
3081
3082   uint64_t Offset =
3083       ST.getTargetLowering()->getImplicitParameterOffset(
3084           B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3085   Register DstReg = MI.getOperand(0).getReg();
3086   LLT DstTy = MRI.getType(DstReg);
3087   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3088
3089   const ArgDescriptor *Arg;
3090   const TargetRegisterClass *RC;
3091
std::tie(Arg, RC) 3092 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3093 if (!Arg) 3094 return false; 3095 3096 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3097 if (!loadInputValue(KernargPtrReg, B, Arg)) 3098 return false; 3099 3100 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3101 MI.eraseFromParent(); 3102 return true; 3103 } 3104 3105 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3106 MachineRegisterInfo &MRI, 3107 MachineIRBuilder &B, 3108 unsigned AddrSpace) const { 3109 B.setInstr(MI); 3110 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3111 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3112 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3113 MI.eraseFromParent(); 3114 return true; 3115 } 3116 3117 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3118 // offset (the offset that is included in bounds checking and swizzling, to be 3119 // split between the instruction's voffset and immoffset fields) and soffset 3120 // (the offset that is excluded from bounds checking and swizzling, to go in 3121 // the instruction's soffset field). This function takes the first kind of 3122 // offset and figures out how to split it between voffset and immoffset. 3123 std::tuple<Register, unsigned, unsigned> 3124 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3125 Register OrigOffset) const { 3126 const unsigned MaxImm = 4095; 3127 Register BaseReg; 3128 unsigned TotalConstOffset; 3129 MachineInstr *OffsetDef; 3130 const LLT S32 = LLT::scalar(32); 3131 3132 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3133 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3134 3135 unsigned ImmOffset = TotalConstOffset; 3136 3137 // If the immediate value is too big for the immoffset field, put the value 3138 // and -4096 into the immoffset field so that the value that is copied/added 3139 // for the voffset field is a multiple of 4096, and it stands more chance 3140 // of being CSEd with the copy/add for another similar load/store. 3141 // However, do not do that rounding down to a multiple of 4096 if that is a 3142 // negative number, as it appears to be illegal to have a negative offset 3143 // in the vgpr, even if adding the immediate offset makes it positive. 3144 unsigned Overflow = ImmOffset & ~MaxImm; 3145 ImmOffset -= Overflow; 3146 if ((int32_t)Overflow < 0) { 3147 Overflow += ImmOffset; 3148 ImmOffset = 0; 3149 } 3150 3151 if (Overflow != 0) { 3152 if (!BaseReg) { 3153 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3154 } else { 3155 auto OverflowVal = B.buildConstant(S32, Overflow); 3156 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3157 } 3158 } 3159 3160 if (!BaseReg) 3161 BaseReg = B.buildConstant(S32, 0).getReg(0); 3162 3163 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3164 } 3165 3166 /// Handle register layout difference for f16 images for some subtargets. 
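/// On subtargets with unpacked D16 memory instructions each 16-bit element
/// occupies its own 32-bit register, so e.g. (illustrative MIR) an
/// %in:_(<2 x s16>) value is split with G_UNMERGE_VALUES, each piece is
/// G_ANYEXT'd to s32, and the pieces are reassembled with G_BUILD_VECTOR into
/// %out:_(<2 x s32>). Targets with packed D16 take the value unchanged.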
3167 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3168 MachineRegisterInfo &MRI, 3169 Register Reg) const { 3170 if (!ST.hasUnpackedD16VMem()) 3171 return Reg; 3172 3173 const LLT S16 = LLT::scalar(16); 3174 const LLT S32 = LLT::scalar(32); 3175 LLT StoreVT = MRI.getType(Reg); 3176 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3177 3178 auto Unmerge = B.buildUnmerge(S16, Reg); 3179 3180 SmallVector<Register, 4> WideRegs; 3181 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3182 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3183 3184 int NumElts = StoreVT.getNumElements(); 3185 3186 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3187 } 3188 3189 Register AMDGPULegalizerInfo::fixStoreSourceType( 3190 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3191 MachineRegisterInfo *MRI = B.getMRI(); 3192 LLT Ty = MRI->getType(VData); 3193 3194 const LLT S16 = LLT::scalar(16); 3195 3196 // Fixup illegal register types for i8 stores. 3197 if (Ty == LLT::scalar(8) || Ty == S16) { 3198 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3199 return AnyExt; 3200 } 3201 3202 if (Ty.isVector()) { 3203 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3204 if (IsFormat) 3205 return handleD16VData(B, *MRI, VData); 3206 } 3207 } 3208 3209 return VData; 3210 } 3211 3212 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3213 MachineRegisterInfo &MRI, 3214 MachineIRBuilder &B, 3215 bool IsTyped, 3216 bool IsFormat) const { 3217 B.setInstr(MI); 3218 3219 Register VData = MI.getOperand(1).getReg(); 3220 LLT Ty = MRI.getType(VData); 3221 LLT EltTy = Ty.getScalarType(); 3222 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3223 const LLT S32 = LLT::scalar(32); 3224 3225 VData = fixStoreSourceType(B, VData, IsFormat); 3226 Register RSrc = MI.getOperand(2).getReg(); 3227 3228 MachineMemOperand *MMO = *MI.memoperands_begin(); 3229 const int MemSize = MMO->getSize(); 3230 3231 unsigned ImmOffset; 3232 unsigned TotalOffset; 3233 3234 // The typed intrinsics add an immediate after the registers. 3235 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3236 3237 // The struct intrinsic variants add one additional operand over raw. 3238 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3239 Register VIndex; 3240 int OpOffset = 0; 3241 if (HasVIndex) { 3242 VIndex = MI.getOperand(3).getReg(); 3243 OpOffset = 1; 3244 } 3245 3246 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3247 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3248 3249 unsigned Format = 0; 3250 if (IsTyped) { 3251 Format = MI.getOperand(5 + OpOffset).getImm(); 3252 ++OpOffset; 3253 } 3254 3255 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3256 3257 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3258 if (TotalOffset != 0) 3259 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3260 3261 unsigned Opc; 3262 if (IsTyped) { 3263 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3264 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3265 } else if (IsFormat) { 3266 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3267 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3268 } else { 3269 switch (MemSize) { 3270 case 1: 3271 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3272 break; 3273 case 2: 3274 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3275 break; 3276 default: 3277 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3278 break; 3279 } 3280 } 3281 3282 if (!VIndex) 3283 VIndex = B.buildConstant(S32, 0).getReg(0); 3284 3285 auto MIB = B.buildInstr(Opc) 3286 .addUse(VData) // vdata 3287 .addUse(RSrc) // rsrc 3288 .addUse(VIndex) // vindex 3289 .addUse(VOffset) // voffset 3290 .addUse(SOffset) // soffset 3291 .addImm(ImmOffset); // offset(imm) 3292 3293 if (IsTyped) 3294 MIB.addImm(Format); 3295 3296 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3297 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3298 .addMemOperand(MMO); 3299 3300 MI.eraseFromParent(); 3301 return true; 3302 } 3303 3304 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3305 MachineRegisterInfo &MRI, 3306 MachineIRBuilder &B, 3307 bool IsFormat, 3308 bool IsTyped) const { 3309 B.setInstr(MI); 3310 3311 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3312 MachineMemOperand *MMO = *MI.memoperands_begin(); 3313 const int MemSize = MMO->getSize(); 3314 const LLT S32 = LLT::scalar(32); 3315 3316 Register Dst = MI.getOperand(0).getReg(); 3317 Register RSrc = MI.getOperand(2).getReg(); 3318 3319 // The typed intrinsics add an immediate after the registers. 3320 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3321 3322 // The struct intrinsic variants add one additional operand over raw. 3323 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3324 Register VIndex; 3325 int OpOffset = 0; 3326 if (HasVIndex) { 3327 VIndex = MI.getOperand(3).getReg(); 3328 OpOffset = 1; 3329 } 3330 3331 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3332 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3333 3334 unsigned Format = 0; 3335 if (IsTyped) { 3336 Format = MI.getOperand(5 + OpOffset).getImm(); 3337 ++OpOffset; 3338 } 3339 3340 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3341 unsigned ImmOffset; 3342 unsigned TotalOffset; 3343 3344 LLT Ty = MRI.getType(Dst); 3345 LLT EltTy = Ty.getScalarType(); 3346 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3347 const bool Unpacked = ST.hasUnpackedD16VMem(); 3348 3349 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3350 if (TotalOffset != 0) 3351 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3352 3353 unsigned Opc; 3354 3355 if (IsTyped) { 3356 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3357 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3358 } else if (IsFormat) { 3359 Opc = IsD16 ? 
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  B.setInstr(MI);

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened for extending loads; truncate back down to the
    // original type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}
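// Illustrative sketch (not from the original source): for a d16 format load
// with a <2 x s16> result on an unpacked-D16 subtarget, the code above emits
// the load into a wide <2 x s32> temporary and then repacks it, roughly
//
//   %wide:_(<2 x s32>) = G_AMDGPU_BUFFER_LOAD_FORMAT_D16 ...
//   %w0:_(s32), %w1:_(s32) = G_UNMERGE_VALUES %wide
//   %e0:_(s16) = G_TRUNC %w0
//   %e1:_(s16) = G_TRUNC %w1
//   %dst:_(<2 x s16>) = G_BUILD_VECTOR %e0, %e1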
bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  B.setInstr(MI);
  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
  B.buildInstr(Opc)
    .addDef(MI.getOperand(0).getReg())
    .addUse(MI.getOperand(2).getReg())
    .addUse(MI.getOperand(3).getReg())
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return true;
}

static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}

bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  B.setInstr(MI);

  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
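  // For reference (illustrative summary, matching the operand indices used
  // below rather than restating the original comments), the incoming
  // intrinsic is laid out roughly as
  //   raw:    dst, intrinsic-id, vdata, [cmp,] rsrc, voffset, soffset, aux
  //   struct: dst, intrinsic-id, vdata, [cmp,] rsrc, vindex, voffset, soffset,
  //           aux
  // so the struct form has NumVIndexOps operands (8, or 9 for cmpswap) and
  // the raw form one fewer; the operand count is what tells the two apart.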
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Pack the s16 typed address operands of \p MI into dword sized <2 x s16>
/// vectors, appending the packed registers to \p PackedAddrs.
static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                        SmallVectorImpl<Register> &PackedAddrs,
                                        int AddrIdx, int DimIdx, int NumVAddrs,
                                        int NumGradients) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if (I < DimIdx) {
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in
      // 1D, derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
          ((NumGradients / 2) % 2 == 1 &&
           (I == DimIdx + (NumGradients / 2) - 1 ||
            I == DimIdx + NumGradients - 1)) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(I + 1).isReg()) {
        PackedAddrs.push_back(
          B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
            .getReg(0));
      } else {
        PackedAddrs.push_back(
          B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
            .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
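    // For example (illustrative): 5 s32 address components get padded with
    // 3 implicit_def registers and packed into a single <8 x s32>
    // build_vector, since there is no 5/6/7-wide address vector to use.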
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
  B.setInstr(MI);

  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of first address argument
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  // Check for 16-bit addresses and pack if true.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  const bool IsA16 = AddrTy == S16;

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  const int DMaskIdx = BaseOpcode->Atomic ? -1 :
      getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // We expect to get an error flag since TFC is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
        AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
          LZMappingInfo->LZ, ImageDimIntr->Dim);

        // The starting indexes should remain in the same place.
        --NumVAddrs;
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
          static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
        MI.RemoveOperand(LodIdx);
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator
  // that MIMG addresses should be placed contiguously when it is possible to
  // do so, so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16) {
    // FIXME: this feature is missing from gfx10. When that is fixed, this
    // check should be introduced.
    if (!ST.hasR128A16() && !ST.hasGFX10A16())
      return false;

    if (NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
                                  NumGradients);

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const int NumPacked = PackedRegs.size();
      for (int I = 0; I != NumVAddrs; ++I) {
        MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I < NumPacked)
          SrcOp.setReg(PackedRegs[I]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else if (!UseNSA && NumVAddrs > 1) {
    convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
  }

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Register RepackedReg = handleD16VData(B, *MRI, VData);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
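  // Worked example (illustrative, not from the original source): a packed-D16
  // load of <3 x s16> with TFE gives AdjustedTy = <3 x s16>, so
  // RoundedElts = 2, RoundedTy = <4 x s16>, TFETy = <3 x s32>, and RegTy = S32
  // (with TFE the result pieces are handled as dwords even for d16).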
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
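  // For illustration (not from the original source): a <3 x s16> result has
  // RegsToCover = 2, so the <2 x s16> pieces are padded with undef up to three
  // registers, concatenated into a <6 x s16>, and the second <3 x s16> half of
  // the unmerge below is simply dead.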
  const LLT V3S16 = LLT::vector(3, 16);
  if (Ty == V3S16) {
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  MachineInstr &MI, MachineIRBuilder &B,
  GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);
    B.setInstr(MI);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}
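// Illustrative sketch (not from the original source): a 96-bit s.buffer.load
// result, e.g. a <3 x s32> destination, is not a power-of-2 size, so
// legalizeSBufferLoad above widens the loaded value to <4 x s32> and then
// recovers the original <3 x s32> value from it; the extra element is never
// used. A scalar s96 result is similarly widened to s128 and truncated back.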
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is not the HSA path, or the trap handler is disabled, just insert
  // an s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass queue pointer to trap handler as input, and insert trap instruction
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    const ArgDescriptor *Arg =
        getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
    if (!Arg)
      return false;
    MachineRegisterInfo &MRI = *B.getMRI();
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, Arg))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is not the HSA path, or the trap handler is disabled, report a
  // warning accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
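  // For example (illustrative; virtual register names and wave64 types are
  // made up), the amdgcn.if case below rewrites roughly
  //
  //   %flag:_(s1), %mask:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS
  //       intrinsic(@llvm.amdgcn.if), %cond:_(s1)
  //   G_BRCOND %flag(s1), %bb.then
  //
  // into
  //
  //   %mask = SI_IF %cond, %bb.then
  //
  // and constrains %mask and %cond to the wave mask register class.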
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrTarget);

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      B.setInstr(MI);
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}