//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
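// For example, s64, v2s32, and v4s16 all qualify as register types below,
// while v3s16 (an odd number of 16-bit elements) and s24 (not a multiple of
// 32 bits) do not. (Illustrative note; the authoritative set is whatever the
// predicate returns.)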
153 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 154 return [=](const LegalityQuery &Query) { 155 const LLT Ty = Query.Types[TypeIdx]; 156 if (Ty.isVector()) { 157 const int EltSize = Ty.getElementType().getSizeInBits(); 158 return EltSize == 32 || EltSize == 64 || 159 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 160 EltSize == 128 || EltSize == 256; 161 } 162 163 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 164 }; 165 } 166 167 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 168 return [=](const LegalityQuery &Query) { 169 const LLT QueryTy = Query.Types[TypeIdx]; 170 return QueryTy.isVector() && QueryTy.getElementType() == Type; 171 }; 172 } 173 174 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 175 return [=](const LegalityQuery &Query) { 176 const LLT QueryTy = Query.Types[TypeIdx]; 177 if (!QueryTy.isVector()) 178 return false; 179 const LLT EltTy = QueryTy.getElementType(); 180 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 181 }; 182 } 183 184 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 185 return [=](const LegalityQuery &Query) { 186 const LLT Ty = Query.Types[TypeIdx]; 187 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 188 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 189 }; 190 } 191 192 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) { 193 return [=](const LegalityQuery &Query) { 194 return Query.Types[TypeIdx0].getSizeInBits() < 195 Query.Types[TypeIdx1].getSizeInBits(); 196 }; 197 } 198 199 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) { 200 return [=](const LegalityQuery &Query) { 201 return Query.Types[TypeIdx0].getSizeInBits() > 202 Query.Types[TypeIdx1].getSizeInBits(); 203 }; 204 } 205 206 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 207 const GCNTargetMachine &TM) 208 : ST(ST_) { 209 using namespace TargetOpcode; 210 211 auto GetAddrSpacePtr = [&TM](unsigned AS) { 212 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 213 }; 214 215 const LLT S1 = LLT::scalar(1); 216 const LLT S16 = LLT::scalar(16); 217 const LLT S32 = LLT::scalar(32); 218 const LLT S64 = LLT::scalar(64); 219 const LLT S128 = LLT::scalar(128); 220 const LLT S256 = LLT::scalar(256); 221 const LLT S512 = LLT::scalar(512); 222 const LLT S1024 = LLT::scalar(1024); 223 224 const LLT V2S16 = LLT::vector(2, 16); 225 const LLT V4S16 = LLT::vector(4, 16); 226 227 const LLT V2S32 = LLT::vector(2, 32); 228 const LLT V3S32 = LLT::vector(3, 32); 229 const LLT V4S32 = LLT::vector(4, 32); 230 const LLT V5S32 = LLT::vector(5, 32); 231 const LLT V6S32 = LLT::vector(6, 32); 232 const LLT V7S32 = LLT::vector(7, 32); 233 const LLT V8S32 = LLT::vector(8, 32); 234 const LLT V9S32 = LLT::vector(9, 32); 235 const LLT V10S32 = LLT::vector(10, 32); 236 const LLT V11S32 = LLT::vector(11, 32); 237 const LLT V12S32 = LLT::vector(12, 32); 238 const LLT V13S32 = LLT::vector(13, 32); 239 const LLT V14S32 = LLT::vector(14, 32); 240 const LLT V15S32 = LLT::vector(15, 32); 241 const LLT V16S32 = LLT::vector(16, 32); 242 const LLT V32S32 = LLT::vector(32, 32); 243 244 const LLT V2S64 = LLT::vector(2, 64); 245 const LLT V3S64 = LLT::vector(3, 64); 246 const LLT V4S64 = LLT::vector(4, 64); 247 const LLT V5S64 = LLT::vector(5, 64); 248 const LLT V6S64 = LLT::vector(6, 64); 249 const LLT V7S64 = LLT::vector(7, 64); 250 const LLT V8S64 = LLT::vector(8, 64); 251 const LLT V16S64 = LLT::vector(16, 64); 252 253 std::initializer_list<LLT> 
AllS32Vectors = 254 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 255 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 256 std::initializer_list<LLT> AllS64Vectors = 257 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 258 259 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 260 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 261 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 262 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 263 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 264 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 265 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 266 267 const LLT CodePtr = FlatPtr; 268 269 const std::initializer_list<LLT> AddrSpaces64 = { 270 GlobalPtr, ConstantPtr, FlatPtr 271 }; 272 273 const std::initializer_list<LLT> AddrSpaces32 = { 274 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 275 }; 276 277 const std::initializer_list<LLT> FPTypesBase = { 278 S32, S64 279 }; 280 281 const std::initializer_list<LLT> FPTypes16 = { 282 S32, S64, S16 283 }; 284 285 const std::initializer_list<LLT> FPTypesPK16 = { 286 S32, S64, S16, V2S16 287 }; 288 289 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 290 291 setAction({G_BRCOND, S1}, Legal); // VCC branches 292 setAction({G_BRCOND, S32}, Legal); // SCC branches 293 294 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 295 // elements for v3s16 296 getActionDefinitionsBuilder(G_PHI) 297 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 298 .legalFor(AllS32Vectors) 299 .legalFor(AllS64Vectors) 300 .legalFor(AddrSpaces64) 301 .legalFor(AddrSpaces32) 302 .clampScalar(0, S32, S256) 303 .widenScalarToNextPow2(0, 32) 304 .clampMaxNumElements(0, S32, 16) 305 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 306 .legalIf(isPointer(0)); 307 308 if (ST.hasVOP3PInsts()) { 309 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 310 .legalFor({S32, S16, V2S16}) 311 .clampScalar(0, S16, S32) 312 .clampMaxNumElements(0, S16, 2) 313 .scalarize(0) 314 .widenScalarToNextPow2(0, 32); 315 } else if (ST.has16BitInsts()) { 316 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 317 .legalFor({S32, S16}) 318 .clampScalar(0, S16, S32) 319 .scalarize(0) 320 .widenScalarToNextPow2(0, 32); 321 } else { 322 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 323 .legalFor({S32}) 324 .clampScalar(0, S32, S32) 325 .scalarize(0); 326 } 327 328 // FIXME: Not really legal. Placeholder for custom lowering. 329 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 330 .customFor({S32, S64}) 331 .clampScalar(0, S32, S64) 332 .widenScalarToNextPow2(0, 32) 333 .scalarize(0); 334 335 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 336 .legalFor({S32}) 337 .clampScalar(0, S32, S32) 338 .scalarize(0); 339 340 // Report legal for any types we can handle anywhere. For the cases only legal 341 // on the SALU, RegBankSelect will be able to re-legalize. 
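  // Illustrative walk-through of the rules below (not exhaustive): an s8 G_AND
  // is widened to s32 by the clampScalar rule, while a v4s32 one (wider than
  // 64 bits) is split into two v2s32 pieces by fewerEltsToSize64Vector before
  // hitting the legal v2s32 case.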
342 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 343 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 344 .clampScalar(0, S32, S64) 345 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 346 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 347 .widenScalarToNextPow2(0) 348 .scalarize(0); 349 350 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 351 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 352 .legalFor({{S32, S1}, {S32, S32}}) 353 .minScalar(0, S32) 354 // TODO: .scalarize(0) 355 .lower(); 356 357 getActionDefinitionsBuilder(G_BITCAST) 358 // Don't worry about the size constraint. 359 .legalIf(all(isRegisterType(0), isRegisterType(1))) 360 .lower(); 361 362 363 getActionDefinitionsBuilder(G_CONSTANT) 364 .legalFor({S1, S32, S64, S16, GlobalPtr, 365 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 366 .clampScalar(0, S32, S64) 367 .widenScalarToNextPow2(0) 368 .legalIf(isPointer(0)); 369 370 getActionDefinitionsBuilder(G_FCONSTANT) 371 .legalFor({S32, S64, S16}) 372 .clampScalar(0, S16, S64); 373 374 getActionDefinitionsBuilder(G_IMPLICIT_DEF) 375 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 376 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 377 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 378 .clampScalarOrElt(0, S32, S1024) 379 .legalIf(isMultiple32(0)) 380 .widenScalarToNextPow2(0, 32) 381 .clampMaxNumElements(0, S32, 16); 382 383 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 384 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 385 .unsupportedFor({PrivatePtr}) 386 .custom(); 387 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 388 389 auto &FPOpActions = getActionDefinitionsBuilder( 390 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 391 .legalFor({S32, S64}); 392 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 393 .customFor({S32, S64}); 394 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 395 .customFor({S32, S64}); 396 397 if (ST.has16BitInsts()) { 398 if (ST.hasVOP3PInsts()) 399 FPOpActions.legalFor({S16, V2S16}); 400 else 401 FPOpActions.legalFor({S16}); 402 403 TrigActions.customFor({S16}); 404 FDIVActions.customFor({S16}); 405 } 406 407 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 408 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 409 410 if (ST.hasVOP3PInsts()) { 411 MinNumMaxNum.customFor(FPTypesPK16) 412 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 413 .clampMaxNumElements(0, S16, 2) 414 .clampScalar(0, S16, S64) 415 .scalarize(0); 416 } else if (ST.has16BitInsts()) { 417 MinNumMaxNum.customFor(FPTypes16) 418 .clampScalar(0, S16, S64) 419 .scalarize(0); 420 } else { 421 MinNumMaxNum.customFor(FPTypesBase) 422 .clampScalar(0, S32, S64) 423 .scalarize(0); 424 } 425 426 if (ST.hasVOP3PInsts()) 427 FPOpActions.clampMaxNumElements(0, S16, 2); 428 429 FPOpActions 430 .scalarize(0) 431 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 432 433 TrigActions 434 .scalarize(0) 435 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 436 437 FDIVActions 438 .scalarize(0) 439 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 440 441 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 442 .legalFor(FPTypesPK16) 443 .clampMaxNumElements(0, S16, 2) 444 .scalarize(0) 445 .clampScalar(0, S16, S64); 446 447 if (ST.has16BitInsts()) { 448 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 449 .legalFor({S32, S64, S16}) 450 .scalarize(0) 451 .clampScalar(0, S16, S64); 452 } else { 453 getActionDefinitionsBuilder(G_FSQRT) 454 .legalFor({S32, S64}) 455 .scalarize(0) 456 .clampScalar(0, S32, S64); 457 458 if (ST.hasFractBug()) { 459 getActionDefinitionsBuilder(G_FFLOOR) 460 .customFor({S64}) 461 .legalFor({S32, S64}) 462 .scalarize(0) 463 .clampScalar(0, S32, S64); 464 } else { 465 getActionDefinitionsBuilder(G_FFLOOR) 466 .legalFor({S32, S64}) 467 .scalarize(0) 468 .clampScalar(0, S32, S64); 469 } 470 } 471 472 getActionDefinitionsBuilder(G_FPTRUNC) 473 .legalFor({{S32, S64}, {S16, S32}}) 474 .scalarize(0) 475 .lower(); 476 477 getActionDefinitionsBuilder(G_FPEXT) 478 .legalFor({{S64, S32}, {S32, S16}}) 479 .lowerFor({{S64, S16}}) // FIXME: Implement 480 .scalarize(0); 481 482 getActionDefinitionsBuilder(G_FSUB) 483 // Use actual fsub instruction 484 .legalFor({S32}) 485 // Must use fadd + fneg 486 .lowerFor({S64, S16, V2S16}) 487 .scalarize(0) 488 .clampScalar(0, S32, S64); 489 490 // Whether this is legal depends on the floating point mode for the function. 491 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 492 if (ST.hasMadF16()) 493 FMad.customFor({S32, S16}); 494 else 495 FMad.customFor({S32}); 496 FMad.scalarize(0) 497 .lower(); 498 499 // TODO: Do we need to clamp maximum bitwidth? 500 getActionDefinitionsBuilder(G_TRUNC) 501 .legalIf(isScalar(0)) 502 .legalFor({{V2S16, V2S32}}) 503 .clampMaxNumElements(0, S16, 2) 504 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 505 // situations (like an invalid implicit use), we don't want to infinite loop 506 // in the legalizer. 507 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 508 .alwaysLegal(); 509 510 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 511 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 512 {S32, S1}, {S64, S1}, {S16, S1}}) 513 .scalarize(0) 514 .clampScalar(0, S32, S64) 515 .widenScalarToNextPow2(1, 32); 516 517 // TODO: Split s1->s64 during regbankselect for VALU. 
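  // As an example of the rules that follow: a G_UITOFP producing s32 from an
  // s8 source has its source widened to s32 by clampScalar on type index 1,
  // after which the {S32, S32} form is directly legal. (Illustrative only.)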
518 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 519 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 520 .lowerFor({{S32, S64}}) 521 .lowerIf(typeIs(1, S1)) 522 .customFor({{S64, S64}}); 523 if (ST.has16BitInsts()) 524 IToFP.legalFor({{S16, S16}}); 525 IToFP.clampScalar(1, S32, S64) 526 .scalarize(0) 527 .widenScalarToNextPow2(1); 528 529 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 530 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 531 .customFor({{S64, S64}}); 532 if (ST.has16BitInsts()) 533 FPToI.legalFor({{S16, S16}}); 534 else 535 FPToI.minScalar(1, S32); 536 537 FPToI.minScalar(0, S32) 538 .scalarize(0) 539 .lower(); 540 541 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 542 .scalarize(0) 543 .lower(); 544 545 if (ST.has16BitInsts()) { 546 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 547 .legalFor({S16, S32, S64}) 548 .clampScalar(0, S16, S64) 549 .scalarize(0); 550 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 551 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 552 .legalFor({S32, S64}) 553 .clampScalar(0, S32, S64) 554 .scalarize(0); 555 } else { 556 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 557 .legalFor({S32}) 558 .customFor({S64}) 559 .clampScalar(0, S32, S64) 560 .scalarize(0); 561 } 562 563 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 564 .scalarize(0) 565 .alwaysLegal(); 566 567 auto &CmpBuilder = 568 getActionDefinitionsBuilder(G_ICMP) 569 // The compare output type differs based on the register bank of the output, 570 // so make both s1 and s32 legal. 571 // 572 // Scalar compares producing output in scc will be promoted to s32, as that 573 // is the allocatable register type that will be needed for the copy from 574 // scc. This will be promoted during RegBankSelect, and we assume something 575 // before that won't try to use s32 result types. 576 // 577 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 578 // bank. 579 .legalForCartesianProduct( 580 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 581 .legalForCartesianProduct( 582 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 583 if (ST.has16BitInsts()) { 584 CmpBuilder.legalFor({{S1, S16}}); 585 } 586 587 CmpBuilder 588 .widenScalarToNextPow2(1) 589 .clampScalar(1, S32, S64) 590 .scalarize(0) 591 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 592 593 getActionDefinitionsBuilder(G_FCMP) 594 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 595 .widenScalarToNextPow2(1) 596 .clampScalar(1, S32, S64) 597 .scalarize(0); 598 599 // FIXME: fpow has a selection pattern that should move to custom lowering. 600 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 601 if (ST.has16BitInsts()) 602 Exp2Ops.legalFor({S32, S16}); 603 else 604 Exp2Ops.legalFor({S32}); 605 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 606 Exp2Ops.scalarize(0); 607 608 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 609 if (ST.has16BitInsts()) 610 ExpOps.customFor({{S32}, {S16}}); 611 else 612 ExpOps.customFor({S32}); 613 ExpOps.clampScalar(0, MinScalarFPTy, S32) 614 .scalarize(0); 615 616 // The 64-bit versions produce 32-bit results, but only on the SALU. 
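  // i.e. the {S32, S64} pairing below mirrors the scalar popcount, which reads
  // a 64-bit source but writes a 32-bit result; a divergent s64 count is
  // presumably split up later by RegBankSelect. (Descriptive note based on the
  // comment above.)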
617 getActionDefinitionsBuilder(G_CTPOP) 618 .legalFor({{S32, S32}, {S32, S64}}) 619 .clampScalar(0, S32, S32) 620 .clampScalar(1, S32, S64) 621 .scalarize(0) 622 .widenScalarToNextPow2(0, 32) 623 .widenScalarToNextPow2(1, 32); 624 625 // The hardware instructions return a different result on 0 than the generic 626 // instructions expect. The hardware produces -1, but these produce the 627 // bitwidth. 628 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 629 .scalarize(0) 630 .clampScalar(0, S32, S32) 631 .clampScalar(1, S32, S64) 632 .widenScalarToNextPow2(0, 32) 633 .widenScalarToNextPow2(1, 32) 634 .lower(); 635 636 // The 64-bit versions produce 32-bit results, but only on the SALU. 637 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 638 .legalFor({{S32, S32}, {S32, S64}}) 639 .clampScalar(0, S32, S32) 640 .clampScalar(1, S32, S64) 641 .scalarize(0) 642 .widenScalarToNextPow2(0, 32) 643 .widenScalarToNextPow2(1, 32); 644 645 getActionDefinitionsBuilder(G_BITREVERSE) 646 .legalFor({S32}) 647 .clampScalar(0, S32, S32) 648 .scalarize(0); 649 650 if (ST.has16BitInsts()) { 651 getActionDefinitionsBuilder(G_BSWAP) 652 .legalFor({S16, S32, V2S16}) 653 .clampMaxNumElements(0, S16, 2) 654 // FIXME: Fixing non-power-of-2 before clamp is workaround for 655 // narrowScalar limitation. 656 .widenScalarToNextPow2(0) 657 .clampScalar(0, S16, S32) 658 .scalarize(0); 659 660 if (ST.hasVOP3PInsts()) { 661 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 662 .legalFor({S32, S16, V2S16}) 663 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 664 .clampMaxNumElements(0, S16, 2) 665 .minScalar(0, S16) 666 .widenScalarToNextPow2(0) 667 .scalarize(0) 668 .lower(); 669 } else { 670 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 671 .legalFor({S32, S16}) 672 .widenScalarToNextPow2(0) 673 .minScalar(0, S16) 674 .scalarize(0) 675 .lower(); 676 } 677 } else { 678 // TODO: Should have same legality without v_perm_b32 679 getActionDefinitionsBuilder(G_BSWAP) 680 .legalFor({S32}) 681 .lowerIf(narrowerThan(0, 32)) 682 // FIXME: Fixing non-power-of-2 before clamp is workaround for 683 // narrowScalar limitation. 
684 .widenScalarToNextPow2(0) 685 .maxScalar(0, S32) 686 .scalarize(0) 687 .lower(); 688 689 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 690 .legalFor({S32}) 691 .minScalar(0, S32) 692 .widenScalarToNextPow2(0) 693 .scalarize(0) 694 .lower(); 695 } 696 697 getActionDefinitionsBuilder(G_INTTOPTR) 698 // List the common cases 699 .legalForCartesianProduct(AddrSpaces64, {S64}) 700 .legalForCartesianProduct(AddrSpaces32, {S32}) 701 .scalarize(0) 702 // Accept any address space as long as the size matches 703 .legalIf(sameSize(0, 1)) 704 .widenScalarIf(smallerThan(1, 0), 705 [](const LegalityQuery &Query) { 706 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 707 }) 708 .narrowScalarIf(greaterThan(1, 0), 709 [](const LegalityQuery &Query) { 710 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 711 }); 712 713 getActionDefinitionsBuilder(G_PTRTOINT) 714 // List the common cases 715 .legalForCartesianProduct(AddrSpaces64, {S64}) 716 .legalForCartesianProduct(AddrSpaces32, {S32}) 717 .scalarize(0) 718 // Accept any address space as long as the size matches 719 .legalIf(sameSize(0, 1)) 720 .widenScalarIf(smallerThan(0, 1), 721 [](const LegalityQuery &Query) { 722 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 723 }) 724 .narrowScalarIf( 725 greaterThan(0, 1), 726 [](const LegalityQuery &Query) { 727 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 728 }); 729 730 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 731 .scalarize(0) 732 .custom(); 733 734 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 735 // handle some operations by just promoting the register during 736 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 737 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 738 switch (AS) { 739 // FIXME: Private element size. 740 case AMDGPUAS::PRIVATE_ADDRESS: 741 return 32; 742 // FIXME: Check subtarget 743 case AMDGPUAS::LOCAL_ADDRESS: 744 return ST.useDS128() ? 128 : 64; 745 746 // Treat constant and global as identical. SMRD loads are sometimes usable 747 // for global loads (ideally constant address space should be eliminated) 748 // depending on the context. Legality cannot be context dependent, but 749 // RegBankSelect can split the load as necessary depending on the pointer 750 // register bank/uniformity and if the memory is invariant or not written in 751 // a kernel. 752 case AMDGPUAS::CONSTANT_ADDRESS: 753 case AMDGPUAS::GLOBAL_ADDRESS: 754 return IsLoad ? 512 : 128; 755 default: 756 return 128; 757 } 758 }; 759 760 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 761 bool IsLoad) -> bool { 762 const LLT DstTy = Query.Types[0]; 763 764 // Split vector extloads. 765 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 766 unsigned Align = Query.MMODescrs[0].AlignInBits; 767 768 if (MemSize < DstTy.getSizeInBits()) 769 MemSize = std::max(MemSize, Align); 770 771 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 772 return true; 773 774 const LLT PtrTy = Query.Types[1]; 775 unsigned AS = PtrTy.getAddressSpace(); 776 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 777 return true; 778 779 // Catch weird sized loads that don't evenly divide into the access sizes 780 // TODO: May be able to widen depending on alignment etc. 
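    // Worked example of the check below: a 96-bit access is 3 dwords and is
    // only kept whole when dwordx3 load/stores exist; a 160-bit access is 5
    // dwords, which is not a power of two, so it is split. (Illustrative.)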
781 unsigned NumRegs = (MemSize + 31) / 32; 782 if (NumRegs == 3) { 783 if (!ST.hasDwordx3LoadStores()) 784 return true; 785 } else { 786 // If the alignment allows, these should have been widened. 787 if (!isPowerOf2_32(NumRegs)) 788 return true; 789 } 790 791 if (Align < MemSize) { 792 const SITargetLowering *TLI = ST.getTargetLowering(); 793 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 794 } 795 796 return false; 797 }; 798 799 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool { 800 unsigned Size = Query.Types[0].getSizeInBits(); 801 if (isPowerOf2_32(Size)) 802 return false; 803 804 if (Size == 96 && ST.hasDwordx3LoadStores()) 805 return false; 806 807 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 808 if (Size >= maxSizeForAddrSpace(AddrSpace, true)) 809 return false; 810 811 unsigned Align = Query.MMODescrs[0].AlignInBits; 812 unsigned RoundedSize = NextPowerOf2(Size); 813 return (Align >= RoundedSize); 814 }; 815 816 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 817 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 818 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 819 820 // TODO: Refine based on subtargets which support unaligned access or 128-bit 821 // LDS 822 // TODO: Unsupported flat for SI. 823 824 for (unsigned Op : {G_LOAD, G_STORE}) { 825 const bool IsStore = Op == G_STORE; 826 827 auto &Actions = getActionDefinitionsBuilder(Op); 828 // Whitelist the common cases. 829 // TODO: Loads to s16 on gfx9 830 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 831 {V2S32, GlobalPtr, 64, GlobalAlign32}, 832 {V4S32, GlobalPtr, 128, GlobalAlign32}, 833 {S128, GlobalPtr, 128, GlobalAlign32}, 834 {S64, GlobalPtr, 64, GlobalAlign32}, 835 {V2S64, GlobalPtr, 128, GlobalAlign32}, 836 {V2S16, GlobalPtr, 32, GlobalAlign32}, 837 {S32, GlobalPtr, 8, GlobalAlign8}, 838 {S32, GlobalPtr, 16, GlobalAlign16}, 839 840 {S32, LocalPtr, 32, 32}, 841 {S64, LocalPtr, 64, 32}, 842 {V2S32, LocalPtr, 64, 32}, 843 {S32, LocalPtr, 8, 8}, 844 {S32, LocalPtr, 16, 16}, 845 {V2S16, LocalPtr, 32, 32}, 846 847 {S32, PrivatePtr, 32, 32}, 848 {S32, PrivatePtr, 8, 8}, 849 {S32, PrivatePtr, 16, 16}, 850 {V2S16, PrivatePtr, 32, 32}, 851 852 {S32, FlatPtr, 32, GlobalAlign32}, 853 {S32, FlatPtr, 16, GlobalAlign16}, 854 {S32, FlatPtr, 8, GlobalAlign8}, 855 {V2S16, FlatPtr, 32, GlobalAlign32}, 856 857 {S32, ConstantPtr, 32, GlobalAlign32}, 858 {V2S32, ConstantPtr, 64, GlobalAlign32}, 859 {V4S32, ConstantPtr, 128, GlobalAlign32}, 860 {S64, ConstantPtr, 64, GlobalAlign32}, 861 {S128, ConstantPtr, 128, GlobalAlign32}, 862 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 863 Actions 864 .customIf(typeIs(1, Constant32Ptr)) 865 // Widen suitably aligned loads by loading extra elements. 
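        // For instance, on subtargets without dwordx3 load/stores, a 16-byte
        // aligned v3s32 global load can be widened to v4s32 here instead of
        // being split. (Illustrative; see shouldWidenLoadResult above.)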
866 .moreElementsIf([=](const LegalityQuery &Query) { 867 const LLT Ty = Query.Types[0]; 868 return Op == G_LOAD && Ty.isVector() && 869 shouldWidenLoadResult(Query); 870 }, moreElementsToNextPow2(0)) 871 .widenScalarIf([=](const LegalityQuery &Query) { 872 const LLT Ty = Query.Types[0]; 873 return Op == G_LOAD && !Ty.isVector() && 874 shouldWidenLoadResult(Query); 875 }, widenScalarOrEltToNextPow2(0)) 876 .narrowScalarIf( 877 [=](const LegalityQuery &Query) -> bool { 878 return !Query.Types[0].isVector() && 879 needToSplitMemOp(Query, Op == G_LOAD); 880 }, 881 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 882 const LLT DstTy = Query.Types[0]; 883 const LLT PtrTy = Query.Types[1]; 884 885 const unsigned DstSize = DstTy.getSizeInBits(); 886 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 887 888 // Split extloads. 889 if (DstSize > MemSize) 890 return std::make_pair(0, LLT::scalar(MemSize)); 891 892 if (!isPowerOf2_32(DstSize)) { 893 // We're probably decomposing an odd sized store. Try to split 894 // to the widest type. TODO: Account for alignment. As-is it 895 // should be OK, since the new parts will be further legalized. 896 unsigned FloorSize = PowerOf2Floor(DstSize); 897 return std::make_pair(0, LLT::scalar(FloorSize)); 898 } 899 900 if (DstSize > 32 && (DstSize % 32 != 0)) { 901 // FIXME: Need a way to specify non-extload of larger size if 902 // suitably aligned. 903 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 904 } 905 906 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 907 Op == G_LOAD); 908 if (MemSize > MaxSize) 909 return std::make_pair(0, LLT::scalar(MaxSize)); 910 911 unsigned Align = Query.MMODescrs[0].AlignInBits; 912 return std::make_pair(0, LLT::scalar(Align)); 913 }) 914 .fewerElementsIf( 915 [=](const LegalityQuery &Query) -> bool { 916 return Query.Types[0].isVector() && 917 needToSplitMemOp(Query, Op == G_LOAD); 918 }, 919 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 920 const LLT DstTy = Query.Types[0]; 921 const LLT PtrTy = Query.Types[1]; 922 923 LLT EltTy = DstTy.getElementType(); 924 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 925 Op == G_LOAD); 926 927 // FIXME: Handle widened to power of 2 results better. This ends 928 // up scalarizing. 929 // FIXME: 3 element stores scalarized on SI 930 931 // Split if it's too large for the address space. 932 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 933 unsigned NumElts = DstTy.getNumElements(); 934 unsigned EltSize = EltTy.getSizeInBits(); 935 936 if (MaxSize % EltSize == 0) { 937 return std::make_pair( 938 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 939 } 940 941 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 942 943 // FIXME: Refine when odd breakdowns handled 944 // The scalars will need to be re-legalized. 945 if (NumPieces == 1 || NumPieces >= NumElts || 946 NumElts % NumPieces != 0) 947 return std::make_pair(0, EltTy); 948 949 return std::make_pair(0, 950 LLT::vector(NumElts / NumPieces, EltTy)); 951 } 952 953 // FIXME: We could probably handle weird extending loads better. 954 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 955 if (DstTy.getSizeInBits() > MemSize) 956 return std::make_pair(0, EltTy); 957 958 unsigned EltSize = EltTy.getSizeInBits(); 959 unsigned DstSize = DstTy.getSizeInBits(); 960 if (!isPowerOf2_32(DstSize)) { 961 // We're probably decomposing an odd sized store. Try to split 962 // to the widest type. TODO: Account for alignment. 
As-is it 963 // should be OK, since the new parts will be further legalized. 964 unsigned FloorSize = PowerOf2Floor(DstSize); 965 return std::make_pair( 966 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 967 } 968 969 // Need to split because of alignment. 970 unsigned Align = Query.MMODescrs[0].AlignInBits; 971 if (EltSize > Align && 972 (EltSize / Align < DstTy.getNumElements())) { 973 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 974 } 975 976 // May need relegalization for the scalars. 977 return std::make_pair(0, EltTy); 978 }) 979 .minScalar(0, S32); 980 981 if (IsStore) 982 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 983 984 // TODO: Need a bitcast lower option? 985 Actions 986 .legalIf([=](const LegalityQuery &Query) { 987 const LLT Ty0 = Query.Types[0]; 988 unsigned Size = Ty0.getSizeInBits(); 989 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 990 unsigned Align = Query.MMODescrs[0].AlignInBits; 991 992 // FIXME: Widening store from alignment not valid. 993 if (MemSize < Size) 994 MemSize = std::max(MemSize, Align); 995 996 // No extending vector loads. 997 if (Size > MemSize && Ty0.isVector()) 998 return false; 999 1000 switch (MemSize) { 1001 case 8: 1002 case 16: 1003 return Size == 32; 1004 case 32: 1005 case 64: 1006 case 128: 1007 return true; 1008 case 96: 1009 return ST.hasDwordx3LoadStores(); 1010 case 256: 1011 case 512: 1012 return true; 1013 default: 1014 return false; 1015 } 1016 }) 1017 .widenScalarToNextPow2(0) 1018 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1019 } 1020 1021 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1022 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1023 {S32, GlobalPtr, 16, 2 * 8}, 1024 {S32, LocalPtr, 8, 8}, 1025 {S32, LocalPtr, 16, 16}, 1026 {S32, PrivatePtr, 8, 8}, 1027 {S32, PrivatePtr, 16, 16}, 1028 {S32, ConstantPtr, 8, 8}, 1029 {S32, ConstantPtr, 16, 2 * 8}}); 1030 if (ST.hasFlatAddressSpace()) { 1031 ExtLoads.legalForTypesWithMemDesc( 1032 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1033 } 1034 1035 ExtLoads.clampScalar(0, S32, S32) 1036 .widenScalarToNextPow2(0) 1037 .unsupportedIfMemSizeNotPow2() 1038 .lower(); 1039 1040 auto &Atomics = getActionDefinitionsBuilder( 1041 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1042 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1043 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1044 G_ATOMICRMW_UMIN}) 1045 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1046 {S64, GlobalPtr}, {S64, LocalPtr}}); 1047 if (ST.hasFlatAddressSpace()) { 1048 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1049 } 1050 1051 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1052 .legalFor({{S32, LocalPtr}}); 1053 1054 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1055 // demarshalling 1056 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1057 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1058 {S32, FlatPtr}, {S64, FlatPtr}}) 1059 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1060 {S32, RegionPtr}, {S64, RegionPtr}}); 1061 // TODO: Pointer types, any 32-bit or 64-bit vector 1062 1063 // Condition should be s32 for scalar, s1 for vector. 
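  // Example of the G_SELECT rules below: a v3s16 result is a small odd vector
  // and gets padded to v4s16, whereas a v3s32 result has an odd element count
  // and is scalarized into three s32 selects. (Illustrative.)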
1064 getActionDefinitionsBuilder(G_SELECT) 1065 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1066 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1067 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1068 .clampScalar(0, S16, S64) 1069 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1070 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1071 .scalarize(1) 1072 .clampMaxNumElements(0, S32, 2) 1073 .clampMaxNumElements(0, LocalPtr, 2) 1074 .clampMaxNumElements(0, PrivatePtr, 2) 1075 .scalarize(0) 1076 .widenScalarToNextPow2(0) 1077 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1078 1079 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1080 // be more flexible with the shift amount type. 1081 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1082 .legalFor({{S32, S32}, {S64, S32}}); 1083 if (ST.has16BitInsts()) { 1084 if (ST.hasVOP3PInsts()) { 1085 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 1086 .clampMaxNumElements(0, S16, 2); 1087 } else 1088 Shifts.legalFor({{S16, S32}, {S16, S16}}); 1089 1090 // TODO: Support 16-bit shift amounts 1091 Shifts.clampScalar(1, S32, S32); 1092 Shifts.clampScalar(0, S16, S64); 1093 Shifts.widenScalarToNextPow2(0, 16); 1094 } else { 1095 // Make sure we legalize the shift amount type first, as the general 1096 // expansion for the shifted type will produce much worse code if it hasn't 1097 // been truncated already. 1098 Shifts.clampScalar(1, S32, S32); 1099 Shifts.clampScalar(0, S32, S64); 1100 Shifts.widenScalarToNextPow2(0, 32); 1101 } 1102 Shifts.scalarize(0); 1103 1104 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1105 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1106 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1107 unsigned IdxTypeIdx = 2; 1108 1109 getActionDefinitionsBuilder(Op) 1110 .customIf([=](const LegalityQuery &Query) { 1111 const LLT EltTy = Query.Types[EltTypeIdx]; 1112 const LLT VecTy = Query.Types[VecTypeIdx]; 1113 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1114 return (EltTy.getSizeInBits() == 16 || 1115 EltTy.getSizeInBits() % 32 == 0) && 1116 VecTy.getSizeInBits() % 32 == 0 && 1117 VecTy.getSizeInBits() <= 1024 && 1118 IdxTy.getSizeInBits() == 32; 1119 }) 1120 .clampScalar(EltTypeIdx, S32, S64) 1121 .clampScalar(VecTypeIdx, S32, S64) 1122 .clampScalar(IdxTypeIdx, S32, S32); 1123 } 1124 1125 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1126 .unsupportedIf([=](const LegalityQuery &Query) { 1127 const LLT &EltTy = Query.Types[1].getElementType(); 1128 return Query.Types[0] != EltTy; 1129 }); 1130 1131 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1132 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1133 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1134 1135 // FIXME: Doesn't handle extract of illegal sizes. 1136 getActionDefinitionsBuilder(Op) 1137 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1138 // FIXME: Multiples of 16 should not be legal. 
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)

      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .lowerFor({{S16, V2S16}})
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
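    // e.g. an s8 or s16 G_SEXT_INREG is widened to s32 by the clampScalar rule
    // below before any lowering happens, so no 16-bit shifts or extra
    // truncate/extend pairs are introduced. (Illustrative note.)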
1308 SextInReg.lowerFor({{S32}, {S64}}); 1309 } 1310 1311 SextInReg 1312 .scalarize(0) 1313 .clampScalar(0, S32, S64) 1314 .lower(); 1315 1316 getActionDefinitionsBuilder(G_FSHR) 1317 .legalFor({{S32, S32}}) 1318 .scalarize(0) 1319 .lower(); 1320 1321 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1322 .legalFor({S64}); 1323 1324 getActionDefinitionsBuilder({ 1325 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1326 G_FCOPYSIGN, 1327 1328 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1329 G_READ_REGISTER, 1330 G_WRITE_REGISTER, 1331 1332 G_SADDO, G_SSUBO, 1333 1334 // TODO: Implement 1335 G_FMINIMUM, G_FMAXIMUM, 1336 G_FSHL 1337 }).lower(); 1338 1339 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1340 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1341 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1342 .unsupported(); 1343 1344 computeTables(); 1345 verify(*ST.getInstrInfo()); 1346 } 1347 1348 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1349 MachineRegisterInfo &MRI, 1350 MachineIRBuilder &B, 1351 GISelChangeObserver &Observer) const { 1352 switch (MI.getOpcode()) { 1353 case TargetOpcode::G_ADDRSPACE_CAST: 1354 return legalizeAddrSpaceCast(MI, MRI, B); 1355 case TargetOpcode::G_FRINT: 1356 return legalizeFrint(MI, MRI, B); 1357 case TargetOpcode::G_FCEIL: 1358 return legalizeFceil(MI, MRI, B); 1359 case TargetOpcode::G_INTRINSIC_TRUNC: 1360 return legalizeIntrinsicTrunc(MI, MRI, B); 1361 case TargetOpcode::G_SITOFP: 1362 return legalizeITOFP(MI, MRI, B, true); 1363 case TargetOpcode::G_UITOFP: 1364 return legalizeITOFP(MI, MRI, B, false); 1365 case TargetOpcode::G_FPTOSI: 1366 return legalizeFPTOI(MI, MRI, B, true); 1367 case TargetOpcode::G_FPTOUI: 1368 return legalizeFPTOI(MI, MRI, B, false); 1369 case TargetOpcode::G_FMINNUM: 1370 case TargetOpcode::G_FMAXNUM: 1371 case TargetOpcode::G_FMINNUM_IEEE: 1372 case TargetOpcode::G_FMAXNUM_IEEE: 1373 return legalizeMinNumMaxNum(MI, MRI, B); 1374 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1375 return legalizeExtractVectorElt(MI, MRI, B); 1376 case TargetOpcode::G_INSERT_VECTOR_ELT: 1377 return legalizeInsertVectorElt(MI, MRI, B); 1378 case TargetOpcode::G_SHUFFLE_VECTOR: 1379 return legalizeShuffleVector(MI, MRI, B); 1380 case TargetOpcode::G_FSIN: 1381 case TargetOpcode::G_FCOS: 1382 return legalizeSinCos(MI, MRI, B); 1383 case TargetOpcode::G_GLOBAL_VALUE: 1384 return legalizeGlobalValue(MI, MRI, B); 1385 case TargetOpcode::G_LOAD: 1386 return legalizeLoad(MI, MRI, B, Observer); 1387 case TargetOpcode::G_FMAD: 1388 return legalizeFMad(MI, MRI, B); 1389 case TargetOpcode::G_FDIV: 1390 return legalizeFDIV(MI, MRI, B); 1391 case TargetOpcode::G_UDIV: 1392 case TargetOpcode::G_UREM: 1393 return legalizeUDIV_UREM(MI, MRI, B); 1394 case TargetOpcode::G_SDIV: 1395 case TargetOpcode::G_SREM: 1396 return legalizeSDIV_SREM(MI, MRI, B); 1397 case TargetOpcode::G_ATOMIC_CMPXCHG: 1398 return legalizeAtomicCmpXChg(MI, MRI, B); 1399 case TargetOpcode::G_FLOG: 1400 return legalizeFlog(MI, B, 1.0f / numbers::log2ef); 1401 case TargetOpcode::G_FLOG10: 1402 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1403 case TargetOpcode::G_FEXP: 1404 return legalizeFExp(MI, B); 1405 case TargetOpcode::G_FPOW: 1406 return legalizeFPow(MI, B); 1407 case TargetOpcode::G_FFLOOR: 1408 return legalizeFFloor(MI, MRI, B); 1409 case TargetOpcode::G_BUILD_VECTOR: 1410 return legalizeBuildVector(MI, MRI, B); 1411 default: 1412 return false; 1413 } 1414 1415 llvm_unreachable("expected switch to return"); 1416 } 1417 1418 Register 
AMDGPULegalizerInfo::getSegmentAperture( 1419 unsigned AS, 1420 MachineRegisterInfo &MRI, 1421 MachineIRBuilder &B) const { 1422 MachineFunction &MF = B.getMF(); 1423 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1424 const LLT S32 = LLT::scalar(32); 1425 1426 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1427 1428 if (ST.hasApertureRegs()) { 1429 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1430 // getreg. 1431 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1432 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1433 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1434 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1435 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1436 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1437 unsigned Encoding = 1438 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1439 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1440 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1441 1442 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1443 1444 B.buildInstr(AMDGPU::S_GETREG_B32) 1445 .addDef(GetReg) 1446 .addImm(Encoding); 1447 MRI.setType(GetReg, S32); 1448 1449 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1450 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1451 } 1452 1453 Register QueuePtr = MRI.createGenericVirtualRegister( 1454 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1455 1456 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1457 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1458 return Register(); 1459 1460 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1461 // private_segment_aperture_base_hi. 1462 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1463 1464 // TODO: can we be smarter about machine pointer info? 1465 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1466 MachineMemOperand *MMO = MF.getMachineMemOperand( 1467 PtrInfo, 1468 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1469 MachineMemOperand::MOInvariant, 1470 4, commonAlignment(Align(64), StructOffset)); 1471 1472 Register LoadAddr; 1473 1474 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1475 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1476 } 1477 1478 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1479 MachineInstr &MI, MachineRegisterInfo &MRI, 1480 MachineIRBuilder &B) const { 1481 MachineFunction &MF = B.getMF(); 1482 1483 B.setInstr(MI); 1484 1485 const LLT S32 = LLT::scalar(32); 1486 Register Dst = MI.getOperand(0).getReg(); 1487 Register Src = MI.getOperand(1).getReg(); 1488 1489 LLT DstTy = MRI.getType(Dst); 1490 LLT SrcTy = MRI.getType(Src); 1491 unsigned DestAS = DstTy.getAddressSpace(); 1492 unsigned SrcAS = SrcTy.getAddressSpace(); 1493 1494 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1495 // vector element. 1496 assert(!DstTy.isVector()); 1497 1498 const AMDGPUTargetMachine &TM 1499 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1500 1501 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1502 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1503 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1504 return true; 1505 } 1506 1507 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1508 // Truncate. 
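    // Casting a 64-bit pointer down to the 32-bit constant address space keeps
    // only the low 32 bits; the extract at bit offset 0 below performs that
    // truncation.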
1509 B.buildExtract(Dst, Src, 0); 1510 MI.eraseFromParent(); 1511 return true; 1512 } 1513 1514 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1515 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1516 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1517 1518 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1519 // another. Merge operands are required to be the same type, but creating an 1520 // extra ptrtoint would be kind of pointless. 1521 auto HighAddr = B.buildConstant( 1522 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1523 B.buildMerge(Dst, {Src, HighAddr}); 1524 MI.eraseFromParent(); 1525 return true; 1526 } 1527 1528 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1529 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1530 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1531 unsigned NullVal = TM.getNullPointerValue(DestAS); 1532 1533 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1534 auto FlatNull = B.buildConstant(SrcTy, 0); 1535 1536 // Extract low 32-bits of the pointer. 1537 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1538 1539 auto CmpRes = 1540 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1541 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1542 1543 MI.eraseFromParent(); 1544 return true; 1545 } 1546 1547 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1548 return false; 1549 1550 if (!ST.hasFlatAddressSpace()) 1551 return false; 1552 1553 auto SegmentNull = 1554 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1555 auto FlatNull = 1556 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1557 1558 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1559 if (!ApertureReg.isValid()) 1560 return false; 1561 1562 auto CmpRes = 1563 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1564 1565 // Coerce the type of the low half of the result so we can use merge_values. 1566 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1567 1568 // TODO: Should we allow mismatched types but matching sizes in merges to 1569 // avoid the ptrtoint? 1570 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1571 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1572 1573 MI.eraseFromParent(); 1574 return true; 1575 } 1576 1577 bool AMDGPULegalizerInfo::legalizeFrint( 1578 MachineInstr &MI, MachineRegisterInfo &MRI, 1579 MachineIRBuilder &B) const { 1580 B.setInstr(MI); 1581 1582 Register Src = MI.getOperand(1).getReg(); 1583 LLT Ty = MRI.getType(Src); 1584 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1585 1586 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1587 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1588 1589 auto C1 = B.buildFConstant(Ty, C1Val); 1590 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1591 1592 // TODO: Should this propagate fast-math-flags? 
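  // This is the usual add/subtract-2^52 rounding trick: adding a copysign'd
  // 2^52 forces the fraction bits to be rounded away in double precision, and
  // subtracting it back leaves the value rounded to an integer. Inputs whose
  // magnitude is already >= 2^52 are necessarily integral and are returned
  // unchanged by the final select.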
1593 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1594 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1595 1596 auto C2 = B.buildFConstant(Ty, C2Val); 1597 auto Fabs = B.buildFAbs(Ty, Src); 1598 1599 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1600 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1601 return true; 1602 } 1603 1604 bool AMDGPULegalizerInfo::legalizeFceil( 1605 MachineInstr &MI, MachineRegisterInfo &MRI, 1606 MachineIRBuilder &B) const { 1607 B.setInstr(MI); 1608 1609 const LLT S1 = LLT::scalar(1); 1610 const LLT S64 = LLT::scalar(64); 1611 1612 Register Src = MI.getOperand(1).getReg(); 1613 assert(MRI.getType(Src) == S64); 1614 1615 // result = trunc(src) 1616 // if (src > 0.0 && src != result) 1617 // result += 1.0 1618 1619 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1620 1621 const auto Zero = B.buildFConstant(S64, 0.0); 1622 const auto One = B.buildFConstant(S64, 1.0); 1623 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1624 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1625 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1626 auto Add = B.buildSelect(S64, And, One, Zero); 1627 1628 // TODO: Should this propagate fast-math-flags? 1629 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1630 return true; 1631 } 1632 1633 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1634 MachineIRBuilder &B) { 1635 const unsigned FractBits = 52; 1636 const unsigned ExpBits = 11; 1637 LLT S32 = LLT::scalar(32); 1638 1639 auto Const0 = B.buildConstant(S32, FractBits - 32); 1640 auto Const1 = B.buildConstant(S32, ExpBits); 1641 1642 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1643 .addUse(Const0.getReg(0)) 1644 .addUse(Const1.getReg(0)); 1645 1646 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1647 } 1648 1649 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1650 MachineInstr &MI, MachineRegisterInfo &MRI, 1651 MachineIRBuilder &B) const { 1652 B.setInstr(MI); 1653 1654 const LLT S1 = LLT::scalar(1); 1655 const LLT S32 = LLT::scalar(32); 1656 const LLT S64 = LLT::scalar(64); 1657 1658 Register Src = MI.getOperand(1).getReg(); 1659 assert(MRI.getType(Src) == S64); 1660 1661 // TODO: Should this use extract since the low half is unused? 1662 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1663 Register Hi = Unmerge.getReg(1); 1664 1665 // Extract the upper half, since this is where we will find the sign and 1666 // exponent. 1667 auto Exp = extractF64Exponent(Hi, B); 1668 1669 const unsigned FractBits = 52; 1670 1671 // Extract the sign bit. 1672 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1673 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1674 1675 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1676 1677 const auto Zero32 = B.buildConstant(S32, 0); 1678 1679 // Extend back to 64-bits. 
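  // i.e. SignBit64 = {0, SignBit} rebuilds an s64 value holding only the
  // original sign bit; it is selected as the result when the exponent is
  // negative (|x| < 1), so the truncation yields a correctly signed zero.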
1680 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1681 1682 auto Shr = B.buildAShr(S64, FractMask, Exp); 1683 auto Not = B.buildNot(S64, Shr); 1684 auto Tmp0 = B.buildAnd(S64, Src, Not); 1685 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1686 1687 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1688 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1689 1690 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1691 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1692 return true; 1693 } 1694 1695 bool AMDGPULegalizerInfo::legalizeITOFP( 1696 MachineInstr &MI, MachineRegisterInfo &MRI, 1697 MachineIRBuilder &B, bool Signed) const { 1698 B.setInstr(MI); 1699 1700 Register Dst = MI.getOperand(0).getReg(); 1701 Register Src = MI.getOperand(1).getReg(); 1702 1703 const LLT S64 = LLT::scalar(64); 1704 const LLT S32 = LLT::scalar(32); 1705 1706 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1707 1708 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1709 1710 auto CvtHi = Signed ? 1711 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1712 B.buildUITOFP(S64, Unmerge.getReg(1)); 1713 1714 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1715 1716 auto ThirtyTwo = B.buildConstant(S32, 32); 1717 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1718 .addUse(CvtHi.getReg(0)) 1719 .addUse(ThirtyTwo.getReg(0)); 1720 1721 // TODO: Should this propagate fast-math-flags? 1722 B.buildFAdd(Dst, LdExp, CvtLo); 1723 MI.eraseFromParent(); 1724 return true; 1725 } 1726 1727 // TODO: Copied from DAG implementation. Verify logic and document how this 1728 // actually works. 1729 bool AMDGPULegalizerInfo::legalizeFPTOI( 1730 MachineInstr &MI, MachineRegisterInfo &MRI, 1731 MachineIRBuilder &B, bool Signed) const { 1732 B.setInstr(MI); 1733 1734 Register Dst = MI.getOperand(0).getReg(); 1735 Register Src = MI.getOperand(1).getReg(); 1736 1737 const LLT S64 = LLT::scalar(64); 1738 const LLT S32 = LLT::scalar(32); 1739 1740 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1741 1742 unsigned Flags = MI.getFlags(); 1743 1744 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1745 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1746 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1747 1748 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1749 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1750 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1751 1752 auto Hi = Signed ? 
1753 B.buildFPTOSI(S32, FloorMul) : 1754 B.buildFPTOUI(S32, FloorMul); 1755 auto Lo = B.buildFPTOUI(S32, Fma); 1756 1757 B.buildMerge(Dst, { Lo, Hi }); 1758 MI.eraseFromParent(); 1759 1760 return true; 1761 } 1762 1763 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1764 MachineInstr &MI, MachineRegisterInfo &MRI, 1765 MachineIRBuilder &B) const { 1766 MachineFunction &MF = B.getMF(); 1767 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1768 1769 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1770 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1771 1772 // With ieee_mode disabled, the instructions have the correct behavior 1773 // already for G_FMINNUM/G_FMAXNUM 1774 if (!MFI->getMode().IEEE) 1775 return !IsIEEEOp; 1776 1777 if (IsIEEEOp) 1778 return true; 1779 1780 MachineIRBuilder HelperBuilder(MI); 1781 GISelObserverWrapper DummyObserver; 1782 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1783 HelperBuilder.setInstr(MI); 1784 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1785 } 1786 1787 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1788 MachineInstr &MI, MachineRegisterInfo &MRI, 1789 MachineIRBuilder &B) const { 1790 // TODO: Should move some of this into LegalizerHelper. 1791 1792 // TODO: Promote dynamic indexing of s16 to s32 1793 1794 // FIXME: Artifact combiner probably should have replaced the truncated 1795 // constant before this, so we shouldn't need 1796 // getConstantVRegValWithLookThrough. 1797 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1798 MI.getOperand(2).getReg(), MRI); 1799 if (!IdxVal) // Dynamic case will be selected to register indexing. 1800 return true; 1801 1802 Register Dst = MI.getOperand(0).getReg(); 1803 Register Vec = MI.getOperand(1).getReg(); 1804 1805 LLT VecTy = MRI.getType(Vec); 1806 LLT EltTy = VecTy.getElementType(); 1807 assert(EltTy == MRI.getType(Dst)); 1808 1809 B.setInstr(MI); 1810 1811 if (IdxVal->Value < VecTy.getNumElements()) 1812 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1813 else 1814 B.buildUndef(Dst); 1815 1816 MI.eraseFromParent(); 1817 return true; 1818 } 1819 1820 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1821 MachineInstr &MI, MachineRegisterInfo &MRI, 1822 MachineIRBuilder &B) const { 1823 // TODO: Should move some of this into LegalizerHelper. 1824 1825 // TODO: Promote dynamic indexing of s16 to s32 1826 1827 // FIXME: Artifact combiner probably should have replaced the truncated 1828 // constant before this, so we shouldn't need 1829 // getConstantVRegValWithLookThrough. 1830 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1831 MI.getOperand(3).getReg(), MRI); 1832 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1833 return true; 1834 1835 Register Dst = MI.getOperand(0).getReg(); 1836 Register Vec = MI.getOperand(1).getReg(); 1837 Register Ins = MI.getOperand(2).getReg(); 1838 1839 LLT VecTy = MRI.getType(Vec); 1840 LLT EltTy = VecTy.getElementType(); 1841 assert(EltTy == MRI.getType(Ins)); 1842 1843 B.setInstr(MI); 1844 1845 if (IdxVal->Value < VecTy.getNumElements()) 1846 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1847 else 1848 B.buildUndef(Dst); 1849 1850 MI.eraseFromParent(); 1851 return true; 1852 } 1853 1854 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1855 MachineInstr &MI, MachineRegisterInfo &MRI, 1856 MachineIRBuilder &B) const { 1857 const LLT V2S16 = LLT::vector(2, 16); 1858 1859 Register Dst = MI.getOperand(0).getReg(); 1860 Register Src0 = MI.getOperand(1).getReg(); 1861 LLT DstTy = MRI.getType(Dst); 1862 LLT SrcTy = MRI.getType(Src0); 1863 1864 if (SrcTy == V2S16 && DstTy == V2S16 && 1865 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1866 return true; 1867 1868 MachineIRBuilder HelperBuilder(MI); 1869 GISelObserverWrapper DummyObserver; 1870 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1871 HelperBuilder.setInstr(MI); 1872 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1873 } 1874 1875 bool AMDGPULegalizerInfo::legalizeSinCos( 1876 MachineInstr &MI, MachineRegisterInfo &MRI, 1877 MachineIRBuilder &B) const { 1878 B.setInstr(MI); 1879 1880 Register DstReg = MI.getOperand(0).getReg(); 1881 Register SrcReg = MI.getOperand(1).getReg(); 1882 LLT Ty = MRI.getType(DstReg); 1883 unsigned Flags = MI.getFlags(); 1884 1885 Register TrigVal; 1886 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1887 if (ST.hasTrigReducedRange()) { 1888 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1889 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1890 .addUse(MulVal.getReg(0)) 1891 .setMIFlags(Flags).getReg(0); 1892 } else 1893 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1894 1895 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1896 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1897 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1898 .addUse(TrigVal) 1899 .setMIFlags(Flags); 1900 MI.eraseFromParent(); 1901 return true; 1902 } 1903 1904 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1905 Register DstReg, LLT PtrTy, 1906 MachineIRBuilder &B, const GlobalValue *GV, 1907 unsigned Offset, unsigned GAFlags) const { 1908 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1909 // to the following code sequence: 1910 // 1911 // For constant address space: 1912 // s_getpc_b64 s[0:1] 1913 // s_add_u32 s0, s0, $symbol 1914 // s_addc_u32 s1, s1, 0 1915 // 1916 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1917 // a fixup or relocation is emitted to replace $symbol with a literal 1918 // constant, which is a pc-relative offset from the encoding of the $symbol 1919 // operand to the global variable. 
1920 // 1921 // For global address space: 1922 // s_getpc_b64 s[0:1] 1923 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1924 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1925 // 1926 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1927 // fixups or relocations are emitted to replace $symbol@*@lo and 1928 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1929 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1930 // operand to the global variable. 1931 // 1932 // What we want here is an offset from the value returned by s_getpc 1933 // (which is the address of the s_add_u32 instruction) to the global 1934 // variable, but since the encoding of $symbol starts 4 bytes after the start 1935 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1936 // small. This requires us to add 4 to the global variable offset in order to 1937 // compute the correct address. 1938 1939 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1940 1941 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1942 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1943 1944 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1945 .addDef(PCReg); 1946 1947 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1948 if (GAFlags == SIInstrInfo::MO_NONE) 1949 MIB.addImm(0); 1950 else 1951 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1952 1953 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1954 1955 if (PtrTy.getSizeInBits() == 32) 1956 B.buildExtract(DstReg, PCReg, 0); 1957 return true; 1958 } 1959 1960 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1961 MachineInstr &MI, MachineRegisterInfo &MRI, 1962 MachineIRBuilder &B) const { 1963 Register DstReg = MI.getOperand(0).getReg(); 1964 LLT Ty = MRI.getType(DstReg); 1965 unsigned AS = Ty.getAddressSpace(); 1966 1967 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1968 MachineFunction &MF = B.getMF(); 1969 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1970 B.setInstr(MI); 1971 1972 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1973 if (!MFI->isEntryFunction()) { 1974 const Function &Fn = MF.getFunction(); 1975 DiagnosticInfoUnsupported BadLDSDecl( 1976 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 1977 DS_Warning); 1978 Fn.getContext().diagnose(BadLDSDecl); 1979 1980 // We currently don't have a way to correctly allocate LDS objects that 1981 // aren't directly associated with a kernel. We do force inlining of 1982 // functions that use local objects. However, if these dead functions are 1983 // not eliminated, we don't want a compile time error. Just emit a warning 1984 // and a trap, since there should be no callable path here. 1985 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 1986 B.buildUndef(DstReg); 1987 MI.eraseFromParent(); 1988 return true; 1989 } 1990 1991 // TODO: We could emit code to handle the initialization somewhere. 
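    // A sketch of the cases handled below: an LDS global without a defined
    // initializer is either left in place with an absolute 32-bit relocation
    // (when a constant address is not wanted), or folded to the constant byte
    // offset that allocateLDSGlobal() assigns it within the kernel's LDS
    // block. Globals with initializers are diagnosed as unsupported.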
1992 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1993 const SITargetLowering *TLI = ST.getTargetLowering(); 1994 if (!TLI->shouldUseLDSConstAddress(GV)) { 1995 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 1996 return true; // Leave in place; 1997 } 1998 1999 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 2000 MI.eraseFromParent(); 2001 return true; 2002 } 2003 2004 const Function &Fn = MF.getFunction(); 2005 DiagnosticInfoUnsupported BadInit( 2006 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2007 Fn.getContext().diagnose(BadInit); 2008 return true; 2009 } 2010 2011 const SITargetLowering *TLI = ST.getTargetLowering(); 2012 2013 if (TLI->shouldEmitFixup(GV)) { 2014 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2015 MI.eraseFromParent(); 2016 return true; 2017 } 2018 2019 if (TLI->shouldEmitPCReloc(GV)) { 2020 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2021 MI.eraseFromParent(); 2022 return true; 2023 } 2024 2025 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2026 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2027 2028 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2029 MachinePointerInfo::getGOT(MF), 2030 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2031 MachineMemOperand::MOInvariant, 2032 8 /*Size*/, Align(8)); 2033 2034 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2035 2036 if (Ty.getSizeInBits() == 32) { 2037 // Truncate if this is a 32-bit constant adrdess. 2038 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2039 B.buildExtract(DstReg, Load, 0); 2040 } else 2041 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2042 2043 MI.eraseFromParent(); 2044 return true; 2045 } 2046 2047 bool AMDGPULegalizerInfo::legalizeLoad( 2048 MachineInstr &MI, MachineRegisterInfo &MRI, 2049 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2050 B.setInstr(MI); 2051 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2052 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2053 Observer.changingInstr(MI); 2054 MI.getOperand(1).setReg(Cast.getReg(0)); 2055 Observer.changedInstr(MI); 2056 return true; 2057 } 2058 2059 bool AMDGPULegalizerInfo::legalizeFMad( 2060 MachineInstr &MI, MachineRegisterInfo &MRI, 2061 MachineIRBuilder &B) const { 2062 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2063 assert(Ty.isScalar()); 2064 2065 MachineFunction &MF = B.getMF(); 2066 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2067 2068 // TODO: Always legal with future ftz flag. 2069 // FIXME: Do we need just output? 
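  // G_FMAD is accepted as-is only when denormals are flushed for the result
  // type (f32 with FP32 denormals off, f16 with FP64/FP16 denormals off),
  // since the MAD machine instructions flush denormal results; otherwise it
  // is expanded to fmul + fadd via lowerFMad() below.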
2070 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2071 return true; 2072 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2073 return true; 2074 2075 MachineIRBuilder HelperBuilder(MI); 2076 GISelObserverWrapper DummyObserver; 2077 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2078 HelperBuilder.setInstr(MI); 2079 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2080 } 2081 2082 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2083 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2084 Register DstReg = MI.getOperand(0).getReg(); 2085 Register PtrReg = MI.getOperand(1).getReg(); 2086 Register CmpVal = MI.getOperand(2).getReg(); 2087 Register NewVal = MI.getOperand(3).getReg(); 2088 2089 assert(SITargetLowering::isFlatGlobalAddrSpace( 2090 MRI.getType(PtrReg).getAddressSpace()) && 2091 "this should not have been custom lowered"); 2092 2093 LLT ValTy = MRI.getType(CmpVal); 2094 LLT VecTy = LLT::vector(2, ValTy); 2095 2096 B.setInstr(MI); 2097 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2098 2099 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2100 .addDef(DstReg) 2101 .addUse(PtrReg) 2102 .addUse(PackedVal) 2103 .setMemRefs(MI.memoperands()); 2104 2105 MI.eraseFromParent(); 2106 return true; 2107 } 2108 2109 bool AMDGPULegalizerInfo::legalizeFlog( 2110 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2111 Register Dst = MI.getOperand(0).getReg(); 2112 Register Src = MI.getOperand(1).getReg(); 2113 LLT Ty = B.getMRI()->getType(Dst); 2114 unsigned Flags = MI.getFlags(); 2115 B.setInstr(MI); 2116 2117 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2118 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2119 2120 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2121 MI.eraseFromParent(); 2122 return true; 2123 } 2124 2125 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2126 MachineIRBuilder &B) const { 2127 Register Dst = MI.getOperand(0).getReg(); 2128 Register Src = MI.getOperand(1).getReg(); 2129 unsigned Flags = MI.getFlags(); 2130 LLT Ty = B.getMRI()->getType(Dst); 2131 B.setInstr(MI); 2132 2133 auto K = B.buildFConstant(Ty, numbers::log2e); 2134 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2135 B.buildFExp2(Dst, Mul, Flags); 2136 MI.eraseFromParent(); 2137 return true; 2138 } 2139 2140 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2141 MachineIRBuilder &B) const { 2142 Register Dst = MI.getOperand(0).getReg(); 2143 Register Src0 = MI.getOperand(1).getReg(); 2144 Register Src1 = MI.getOperand(2).getReg(); 2145 unsigned Flags = MI.getFlags(); 2146 LLT Ty = B.getMRI()->getType(Dst); 2147 B.setInstr(MI); 2148 const LLT S16 = LLT::scalar(16); 2149 const LLT S32 = LLT::scalar(32); 2150 2151 if (Ty == S32) { 2152 auto Log = B.buildFLog2(S32, Src0, Flags); 2153 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2154 .addUse(Log.getReg(0)) 2155 .addUse(Src1) 2156 .setMIFlags(Flags); 2157 B.buildFExp2(Dst, Mul, Flags); 2158 } else if (Ty == S16) { 2159 // There's no f16 fmul_legacy, so we need to convert for it. 
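    // Both paths use the identity pow(x, y) = exp2(y * log2(x)); for f16 the
    // multiply is widened to f32 so fmul_legacy can be used, then truncated
    // back before the exp2.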
2160 auto Log = B.buildFLog2(S16, Src0, Flags); 2161 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2162 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2163 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2164 .addUse(Ext0.getReg(0)) 2165 .addUse(Ext1.getReg(0)) 2166 .setMIFlags(Flags); 2167 2168 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2169 } else 2170 return false; 2171 2172 MI.eraseFromParent(); 2173 return true; 2174 } 2175 2176 // Find a source register, ignoring any possible source modifiers. 2177 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2178 Register ModSrc = OrigSrc; 2179 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2180 ModSrc = SrcFNeg->getOperand(1).getReg(); 2181 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2182 ModSrc = SrcFAbs->getOperand(1).getReg(); 2183 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2184 ModSrc = SrcFAbs->getOperand(1).getReg(); 2185 return ModSrc; 2186 } 2187 2188 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2189 MachineRegisterInfo &MRI, 2190 MachineIRBuilder &B) const { 2191 B.setInstr(MI); 2192 2193 const LLT S1 = LLT::scalar(1); 2194 const LLT S64 = LLT::scalar(64); 2195 Register Dst = MI.getOperand(0).getReg(); 2196 Register OrigSrc = MI.getOperand(1).getReg(); 2197 unsigned Flags = MI.getFlags(); 2198 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2199 "this should not have been custom lowered"); 2200 2201 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2202 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2203 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2204 // V_FRACT bug is: 2205 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2206 // 2207 // Convert floor(x) to (x - fract(x)) 2208 2209 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2210 .addUse(OrigSrc) 2211 .setMIFlags(Flags); 2212 2213 // Give source modifier matching some assistance before obscuring a foldable 2214 // pattern. 2215 2216 // TODO: We can avoid the neg on the fract? The input sign to fract 2217 // shouldn't matter? 2218 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2219 2220 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2221 2222 Register Min = MRI.createGenericVirtualRegister(S64); 2223 2224 // We don't need to concern ourselves with the snan handling difference, so 2225 // use the one which will directly select. 2226 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2227 if (MFI->getMode().IEEE) 2228 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2229 else 2230 B.buildFMinNum(Min, Fract, Const, Flags); 2231 2232 Register CorrectedFract = Min; 2233 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2234 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2235 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2236 } 2237 2238 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2239 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2240 2241 MI.eraseFromParent(); 2242 return true; 2243 } 2244 2245 // Turn an illegal packed v2s16 build vector into bit operations. 2246 // TODO: This should probably be a bitcast action in LegalizerHelper. 
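// Roughly, for a G_BUILD_VECTOR of two s16 values this emits:
//   %merge:_(s32) = G_MERGE_VALUES %src0:_(s16), %src1:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %merge:_(s32)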
2247 bool AMDGPULegalizerInfo::legalizeBuildVector( 2248 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2249 Register Dst = MI.getOperand(0).getReg(); 2250 const LLT S32 = LLT::scalar(32); 2251 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2252 2253 Register Src0 = MI.getOperand(1).getReg(); 2254 Register Src1 = MI.getOperand(2).getReg(); 2255 assert(MRI.getType(Src0) == LLT::scalar(16)); 2256 2257 B.setInstr(MI); 2258 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2259 B.buildBitcast(Dst, Merge); 2260 2261 MI.eraseFromParent(); 2262 return true; 2263 } 2264 2265 // Return the use branch instruction, otherwise null if the usage is invalid. 2266 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2267 MachineRegisterInfo &MRI, 2268 MachineInstr *&Br) { 2269 Register CondDef = MI.getOperand(0).getReg(); 2270 if (!MRI.hasOneNonDBGUse(CondDef)) 2271 return nullptr; 2272 2273 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2274 if (UseMI.getParent() != MI.getParent() || 2275 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2276 return nullptr; 2277 2278 // Make sure the cond br is followed by a G_BR 2279 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2280 if (Next != MI.getParent()->end()) { 2281 if (Next->getOpcode() != AMDGPU::G_BR) 2282 return nullptr; 2283 Br = &*Next; 2284 } 2285 2286 return &UseMI; 2287 } 2288 2289 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2290 MachineRegisterInfo &MRI, 2291 Register LiveIn, 2292 Register PhyReg) const { 2293 assert(PhyReg.isPhysical() && "Physical register expected"); 2294 2295 // Insert the live-in copy, if required, by defining destination virtual 2296 // register. 2297 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 2298 if (!MRI.getVRegDef(LiveIn)) { 2299 // FIXME: Should have scoped insert pt 2300 MachineBasicBlock &OrigInsBB = B.getMBB(); 2301 auto OrigInsPt = B.getInsertPt(); 2302 2303 MachineBasicBlock &EntryMBB = B.getMF().front(); 2304 EntryMBB.addLiveIn(PhyReg); 2305 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2306 B.buildCopy(LiveIn, PhyReg); 2307 2308 B.setInsertPt(OrigInsBB, OrigInsPt); 2309 } 2310 2311 return LiveIn; 2312 } 2313 2314 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, 2315 MachineRegisterInfo &MRI, 2316 Register PhyReg, LLT Ty, 2317 bool InsertLiveInCopy) const { 2318 assert(PhyReg.isPhysical() && "Physical register expected"); 2319 2320 // Get or create virtual live-in regester 2321 Register LiveIn = MRI.getLiveInVirtReg(PhyReg); 2322 if (!LiveIn) { 2323 LiveIn = MRI.createGenericVirtualRegister(Ty); 2324 MRI.addLiveIn(PhyReg, LiveIn); 2325 } 2326 2327 // When the actual true copy required is from virtual register to physical 2328 // register (to be inserted later), live-in copy insertion from physical 2329 // to register virtual register is not required 2330 if (!InsertLiveInCopy) 2331 return LiveIn; 2332 2333 return insertLiveInCopy(B, MRI, LiveIn, PhyReg); 2334 } 2335 2336 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor( 2337 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2338 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2339 const ArgDescriptor *Arg; 2340 const TargetRegisterClass *RC; 2341 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2342 if (!Arg) { 2343 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2344 return nullptr; 2345 } 2346 return Arg; 2347 } 2348 2349 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, 
MachineIRBuilder &B, 2350 const ArgDescriptor *Arg) const { 2351 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2352 return false; // TODO: Handle these 2353 2354 Register SrcReg = Arg->getRegister(); 2355 assert(SrcReg.isPhysical() && "Physical register expected"); 2356 assert(DstReg.isVirtual() && "Virtual register expected"); 2357 2358 MachineRegisterInfo &MRI = *B.getMRI(); 2359 2360 LLT Ty = MRI.getType(DstReg); 2361 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty); 2362 2363 if (Arg->isMasked()) { 2364 // TODO: Should we try to emit this once in the entry block? 2365 const LLT S32 = LLT::scalar(32); 2366 const unsigned Mask = Arg->getMask(); 2367 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2368 2369 Register AndMaskSrc = LiveIn; 2370 2371 if (Shift != 0) { 2372 auto ShiftAmt = B.buildConstant(S32, Shift); 2373 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2374 } 2375 2376 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2377 } else { 2378 B.buildCopy(DstReg, LiveIn); 2379 } 2380 2381 return true; 2382 } 2383 2384 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2385 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2386 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2387 B.setInstr(MI); 2388 2389 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2390 if (!Arg) 2391 return false; 2392 2393 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2394 return false; 2395 2396 MI.eraseFromParent(); 2397 return true; 2398 } 2399 2400 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2401 MachineRegisterInfo &MRI, 2402 MachineIRBuilder &B) const { 2403 B.setInstr(MI); 2404 Register Dst = MI.getOperand(0).getReg(); 2405 LLT DstTy = MRI.getType(Dst); 2406 LLT S16 = LLT::scalar(16); 2407 LLT S32 = LLT::scalar(32); 2408 LLT S64 = LLT::scalar(64); 2409 2410 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2411 return true; 2412 2413 if (DstTy == S16) 2414 return legalizeFDIV16(MI, MRI, B); 2415 if (DstTy == S32) 2416 return legalizeFDIV32(MI, MRI, B); 2417 if (DstTy == S64) 2418 return legalizeFDIV64(MI, MRI, B); 2419 2420 return false; 2421 } 2422 2423 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2424 const LLT S32 = LLT::scalar(32); 2425 2426 auto Cvt0 = B.buildUITOFP(S32, Src); 2427 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2428 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2429 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2430 return B.buildFPTOUI(S32, Mul).getReg(0); 2431 } 2432 2433 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2434 Register DstReg, 2435 Register Num, 2436 Register Den, 2437 bool IsRem) const { 2438 const LLT S1 = LLT::scalar(1); 2439 const LLT S32 = LLT::scalar(32); 2440 2441 // RCP = URECIP(Den) = 2^32 / Den + e 2442 // e is rounding error. 2443 auto RCP = buildDivRCP(B, Den); 2444 2445 // RCP_LO = mul(RCP, Den) 2446 auto RCP_LO = B.buildMul(S32, RCP, Den); 2447 2448 // RCP_HI = mulhu (RCP, Den) */ 2449 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2450 2451 // NEG_RCP_LO = -RCP_LO 2452 auto Zero = B.buildConstant(S32, 0); 2453 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2454 2455 // ABS_RCP_LO = (RCP_HI == 0 ? 
//                  NEG_RCP_LO : RCP_LO)
  auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
  auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);

  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  auto RCP_A_E = B.buildAdd(S32, RCP, E);

  // RCP_S_E = RCP - E
  auto RCP_S_E = B.buildSub(S32, RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
  auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  auto Quotient = B.buildUMulH(S32, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);

  // Remainder_GE_Den = Remainder >= Den
  auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);

  // Remainder_GE_Zero = Num >= Num_S_Remainder
  auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
                                       Num, Num_S_Remainder);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  auto One = B.buildConstant(S32, 1);
  auto Quotient_A_One = B.buildAdd(S32, Quotient, One);

  // Quotient_S_One = Quotient - 1
  auto Quotient_S_One = B.buildSub(S32, Quotient, One);

  // Div = (Tmp1 ? Quotient_A_One : Quotient)
  auto Div = B.buildSelect(S32, Tmp1, Quotient_A_One, Quotient);

  // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
  if (IsRem) {
    Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);

    // Calculate Rem result:
    auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);

    // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
    auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);

    // Rem = (Remainder_GE_Zero ?
Rem : Remainder_A_Den) 2517 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den); 2518 } else { 2519 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One); 2520 } 2521 } 2522 2523 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2524 MachineRegisterInfo &MRI, 2525 MachineIRBuilder &B) const { 2526 B.setInstr(MI); 2527 const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM; 2528 Register DstReg = MI.getOperand(0).getReg(); 2529 Register Num = MI.getOperand(1).getReg(); 2530 Register Den = MI.getOperand(2).getReg(); 2531 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem); 2532 MI.eraseFromParent(); 2533 return true; 2534 } 2535 2536 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2537 // 2538 // Return lo, hi of result 2539 // 2540 // %cvt.lo = G_UITOFP Val.lo 2541 // %cvt.hi = G_UITOFP Val.hi 2542 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2543 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2544 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2545 // %mul2 = G_FMUL %mul1, 2**(-32) 2546 // %trunc = G_INTRINSIC_TRUNC %mul2 2547 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2548 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2549 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2550 Register Val) { 2551 const LLT S32 = LLT::scalar(32); 2552 auto Unmerge = B.buildUnmerge(S32, Val); 2553 2554 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2555 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2556 2557 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2558 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2559 2560 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2561 auto Mul1 = 2562 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2563 2564 // 2**(-32) 2565 auto Mul2 = 2566 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2567 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2568 2569 // -(2**32) 2570 auto Mad2 = B.buildFMAD(S32, Trunc, 2571 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2572 2573 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2574 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2575 2576 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2577 } 2578 2579 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI, 2580 MachineRegisterInfo &MRI, 2581 MachineIRBuilder &B) const { 2582 B.setInstr(MI); 2583 2584 const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV; 2585 const LLT S32 = LLT::scalar(32); 2586 const LLT S64 = LLT::scalar(64); 2587 const LLT S1 = LLT::scalar(1); 2588 Register Numer = MI.getOperand(1).getReg(); 2589 Register Denom = MI.getOperand(2).getReg(); 2590 Register RcpLo, RcpHi; 2591 2592 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2593 2594 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2595 2596 auto Zero64 = B.buildConstant(S64, 0); 2597 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2598 2599 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2600 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2601 2602 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2603 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2604 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2605 2606 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2607 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2608 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2609 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2610 2611 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2612 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2613 auto UnmergeMulHi2 = 
B.buildUnmerge(S32, MulHi2); 2614 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2615 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2616 2617 auto Zero32 = B.buildConstant(S32, 0); 2618 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2619 auto Add2_HiC = 2620 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2621 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2622 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2623 2624 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2625 Register NumerLo = UnmergeNumer.getReg(0); 2626 Register NumerHi = UnmergeNumer.getReg(1); 2627 2628 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2629 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2630 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2631 Register Mul3_Lo = UnmergeMul3.getReg(0); 2632 Register Mul3_Hi = UnmergeMul3.getReg(1); 2633 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2634 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2635 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2636 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2637 2638 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2639 Register DenomLo = UnmergeDenom.getReg(0); 2640 Register DenomHi = UnmergeDenom.getReg(1); 2641 2642 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2643 auto C1 = B.buildSExt(S32, CmpHi); 2644 2645 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2646 auto C2 = B.buildSExt(S32, CmpLo); 2647 2648 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2649 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2650 2651 // TODO: Here and below portions of the code can be enclosed into if/endif. 2652 // Currently control flow is unconditional and we have 4 selects after 2653 // potential endif to substitute PHIs. 2654 2655 // if C3 != 0 ... 
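  // Both arms of the pseudo-branches are computed unconditionally below
  // (Sub2/Add3 for this condition, Sub3/Add4 for the nested C6 one), and the
  // selects at the end play the role of the PHIs that real control flow
  // would have produced.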
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (IsDiv) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(MI.getOperand(0),
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
  } else {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(MI.getOperand(0),
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty == LLT::scalar(32))
    return legalizeUDIV_UREM32(MI, MRI, B);
  if (Ty == LLT::scalar(64))
    return legalizeUDIV_UREM64(MI, MRI, B);
  return false;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const LLT S32 = LLT::scalar(32);

  const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);

  LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);

  LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
  RHS = B.buildXor(S32, RHS, RHSign).getReg(0);

  Register UDivRem = MRI.createGenericVirtualRegister(S32);
  legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);

  if (IsRem) {
    auto RSign = LHSign; // Remainder sign is the same as LHS
    UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
    B.buildSub(DstReg, UDivRem, RSign);
  } else {
    auto DSign = B.buildXor(S32, LHSign, RHSign);
    UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
    B.buildSub(DstReg, UDivRem, DSign);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
    return legalizeSDIV_SREM32(MI, MRI, B);
  return false;
}

bool
AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2756 MachineRegisterInfo &MRI, 2757 MachineIRBuilder &B) const { 2758 Register Res = MI.getOperand(0).getReg(); 2759 Register LHS = MI.getOperand(1).getReg(); 2760 Register RHS = MI.getOperand(2).getReg(); 2761 2762 uint16_t Flags = MI.getFlags(); 2763 2764 LLT ResTy = MRI.getType(Res); 2765 LLT S32 = LLT::scalar(32); 2766 LLT S64 = LLT::scalar(64); 2767 2768 const MachineFunction &MF = B.getMF(); 2769 bool Unsafe = 2770 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2771 2772 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2773 return false; 2774 2775 if (!Unsafe && ResTy == S32 && 2776 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2777 return false; 2778 2779 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2780 // 1 / x -> RCP(x) 2781 if (CLHS->isExactlyValue(1.0)) { 2782 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2783 .addUse(RHS) 2784 .setMIFlags(Flags); 2785 2786 MI.eraseFromParent(); 2787 return true; 2788 } 2789 2790 // -1 / x -> RCP( FNEG(x) ) 2791 if (CLHS->isExactlyValue(-1.0)) { 2792 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2793 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2794 .addUse(FNeg.getReg(0)) 2795 .setMIFlags(Flags); 2796 2797 MI.eraseFromParent(); 2798 return true; 2799 } 2800 } 2801 2802 // x / y -> x * (1.0 / y) 2803 if (Unsafe) { 2804 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2805 .addUse(RHS) 2806 .setMIFlags(Flags); 2807 B.buildFMul(Res, LHS, RCP, Flags); 2808 2809 MI.eraseFromParent(); 2810 return true; 2811 } 2812 2813 return false; 2814 } 2815 2816 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2817 MachineRegisterInfo &MRI, 2818 MachineIRBuilder &B) const { 2819 B.setInstr(MI); 2820 Register Res = MI.getOperand(0).getReg(); 2821 Register LHS = MI.getOperand(1).getReg(); 2822 Register RHS = MI.getOperand(2).getReg(); 2823 2824 uint16_t Flags = MI.getFlags(); 2825 2826 LLT S16 = LLT::scalar(16); 2827 LLT S32 = LLT::scalar(32); 2828 2829 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2830 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2831 2832 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2833 .addUse(RHSExt.getReg(0)) 2834 .setMIFlags(Flags); 2835 2836 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2837 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2838 2839 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2840 .addUse(RDst.getReg(0)) 2841 .addUse(RHS) 2842 .addUse(LHS) 2843 .setMIFlags(Flags); 2844 2845 MI.eraseFromParent(); 2846 return true; 2847 } 2848 2849 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2850 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2851 static void toggleSPDenormMode(bool Enable, 2852 MachineIRBuilder &B, 2853 const GCNSubtarget &ST, 2854 AMDGPU::SIModeRegisterDefaults Mode) { 2855 // Set SP denorm mode to this value. 2856 unsigned SPDenormMode = 2857 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2858 2859 if (ST.hasDenormModeInst()) { 2860 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2861 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2862 2863 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2864 B.buildInstr(AMDGPU::S_DENORM_MODE) 2865 .addImm(NewDenormModeValue); 2866 2867 } else { 2868 // Select FP32 bit field in mode register. 
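    // The immediate below encodes hwreg(HW_REG_MODE, offset, width) as
    // id | (offset << OFFSET_SHIFT_) | ((width - 1) << WIDTH_M1_SHIFT_);
    // offset 4 with width 2 selects the single-precision denorm control bits
    // of the MODE register.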
2869 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2870 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2871 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2872 2873 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2874 .addImm(SPDenormMode) 2875 .addImm(SPDenormModeBitField); 2876 } 2877 } 2878 2879 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2880 MachineRegisterInfo &MRI, 2881 MachineIRBuilder &B) const { 2882 B.setInstr(MI); 2883 Register Res = MI.getOperand(0).getReg(); 2884 Register LHS = MI.getOperand(1).getReg(); 2885 Register RHS = MI.getOperand(2).getReg(); 2886 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2887 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2888 2889 uint16_t Flags = MI.getFlags(); 2890 2891 LLT S32 = LLT::scalar(32); 2892 LLT S1 = LLT::scalar(1); 2893 2894 auto One = B.buildFConstant(S32, 1.0f); 2895 2896 auto DenominatorScaled = 2897 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2898 .addUse(RHS) 2899 .addUse(LHS) 2900 .addImm(1) 2901 .setMIFlags(Flags); 2902 auto NumeratorScaled = 2903 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2904 .addUse(LHS) 2905 .addUse(RHS) 2906 .addImm(0) 2907 .setMIFlags(Flags); 2908 2909 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2910 .addUse(DenominatorScaled.getReg(0)) 2911 .setMIFlags(Flags); 2912 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2913 2914 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2915 // aren't modeled as reading it. 2916 if (!Mode.allFP32Denormals()) 2917 toggleSPDenormMode(true, B, ST, Mode); 2918 2919 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2920 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2921 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2922 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2923 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2924 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2925 2926 if (!Mode.allFP32Denormals()) 2927 toggleSPDenormMode(false, B, ST, Mode); 2928 2929 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2930 .addUse(Fma4.getReg(0)) 2931 .addUse(Fma1.getReg(0)) 2932 .addUse(Fma3.getReg(0)) 2933 .addUse(NumeratorScaled.getReg(1)) 2934 .setMIFlags(Flags); 2935 2936 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2937 .addUse(Fmas.getReg(0)) 2938 .addUse(RHS) 2939 .addUse(LHS) 2940 .setMIFlags(Flags); 2941 2942 MI.eraseFromParent(); 2943 return true; 2944 } 2945 2946 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2947 MachineRegisterInfo &MRI, 2948 MachineIRBuilder &B) const { 2949 B.setInstr(MI); 2950 Register Res = MI.getOperand(0).getReg(); 2951 Register LHS = MI.getOperand(1).getReg(); 2952 Register RHS = MI.getOperand(2).getReg(); 2953 2954 uint16_t Flags = MI.getFlags(); 2955 2956 LLT S64 = LLT::scalar(64); 2957 LLT S1 = LLT::scalar(1); 2958 2959 auto One = B.buildFConstant(S64, 1.0); 2960 2961 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2962 .addUse(LHS) 2963 .addUse(RHS) 2964 .addImm(1) 2965 .setMIFlags(Flags); 2966 2967 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2968 2969 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2970 .addUse(DivScale0.getReg(0)) 2971 .setMIFlags(Flags); 2972 2973 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2974 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, 
                         Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
std::tie(Arg, RC) 3082 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3083 if (!Arg) 3084 return false; 3085 3086 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3087 if (!loadInputValue(KernargPtrReg, B, Arg)) 3088 return false; 3089 3090 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3091 MI.eraseFromParent(); 3092 return true; 3093 } 3094 3095 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3096 MachineRegisterInfo &MRI, 3097 MachineIRBuilder &B, 3098 unsigned AddrSpace) const { 3099 B.setInstr(MI); 3100 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3101 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3102 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3103 MI.eraseFromParent(); 3104 return true; 3105 } 3106 3107 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3108 // offset (the offset that is included in bounds checking and swizzling, to be 3109 // split between the instruction's voffset and immoffset fields) and soffset 3110 // (the offset that is excluded from bounds checking and swizzling, to go in 3111 // the instruction's soffset field). This function takes the first kind of 3112 // offset and figures out how to split it between voffset and immoffset. 3113 std::tuple<Register, unsigned, unsigned> 3114 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3115 Register OrigOffset) const { 3116 const unsigned MaxImm = 4095; 3117 Register BaseReg; 3118 unsigned TotalConstOffset; 3119 MachineInstr *OffsetDef; 3120 const LLT S32 = LLT::scalar(32); 3121 3122 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3123 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3124 3125 unsigned ImmOffset = TotalConstOffset; 3126 3127 // If the immediate value is too big for the immoffset field, put the value 3128 // and -4096 into the immoffset field so that the value that is copied/added 3129 // for the voffset field is a multiple of 4096, and it stands more chance 3130 // of being CSEd with the copy/add for another similar load/store. 3131 // However, do not do that rounding down to a multiple of 4096 if that is a 3132 // negative number, as it appears to be illegal to have a negative offset 3133 // in the vgpr, even if adding the immediate offset makes it positive. 3134 unsigned Overflow = ImmOffset & ~MaxImm; 3135 ImmOffset -= Overflow; 3136 if ((int32_t)Overflow < 0) { 3137 Overflow += ImmOffset; 3138 ImmOffset = 0; 3139 } 3140 3141 if (Overflow != 0) { 3142 if (!BaseReg) { 3143 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3144 } else { 3145 auto OverflowVal = B.buildConstant(S32, Overflow); 3146 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3147 } 3148 } 3149 3150 if (!BaseReg) 3151 BaseReg = B.buildConstant(S32, 0).getReg(0); 3152 3153 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3154 } 3155 3156 /// Handle register layout difference for f16 images for some subtargets. 
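/// With unpacked D16 VMEM (ST.hasUnpackedD16VMem()) each 16-bit element is
/// expected in the low half of its own 32-bit register, so a <N x s16> data
/// value is unmerged and any-extended into an <N x s32> build_vector below.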
3157 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3158 MachineRegisterInfo &MRI, 3159 Register Reg) const { 3160 if (!ST.hasUnpackedD16VMem()) 3161 return Reg; 3162 3163 const LLT S16 = LLT::scalar(16); 3164 const LLT S32 = LLT::scalar(32); 3165 LLT StoreVT = MRI.getType(Reg); 3166 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3167 3168 auto Unmerge = B.buildUnmerge(S16, Reg); 3169 3170 SmallVector<Register, 4> WideRegs; 3171 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3172 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3173 3174 int NumElts = StoreVT.getNumElements(); 3175 3176 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3177 } 3178 3179 Register AMDGPULegalizerInfo::fixStoreSourceType( 3180 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3181 MachineRegisterInfo *MRI = B.getMRI(); 3182 LLT Ty = MRI->getType(VData); 3183 3184 const LLT S16 = LLT::scalar(16); 3185 3186 // Fixup illegal register types for i8 stores. 3187 if (Ty == LLT::scalar(8) || Ty == S16) { 3188 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3189 return AnyExt; 3190 } 3191 3192 if (Ty.isVector()) { 3193 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3194 if (IsFormat) 3195 return handleD16VData(B, *MRI, VData); 3196 } 3197 } 3198 3199 return VData; 3200 } 3201 3202 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3203 MachineRegisterInfo &MRI, 3204 MachineIRBuilder &B, 3205 bool IsTyped, 3206 bool IsFormat) const { 3207 B.setInstr(MI); 3208 3209 Register VData = MI.getOperand(1).getReg(); 3210 LLT Ty = MRI.getType(VData); 3211 LLT EltTy = Ty.getScalarType(); 3212 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3213 const LLT S32 = LLT::scalar(32); 3214 3215 VData = fixStoreSourceType(B, VData, IsFormat); 3216 Register RSrc = MI.getOperand(2).getReg(); 3217 3218 MachineMemOperand *MMO = *MI.memoperands_begin(); 3219 const int MemSize = MMO->getSize(); 3220 3221 unsigned ImmOffset; 3222 unsigned TotalOffset; 3223 3224 // The typed intrinsics add an immediate after the registers. 3225 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3226 3227 // The struct intrinsic variants add one additional operand over raw. 3228 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3229 Register VIndex; 3230 int OpOffset = 0; 3231 if (HasVIndex) { 3232 VIndex = MI.getOperand(3).getReg(); 3233 OpOffset = 1; 3234 } 3235 3236 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3237 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3238 3239 unsigned Format = 0; 3240 if (IsTyped) { 3241 Format = MI.getOperand(5 + OpOffset).getImm(); 3242 ++OpOffset; 3243 } 3244 3245 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3246 3247 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3248 if (TotalOffset != 0) 3249 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3250 3251 unsigned Opc; 3252 if (IsTyped) { 3253 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3254 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3255 } else if (IsFormat) { 3256 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3257 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3258 } else { 3259 switch (MemSize) { 3260 case 1: 3261 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3262 break; 3263 case 2: 3264 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3265 break; 3266 default: 3267 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3268 break; 3269 } 3270 } 3271 3272 if (!VIndex) 3273 VIndex = B.buildConstant(S32, 0).getReg(0); 3274 3275 auto MIB = B.buildInstr(Opc) 3276 .addUse(VData) // vdata 3277 .addUse(RSrc) // rsrc 3278 .addUse(VIndex) // vindex 3279 .addUse(VOffset) // voffset 3280 .addUse(SOffset) // soffset 3281 .addImm(ImmOffset); // offset(imm) 3282 3283 if (IsTyped) 3284 MIB.addImm(Format); 3285 3286 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3287 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3288 .addMemOperand(MMO); 3289 3290 MI.eraseFromParent(); 3291 return true; 3292 } 3293 3294 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3295 MachineRegisterInfo &MRI, 3296 MachineIRBuilder &B, 3297 bool IsFormat, 3298 bool IsTyped) const { 3299 B.setInstr(MI); 3300 3301 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3302 MachineMemOperand *MMO = *MI.memoperands_begin(); 3303 const int MemSize = MMO->getSize(); 3304 const LLT S32 = LLT::scalar(32); 3305 3306 Register Dst = MI.getOperand(0).getReg(); 3307 Register RSrc = MI.getOperand(2).getReg(); 3308 3309 // The typed intrinsics add an immediate after the registers. 3310 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3311 3312 // The struct intrinsic variants add one additional operand over raw. 3313 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3314 Register VIndex; 3315 int OpOffset = 0; 3316 if (HasVIndex) { 3317 VIndex = MI.getOperand(3).getReg(); 3318 OpOffset = 1; 3319 } 3320 3321 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3322 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3323 3324 unsigned Format = 0; 3325 if (IsTyped) { 3326 Format = MI.getOperand(5 + OpOffset).getImm(); 3327 ++OpOffset; 3328 } 3329 3330 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3331 unsigned ImmOffset; 3332 unsigned TotalOffset; 3333 3334 LLT Ty = MRI.getType(Dst); 3335 LLT EltTy = Ty.getScalarType(); 3336 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3337 const bool Unpacked = ST.hasUnpackedD16VMem(); 3338 3339 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3340 if (TotalOffset != 0) 3341 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3342 3343 unsigned Opc; 3344 3345 if (IsTyped) { 3346 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3347 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3348 } else if (IsFormat) { 3349 Opc = IsD16 ? 
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result for extending loads was widened to s32; truncate back to the
    // original type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  B.setInstr(MI);
  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
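  // Illustrative sketch only, inferred from the operand accesses below:
  // operand 2 of the intrinsic is the pointer and operand 3 is the value, so
  // this builds roughly
  //   %dst = G_AMDGPU_ATOMIC_INC %ptr, %val
  // (or the DEC form), with the memory operands cloned from the call.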
  B.buildInstr(Opc)
    .addDef(MI.getOperand(0).getReg())
    .addUse(MI.getOperand(2).getReg())
    .addUse(MI.getOperand(3).getReg())
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return true;
}

static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}

bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  B.setInstr(MI);

  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
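  // Rough sketch of the incoming operand layout, inferred from the operand
  // indices used below (not taken from the intrinsic definitions):
  //   raw:    {dst, intrin-id, vdata, [cmp,] rsrc, voffset, soffset, aux}
  //   struct: {dst, intrin-id, vdata, [cmp,] rsrc, vindex, voffset, soffset, aux}
  // The extra vindex operand in the struct form is what the operand count
  // check below detects.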
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Turn the set of s16 typed address registers of \p MI, starting at
/// \p AddrIdx, into dword-sized <2 x s16> vectors, appending the packed
/// registers to \p PackedAddrs.
static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                        SmallVectorImpl<Register> &PackedAddrs,
                                        int AddrIdx, int DimIdx, int NumVAddrs,
                                        int NumGradients) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if (I < DimIdx) {
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in
      // 1D, derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
          ((NumGradients / 2) % 2 == 1 &&
           (I == DimIdx + (NumGradients / 2) - 1 ||
            I == DimIdx + NumGradients - 1)) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(I + 1).isReg()) {
        PackedAddrs.push_back(
          B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
            .getReg(0));
      } else {
        PackedAddrs.push_back(
          B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
            .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7
    // FIXME: Missing intermediate sized register classes and instructions.
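    // Illustrative example only: 6 address dwords are padded with two undef
    // dwords and emitted as a single <8 x s32> G_BUILD_VECTOR below.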
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads and stores with 16-bit element data need
/// to be rewritten to use the low half of 32-bit registers, or directly use a
/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
  B.setInstr(MI);

  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
    AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of first address argument
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  // Check for 16 bit addresses and pack if true.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  const bool IsA16 = AddrTy == S16;

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  const int DMaskIdx = BaseOpcode->Atomic ? -1 :
    getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // An error flag result is still expected since TFE is on, even though dmask
  // is 0. Force dmask to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
        AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    // FIXME: This isn't the cleanest way to handle this, but it's the easiest
    // option the current infrastructure gives. We really should be changing
    // the base intrinsic opcode, but the current searchable tables only give
    // us the final MI opcode. Eliminate the register here, and track with an
    // immediate 0 so the final selection will know to do the opcode change.
    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // Optimize _mip away when 'lod' is zero
  if (const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
        AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator
  // that MIMG addresses should be placed contiguously when it is possible to
  // do so, so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16) {
    // FIXME: this feature is missing from gfx10. When that is fixed, this
    // check should be introduced.
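    // Illustrative example only: for a 2D sample with s16 coordinates, the
    // separate u/v address operands are rewritten by
    // packImageA16AddressToDwords into a single
    //   %uv:_(<2 x s16>) = G_BUILD_VECTOR %u:_(s16), %v:_(s16)
    // while a trailing odd coordinate is paired with an undef s16 instead.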
    if (!ST.hasR128A16() && !ST.hasGFX10A16())
      return false;

    if (NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
                                  NumGradients);

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const int NumPacked = PackedRegs.size();
      for (int I = 0; I != NumVAddrs; ++I) {
        MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I < NumPacked)
          SrcOp.setReg(PackedRegs[I]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else if (!UseNSA && NumVAddrs > 1) {
    convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
  }

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Register RepackedReg = handleD16VData(B, *MRI, VData);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
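  // Illustrative example only: a d16 load of <4 x s16> with TFE on a
  // packed-D16 subtarget keeps RoundedTy = <4 x s16> (two dwords of data), so
  // the instruction is rebuilt with TFETy = <3 x s32>, where the extra dword
  // holds the TFE status result that is split back out below.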
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
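  // Illustrative walk-through of the <3 x s16> case below: the repacked
  // <2 x s16> pieces are padded with one extra undef <2 x s16>, concatenated
  // into a <6 x s16>, and then unmerged into two <3 x s16> halves, of which
  // only the first (the real result) is kept.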
  const LLT V3S16 = LLT::vector(3, 16);
  if (Ty == V3S16) {
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);
    B.setInstr(MI);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is a non-HSA target, or the trap handler is disabled, just insert
  // an s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass queue pointer to trap handler as input, and insert trap instruction
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    const ArgDescriptor *Arg =
        getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
    if (!Arg)
      return false;
    MachineRegisterInfo &MRI = *B.getMRI();
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, Arg))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is a non-HSA target, or the trap handler is disabled, report a
  // warning instead of trapping.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec manipulation and branch pseudos.
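  // Sketch of the control flow lowering below (illustrative only): the
  // G_BRCOND consuming the result of llvm.amdgcn.if is rewritten to
  //   %def = SI_IF %use, %bb.target
  // with %def/%use constrained to the wave mask register class, and any
  // trailing unconditional G_BR is retargeted so the CFG stays consistent.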
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrTarget);

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      B.setInstr(MI);
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}