1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the Machinelegalizer class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #if defined(_MSC_VER) || defined(__MINGW32__) 15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI 16 // from the Visual C++ cmath / math.h headers: 17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 18 #define _USE_MATH_DEFINES 19 #endif 20 21 #include "AMDGPULegalizerInfo.h" 22 23 #include "AMDGPU.h" 24 #include "AMDGPUGlobalISelUtils.h" 25 #include "AMDGPUTargetMachine.h" 26 #include "SIMachineFunctionInfo.h" 27 #include "llvm/ADT/ScopeExit.h" 28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 30 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 31 #include "llvm/CodeGen/TargetOpcodes.h" 32 #include "llvm/CodeGen/ValueTypes.h" 33 #include "llvm/IR/DerivedTypes.h" 34 #include "llvm/IR/DiagnosticInfo.h" 35 #include "llvm/IR/Type.h" 36 #include "llvm/Support/Debug.h" 37 38 #define DEBUG_TYPE "amdgpu-legalinfo" 39 40 using namespace llvm; 41 using namespace LegalizeActions; 42 using namespace LegalizeMutations; 43 using namespace LegalityPredicates; 44 using namespace MIPatternMatch; 45 46 // Round the number of elements to the next power of two elements 47 static LLT getPow2VectorType(LLT Ty) { 48 unsigned NElts = Ty.getNumElements(); 49 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 50 return 
Ty.changeNumElements(Pow2NElts); 51 } 52 53 // Round the number of bits to the next power of two bits 54 static LLT getPow2ScalarType(LLT Ty) { 55 unsigned Bits = Ty.getSizeInBits(); 56 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 57 return LLT::scalar(Pow2Bits); 58 } 59 60 static LegalityPredicate isMultiple32(unsigned TypeIdx, 61 unsigned MaxSize = 1024) { 62 return [=](const LegalityQuery &Query) { 63 const LLT Ty = Query.Types[TypeIdx]; 64 const LLT EltTy = Ty.getScalarType(); 65 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; 66 }; 67 } 68 69 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 70 return [=](const LegalityQuery &Query) { 71 const LLT Ty = Query.Types[TypeIdx]; 72 return Ty.isVector() && 73 Ty.getNumElements() % 2 != 0 && 74 Ty.getElementType().getSizeInBits() < 32 && 75 Ty.getSizeInBits() % 32 != 0; 76 }; 77 } 78 79 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 80 return [=](const LegalityQuery &Query) { 81 const LLT Ty = Query.Types[TypeIdx]; 82 const LLT EltTy = Ty.getScalarType(); 83 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 84 }; 85 } 86 87 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 88 return [=](const LegalityQuery &Query) { 89 const LLT Ty = Query.Types[TypeIdx]; 90 const LLT EltTy = Ty.getElementType(); 91 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 92 }; 93 } 94 95 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 96 return [=](const LegalityQuery &Query) { 97 const LLT Ty = Query.Types[TypeIdx]; 98 const LLT EltTy = Ty.getElementType(); 99 unsigned Size = Ty.getSizeInBits(); 100 unsigned Pieces = (Size + 63) / 64; 101 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 102 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 103 }; 104 } 105 106 // Increase the number of vector elements to reach the next multiple of 32-bit 107 // type. 
108 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 109 return [=](const LegalityQuery &Query) { 110 const LLT Ty = Query.Types[TypeIdx]; 111 112 const LLT EltTy = Ty.getElementType(); 113 const int Size = Ty.getSizeInBits(); 114 const int EltSize = EltTy.getSizeInBits(); 115 const int NextMul32 = (Size + 31) / 32; 116 117 assert(EltSize < 32); 118 119 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 120 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 121 }; 122 } 123 124 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 125 return [=](const LegalityQuery &Query) { 126 const LLT QueryTy = Query.Types[TypeIdx]; 127 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 128 }; 129 } 130 131 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 132 return [=](const LegalityQuery &Query) { 133 const LLT QueryTy = Query.Types[TypeIdx]; 134 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 135 }; 136 } 137 138 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 139 return [=](const LegalityQuery &Query) { 140 const LLT QueryTy = Query.Types[TypeIdx]; 141 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 142 }; 143 } 144 145 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 146 // v2s16. 
147 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 148 return [=](const LegalityQuery &Query) { 149 const LLT Ty = Query.Types[TypeIdx]; 150 if (Ty.isVector()) { 151 const int EltSize = Ty.getElementType().getSizeInBits(); 152 return EltSize == 32 || EltSize == 64 || 153 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 154 EltSize == 128 || EltSize == 256; 155 } 156 157 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 158 }; 159 } 160 161 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 162 return [=](const LegalityQuery &Query) { 163 const LLT QueryTy = Query.Types[TypeIdx]; 164 if (!QueryTy.isVector()) 165 return false; 166 const LLT EltTy = QueryTy.getElementType(); 167 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 168 }; 169 } 170 171 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 172 return [=](const LegalityQuery &Query) { 173 const LLT Ty = Query.Types[TypeIdx]; 174 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 175 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 176 }; 177 } 178 179 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 180 const GCNTargetMachine &TM) 181 : ST(ST_) { 182 using namespace TargetOpcode; 183 184 auto GetAddrSpacePtr = [&TM](unsigned AS) { 185 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 186 }; 187 188 const LLT S1 = LLT::scalar(1); 189 const LLT S16 = LLT::scalar(16); 190 const LLT S32 = LLT::scalar(32); 191 const LLT S64 = LLT::scalar(64); 192 const LLT S128 = LLT::scalar(128); 193 const LLT S256 = LLT::scalar(256); 194 const LLT S512 = LLT::scalar(512); 195 const LLT S1024 = LLT::scalar(1024); 196 197 const LLT V2S16 = LLT::vector(2, 16); 198 const LLT V4S16 = LLT::vector(4, 16); 199 200 const LLT V2S32 = LLT::vector(2, 32); 201 const LLT V3S32 = LLT::vector(3, 32); 202 const LLT V4S32 = LLT::vector(4, 32); 203 const LLT V5S32 = LLT::vector(5, 32); 204 const LLT V6S32 = LLT::vector(6, 32); 205 const LLT V7S32 = 
LLT::vector(7, 32); 206 const LLT V8S32 = LLT::vector(8, 32); 207 const LLT V9S32 = LLT::vector(9, 32); 208 const LLT V10S32 = LLT::vector(10, 32); 209 const LLT V11S32 = LLT::vector(11, 32); 210 const LLT V12S32 = LLT::vector(12, 32); 211 const LLT V13S32 = LLT::vector(13, 32); 212 const LLT V14S32 = LLT::vector(14, 32); 213 const LLT V15S32 = LLT::vector(15, 32); 214 const LLT V16S32 = LLT::vector(16, 32); 215 const LLT V32S32 = LLT::vector(32, 32); 216 217 const LLT V2S64 = LLT::vector(2, 64); 218 const LLT V3S64 = LLT::vector(3, 64); 219 const LLT V4S64 = LLT::vector(4, 64); 220 const LLT V5S64 = LLT::vector(5, 64); 221 const LLT V6S64 = LLT::vector(6, 64); 222 const LLT V7S64 = LLT::vector(7, 64); 223 const LLT V8S64 = LLT::vector(8, 64); 224 const LLT V16S64 = LLT::vector(16, 64); 225 226 std::initializer_list<LLT> AllS32Vectors = 227 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 228 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 229 std::initializer_list<LLT> AllS64Vectors = 230 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 231 232 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 233 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 234 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 235 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 236 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 237 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 238 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 239 240 const LLT CodePtr = FlatPtr; 241 242 const std::initializer_list<LLT> AddrSpaces64 = { 243 GlobalPtr, ConstantPtr, FlatPtr 244 }; 245 246 const std::initializer_list<LLT> AddrSpaces32 = { 247 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 248 }; 249 250 const std::initializer_list<LLT> FPTypesBase = { 251 S32, S64 252 }; 253 254 const std::initializer_list<LLT> FPTypes16 = { 255 S32, S64, S16 256 }; 
257 258 const std::initializer_list<LLT> FPTypesPK16 = { 259 S32, S64, S16, V2S16 260 }; 261 262 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 263 264 setAction({G_BRCOND, S1}, Legal); // VCC branches 265 setAction({G_BRCOND, S32}, Legal); // SCC branches 266 267 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 268 // elements for v3s16 269 getActionDefinitionsBuilder(G_PHI) 270 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 271 .legalFor(AllS32Vectors) 272 .legalFor(AllS64Vectors) 273 .legalFor(AddrSpaces64) 274 .legalFor(AddrSpaces32) 275 .clampScalar(0, S32, S256) 276 .widenScalarToNextPow2(0, 32) 277 .clampMaxNumElements(0, S32, 16) 278 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 279 .legalIf(isPointer(0)); 280 281 if (ST.hasVOP3PInsts()) { 282 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 283 .legalFor({S32, S16, V2S16}) 284 .clampScalar(0, S16, S32) 285 .clampMaxNumElements(0, S16, 2) 286 .scalarize(0) 287 .widenScalarToNextPow2(0, 32); 288 } else if (ST.has16BitInsts()) { 289 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 290 .legalFor({S32, S16}) 291 .clampScalar(0, S16, S32) 292 .scalarize(0) 293 .widenScalarToNextPow2(0, 32); 294 } else { 295 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 296 .legalFor({S32}) 297 .clampScalar(0, S32, S32) 298 .scalarize(0); 299 } 300 301 // FIXME: Not really legal. Placeholder for custom lowering. 302 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 303 .customFor({S32, S64}) 304 .clampScalar(0, S32, S64) 305 .widenScalarToNextPow2(0, 32) 306 .scalarize(0); 307 308 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 309 .legalFor({S32}) 310 .clampScalar(0, S32, S32) 311 .scalarize(0); 312 313 // Report legal for any types we can handle anywhere. For the cases only legal 314 // on the SALU, RegBankSelect will be able to re-legalize. 
315 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 316 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 317 .clampScalar(0, S32, S64) 318 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 319 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 320 .widenScalarToNextPow2(0) 321 .scalarize(0); 322 323 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 324 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 325 .legalFor({{S32, S1}, {S32, S32}}) 326 .minScalar(0, S32) 327 // TODO: .scalarize(0) 328 .lower(); 329 330 getActionDefinitionsBuilder(G_BITCAST) 331 // Don't worry about the size constraint. 332 .legalIf(all(isRegisterType(0), isRegisterType(1))) 333 .lower(); 334 335 336 getActionDefinitionsBuilder(G_CONSTANT) 337 .legalFor({S1, S32, S64, S16, GlobalPtr, 338 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 339 .clampScalar(0, S32, S64) 340 .widenScalarToNextPow2(0) 341 .legalIf(isPointer(0)); 342 343 getActionDefinitionsBuilder(G_FCONSTANT) 344 .legalFor({S32, S64, S16}) 345 .clampScalar(0, S16, S64); 346 347 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 348 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 349 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 350 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 351 .clampScalarOrElt(0, S32, S1024) 352 .legalIf(isMultiple32(0)) 353 .widenScalarToNextPow2(0, 32) 354 .clampMaxNumElements(0, S32, 16); 355 356 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 357 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 358 .unsupportedFor({PrivatePtr}) 359 .custom(); 360 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 361 362 auto &FPOpActions = getActionDefinitionsBuilder( 363 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 364 .legalFor({S32, S64}); 365 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 366 .customFor({S32, S64}); 367 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 368 .customFor({S32, S64}); 369 370 if (ST.has16BitInsts()) { 371 if (ST.hasVOP3PInsts()) 372 
FPOpActions.legalFor({S16, V2S16}); 373 else 374 FPOpActions.legalFor({S16}); 375 376 TrigActions.customFor({S16}); 377 FDIVActions.customFor({S16}); 378 } 379 380 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 381 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 382 383 if (ST.hasVOP3PInsts()) { 384 MinNumMaxNum.customFor(FPTypesPK16) 385 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 386 .clampMaxNumElements(0, S16, 2) 387 .clampScalar(0, S16, S64) 388 .scalarize(0); 389 } else if (ST.has16BitInsts()) { 390 MinNumMaxNum.customFor(FPTypes16) 391 .clampScalar(0, S16, S64) 392 .scalarize(0); 393 } else { 394 MinNumMaxNum.customFor(FPTypesBase) 395 .clampScalar(0, S32, S64) 396 .scalarize(0); 397 } 398 399 if (ST.hasVOP3PInsts()) 400 FPOpActions.clampMaxNumElements(0, S16, 2); 401 402 FPOpActions 403 .scalarize(0) 404 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 405 406 TrigActions 407 .scalarize(0) 408 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 409 410 FDIVActions 411 .scalarize(0) 412 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 413 414 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 415 .legalFor(FPTypesPK16) 416 .clampMaxNumElements(0, S16, 2) 417 .scalarize(0) 418 .clampScalar(0, S16, S64); 419 420 if (ST.has16BitInsts()) { 421 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 422 .legalFor({S32, S64, S16}) 423 .scalarize(0) 424 .clampScalar(0, S16, S64); 425 } else { 426 getActionDefinitionsBuilder(G_FSQRT) 427 .legalFor({S32, S64}) 428 .scalarize(0) 429 .clampScalar(0, S32, S64); 430 431 if (ST.hasFractBug()) { 432 getActionDefinitionsBuilder(G_FFLOOR) 433 .customFor({S64}) 434 .legalFor({S32, S64}) 435 .scalarize(0) 436 .clampScalar(0, S32, S64); 437 } else { 438 getActionDefinitionsBuilder(G_FFLOOR) 439 .legalFor({S32, S64}) 440 .scalarize(0) 441 .clampScalar(0, S32, S64); 442 } 443 } 444 445 getActionDefinitionsBuilder(G_FPTRUNC) 446 .legalFor({{S32, S64}, {S16, S32}}) 447 .scalarize(0) 448 .lower(); 449 450 getActionDefinitionsBuilder(G_FPEXT) 451 .legalFor({{S64, S32}, {S32, S16}}) 452 .lowerFor({{S64, S16}}) // FIXME: Implement 453 .scalarize(0); 454 455 getActionDefinitionsBuilder(G_FSUB) 456 // Use actual fsub instruction 457 .legalFor({S32}) 458 // Must use fadd + fneg 459 .lowerFor({S64, S16, V2S16}) 460 .scalarize(0) 461 .clampScalar(0, S32, S64); 462 463 // Whether this is legal depends on the floating point mode for the function. 464 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 465 if (ST.hasMadF16()) 466 FMad.customFor({S32, S16}); 467 else 468 FMad.customFor({S32}); 469 FMad.scalarize(0) 470 .lower(); 471 472 // TODO: Do we need to clamp maximum bitwidth? 473 getActionDefinitionsBuilder(G_TRUNC) 474 .legalIf(isScalar(0)) 475 .legalFor({{V2S16, V2S32}}) 476 .clampMaxNumElements(0, S16, 2) 477 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 478 // situations (like an invalid implicit use), we don't want to infinite loop 479 // in the legalizer. 
480 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 481 .alwaysLegal(); 482 483 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 484 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 485 {S32, S1}, {S64, S1}, {S16, S1}}) 486 .scalarize(0) 487 .clampScalar(0, S32, S64) 488 .widenScalarToNextPow2(1, 32); 489 490 // TODO: Split s1->s64 during regbankselect for VALU. 491 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 492 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 493 .lowerFor({{S32, S64}}) 494 .lowerIf(typeIs(1, S1)) 495 .customFor({{S64, S64}}); 496 if (ST.has16BitInsts()) 497 IToFP.legalFor({{S16, S16}}); 498 IToFP.clampScalar(1, S32, S64) 499 .scalarize(0) 500 .widenScalarToNextPow2(1); 501 502 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 503 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 504 .customFor({{S64, S64}}); 505 if (ST.has16BitInsts()) 506 FPToI.legalFor({{S16, S16}}); 507 else 508 FPToI.minScalar(1, S32); 509 510 FPToI.minScalar(0, S32) 511 .scalarize(0) 512 .lower(); 513 514 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 515 .scalarize(0) 516 .lower(); 517 518 if (ST.has16BitInsts()) { 519 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 520 .legalFor({S16, S32, S64}) 521 .clampScalar(0, S16, S64) 522 .scalarize(0); 523 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 524 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 525 .legalFor({S32, S64}) 526 .clampScalar(0, S32, S64) 527 .scalarize(0); 528 } else { 529 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 530 .legalFor({S32}) 531 .customFor({S64}) 532 .clampScalar(0, S32, S64) 533 .scalarize(0); 534 } 535 536 // FIXME: Clamp offset operand. 
537 getActionDefinitionsBuilder(G_PTR_ADD) 538 .legalIf(isPointer(0)) 539 .scalarize(0); 540 541 getActionDefinitionsBuilder(G_PTRMASK) 542 .legalIf(typeInSet(1, {S64, S32})) 543 .minScalar(1, S32) 544 .maxScalarIf(sizeIs(0, 32), 1, S32) 545 .maxScalarIf(sizeIs(0, 64), 1, S64) 546 .scalarize(0); 547 548 auto &CmpBuilder = 549 getActionDefinitionsBuilder(G_ICMP) 550 // The compare output type differs based on the register bank of the output, 551 // so make both s1 and s32 legal. 552 // 553 // Scalar compares producing output in scc will be promoted to s32, as that 554 // is the allocatable register type that will be needed for the copy from 555 // scc. This will be promoted during RegBankSelect, and we assume something 556 // before that won't try to use s32 result types. 557 // 558 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 559 // bank. 560 .legalForCartesianProduct( 561 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 562 .legalForCartesianProduct( 563 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 564 if (ST.has16BitInsts()) { 565 CmpBuilder.legalFor({{S1, S16}}); 566 } 567 568 CmpBuilder 569 .widenScalarToNextPow2(1) 570 .clampScalar(1, S32, S64) 571 .scalarize(0) 572 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 573 574 getActionDefinitionsBuilder(G_FCMP) 575 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 576 .widenScalarToNextPow2(1) 577 .clampScalar(1, S32, S64) 578 .scalarize(0); 579 580 // FIXME: fpow has a selection pattern that should move to custom lowering. 
581 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 582 if (ST.has16BitInsts()) 583 Exp2Ops.legalFor({S32, S16}); 584 else 585 Exp2Ops.legalFor({S32}); 586 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 587 Exp2Ops.scalarize(0); 588 589 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 590 if (ST.has16BitInsts()) 591 ExpOps.customFor({{S32}, {S16}}); 592 else 593 ExpOps.customFor({S32}); 594 ExpOps.clampScalar(0, MinScalarFPTy, S32) 595 .scalarize(0); 596 597 // The 64-bit versions produce 32-bit results, but only on the SALU. 598 getActionDefinitionsBuilder(G_CTPOP) 599 .legalFor({{S32, S32}, {S32, S64}}) 600 .clampScalar(0, S32, S32) 601 .clampScalar(1, S32, S64) 602 .scalarize(0) 603 .widenScalarToNextPow2(0, 32) 604 .widenScalarToNextPow2(1, 32); 605 606 // The hardware instructions return a different result on 0 than the generic 607 // instructions expect. The hardware produces -1, but these produce the 608 // bitwidth. 609 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 610 .scalarize(0) 611 .clampScalar(0, S32, S32) 612 .clampScalar(1, S32, S64) 613 .widenScalarToNextPow2(0, 32) 614 .widenScalarToNextPow2(1, 32) 615 .lower(); 616 617 // The 64-bit versions produce 32-bit results, but only on the SALU. 618 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 619 .legalFor({{S32, S32}, {S32, S64}}) 620 .clampScalar(0, S32, S32) 621 .clampScalar(1, S32, S64) 622 .scalarize(0) 623 .widenScalarToNextPow2(0, 32) 624 .widenScalarToNextPow2(1, 32); 625 626 getActionDefinitionsBuilder(G_BITREVERSE) 627 .legalFor({S32}) 628 .clampScalar(0, S32, S32) 629 .scalarize(0); 630 631 if (ST.has16BitInsts()) { 632 getActionDefinitionsBuilder(G_BSWAP) 633 .legalFor({S16, S32, V2S16}) 634 .clampMaxNumElements(0, S16, 2) 635 // FIXME: Fixing non-power-of-2 before clamp is workaround for 636 // narrowScalar limitation. 
637 .widenScalarToNextPow2(0) 638 .clampScalar(0, S16, S32) 639 .scalarize(0); 640 641 if (ST.hasVOP3PInsts()) { 642 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 643 .legalFor({S32, S16, V2S16}) 644 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 645 .clampMaxNumElements(0, S16, 2) 646 .minScalar(0, S16) 647 .widenScalarToNextPow2(0) 648 .scalarize(0) 649 .lower(); 650 } else { 651 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 652 .legalFor({S32, S16}) 653 .widenScalarToNextPow2(0) 654 .minScalar(0, S16) 655 .scalarize(0) 656 .lower(); 657 } 658 } else { 659 // TODO: Should have same legality without v_perm_b32 660 getActionDefinitionsBuilder(G_BSWAP) 661 .legalFor({S32}) 662 .lowerIf(scalarNarrowerThan(0, 32)) 663 // FIXME: Fixing non-power-of-2 before clamp is workaround for 664 // narrowScalar limitation. 665 .widenScalarToNextPow2(0) 666 .maxScalar(0, S32) 667 .scalarize(0) 668 .lower(); 669 670 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 671 .legalFor({S32}) 672 .minScalar(0, S32) 673 .widenScalarToNextPow2(0) 674 .scalarize(0) 675 .lower(); 676 } 677 678 getActionDefinitionsBuilder(G_INTTOPTR) 679 // List the common cases 680 .legalForCartesianProduct(AddrSpaces64, {S64}) 681 .legalForCartesianProduct(AddrSpaces32, {S32}) 682 .scalarize(0) 683 // Accept any address space as long as the size matches 684 .legalIf(sameSize(0, 1)) 685 .widenScalarIf(smallerThan(1, 0), 686 [](const LegalityQuery &Query) { 687 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 688 }) 689 .narrowScalarIf(largerThan(1, 0), 690 [](const LegalityQuery &Query) { 691 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 692 }); 693 694 getActionDefinitionsBuilder(G_PTRTOINT) 695 // List the common cases 696 .legalForCartesianProduct(AddrSpaces64, {S64}) 697 .legalForCartesianProduct(AddrSpaces32, {S32}) 698 .scalarize(0) 699 // Accept any address space as long as the size matches 700 
.legalIf(sameSize(0, 1)) 701 .widenScalarIf(smallerThan(0, 1), 702 [](const LegalityQuery &Query) { 703 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 704 }) 705 .narrowScalarIf( 706 largerThan(0, 1), 707 [](const LegalityQuery &Query) { 708 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 709 }); 710 711 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 712 .scalarize(0) 713 .custom(); 714 715 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 716 // handle some operations by just promoting the register during 717 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 718 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 719 switch (AS) { 720 // FIXME: Private element size. 721 case AMDGPUAS::PRIVATE_ADDRESS: 722 return 32; 723 // FIXME: Check subtarget 724 case AMDGPUAS::LOCAL_ADDRESS: 725 return ST.useDS128() ? 128 : 64; 726 727 // Treat constant and global as identical. SMRD loads are sometimes usable 728 // for global loads (ideally constant address space should be eliminated) 729 // depending on the context. Legality cannot be context dependent, but 730 // RegBankSelect can split the load as necessary depending on the pointer 731 // register bank/uniformity and if the memory is invariant or not written in 732 // a kernel. 733 case AMDGPUAS::CONSTANT_ADDRESS: 734 case AMDGPUAS::GLOBAL_ADDRESS: 735 return IsLoad ? 512 : 128; 736 default: 737 return 128; 738 } 739 }; 740 741 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 742 bool IsLoad) -> bool { 743 const LLT DstTy = Query.Types[0]; 744 745 // Split vector extloads. 
746 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 747 unsigned Align = Query.MMODescrs[0].AlignInBits; 748 749 if (MemSize < DstTy.getSizeInBits()) 750 MemSize = std::max(MemSize, Align); 751 752 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 753 return true; 754 755 const LLT PtrTy = Query.Types[1]; 756 unsigned AS = PtrTy.getAddressSpace(); 757 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 758 return true; 759 760 // Catch weird sized loads that don't evenly divide into the access sizes 761 // TODO: May be able to widen depending on alignment etc. 762 unsigned NumRegs = (MemSize + 31) / 32; 763 if (NumRegs == 3) { 764 if (!ST.hasDwordx3LoadStores()) 765 return true; 766 } else { 767 // If the alignment allows, these should have been widened. 768 if (!isPowerOf2_32(NumRegs)) 769 return true; 770 } 771 772 if (Align < MemSize) { 773 const SITargetLowering *TLI = ST.getTargetLowering(); 774 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 775 } 776 777 return false; 778 }; 779 780 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool { 781 unsigned Size = Query.Types[0].getSizeInBits(); 782 if (isPowerOf2_32(Size)) 783 return false; 784 785 if (Size == 96 && ST.hasDwordx3LoadStores()) 786 return false; 787 788 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 789 if (Size >= maxSizeForAddrSpace(AddrSpace, true)) 790 return false; 791 792 unsigned Align = Query.MMODescrs[0].AlignInBits; 793 unsigned RoundedSize = NextPowerOf2(Size); 794 return (Align >= RoundedSize); 795 }; 796 797 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 798 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 799 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 800 801 // TODO: Refine based on subtargets which support unaligned access or 128-bit 802 // LDS 803 // TODO: Unsupported flat for SI. 
804 805 for (unsigned Op : {G_LOAD, G_STORE}) { 806 const bool IsStore = Op == G_STORE; 807 808 auto &Actions = getActionDefinitionsBuilder(Op); 809 // Whitelist the common cases. 810 // TODO: Loads to s16 on gfx9 811 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 812 {V2S32, GlobalPtr, 64, GlobalAlign32}, 813 {V4S32, GlobalPtr, 128, GlobalAlign32}, 814 {S128, GlobalPtr, 128, GlobalAlign32}, 815 {S64, GlobalPtr, 64, GlobalAlign32}, 816 {V2S64, GlobalPtr, 128, GlobalAlign32}, 817 {V2S16, GlobalPtr, 32, GlobalAlign32}, 818 {S32, GlobalPtr, 8, GlobalAlign8}, 819 {S32, GlobalPtr, 16, GlobalAlign16}, 820 821 {S32, LocalPtr, 32, 32}, 822 {S64, LocalPtr, 64, 32}, 823 {V2S32, LocalPtr, 64, 32}, 824 {S32, LocalPtr, 8, 8}, 825 {S32, LocalPtr, 16, 16}, 826 {V2S16, LocalPtr, 32, 32}, 827 828 {S32, PrivatePtr, 32, 32}, 829 {S32, PrivatePtr, 8, 8}, 830 {S32, PrivatePtr, 16, 16}, 831 {V2S16, PrivatePtr, 32, 32}, 832 833 {S32, FlatPtr, 32, GlobalAlign32}, 834 {S32, FlatPtr, 16, GlobalAlign16}, 835 {S32, FlatPtr, 8, GlobalAlign8}, 836 {V2S16, FlatPtr, 32, GlobalAlign32}, 837 838 {S32, ConstantPtr, 32, GlobalAlign32}, 839 {V2S32, ConstantPtr, 64, GlobalAlign32}, 840 {V4S32, ConstantPtr, 128, GlobalAlign32}, 841 {S64, ConstantPtr, 64, GlobalAlign32}, 842 {S128, ConstantPtr, 128, GlobalAlign32}, 843 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 844 Actions 845 .customIf(typeIs(1, Constant32Ptr)) 846 // Widen suitably aligned loads by loading extra elements. 
847 .moreElementsIf([=](const LegalityQuery &Query) { 848 const LLT Ty = Query.Types[0]; 849 return Op == G_LOAD && Ty.isVector() && 850 shouldWidenLoadResult(Query); 851 }, moreElementsToNextPow2(0)) 852 .widenScalarIf([=](const LegalityQuery &Query) { 853 const LLT Ty = Query.Types[0]; 854 return Op == G_LOAD && !Ty.isVector() && 855 shouldWidenLoadResult(Query); 856 }, widenScalarOrEltToNextPow2(0)) 857 .narrowScalarIf( 858 [=](const LegalityQuery &Query) -> bool { 859 return !Query.Types[0].isVector() && 860 needToSplitMemOp(Query, Op == G_LOAD); 861 }, 862 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 863 const LLT DstTy = Query.Types[0]; 864 const LLT PtrTy = Query.Types[1]; 865 866 const unsigned DstSize = DstTy.getSizeInBits(); 867 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 868 869 // Split extloads. 870 if (DstSize > MemSize) 871 return std::make_pair(0, LLT::scalar(MemSize)); 872 873 if (!isPowerOf2_32(DstSize)) { 874 // We're probably decomposing an odd sized store. Try to split 875 // to the widest type. TODO: Account for alignment. As-is it 876 // should be OK, since the new parts will be further legalized. 877 unsigned FloorSize = PowerOf2Floor(DstSize); 878 return std::make_pair(0, LLT::scalar(FloorSize)); 879 } 880 881 if (DstSize > 32 && (DstSize % 32 != 0)) { 882 // FIXME: Need a way to specify non-extload of larger size if 883 // suitably aligned. 
884 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 885 } 886 887 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 888 Op == G_LOAD); 889 if (MemSize > MaxSize) 890 return std::make_pair(0, LLT::scalar(MaxSize)); 891 892 unsigned Align = Query.MMODescrs[0].AlignInBits; 893 return std::make_pair(0, LLT::scalar(Align)); 894 }) 895 .fewerElementsIf( 896 [=](const LegalityQuery &Query) -> bool { 897 return Query.Types[0].isVector() && 898 needToSplitMemOp(Query, Op == G_LOAD); 899 }, 900 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 901 const LLT DstTy = Query.Types[0]; 902 const LLT PtrTy = Query.Types[1]; 903 904 LLT EltTy = DstTy.getElementType(); 905 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 906 Op == G_LOAD); 907 908 // FIXME: Handle widened to power of 2 results better. This ends 909 // up scalarizing. 910 // FIXME: 3 element stores scalarized on SI 911 912 // Split if it's too large for the address space. 913 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 914 unsigned NumElts = DstTy.getNumElements(); 915 unsigned EltSize = EltTy.getSizeInBits(); 916 917 if (MaxSize % EltSize == 0) { 918 return std::make_pair( 919 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 920 } 921 922 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 923 924 // FIXME: Refine when odd breakdowns handled 925 // The scalars will need to be re-legalized. 926 if (NumPieces == 1 || NumPieces >= NumElts || 927 NumElts % NumPieces != 0) 928 return std::make_pair(0, EltTy); 929 930 return std::make_pair(0, 931 LLT::vector(NumElts / NumPieces, EltTy)); 932 } 933 934 // FIXME: We could probably handle weird extending loads better. 
935 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 936 if (DstTy.getSizeInBits() > MemSize) 937 return std::make_pair(0, EltTy); 938 939 unsigned EltSize = EltTy.getSizeInBits(); 940 unsigned DstSize = DstTy.getSizeInBits(); 941 if (!isPowerOf2_32(DstSize)) { 942 // We're probably decomposing an odd sized store. Try to split 943 // to the widest type. TODO: Account for alignment. As-is it 944 // should be OK, since the new parts will be further legalized. 945 unsigned FloorSize = PowerOf2Floor(DstSize); 946 return std::make_pair( 947 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 948 } 949 950 // Need to split because of alignment. 951 unsigned Align = Query.MMODescrs[0].AlignInBits; 952 if (EltSize > Align && 953 (EltSize / Align < DstTy.getNumElements())) { 954 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 955 } 956 957 // May need relegalization for the scalars. 958 return std::make_pair(0, EltTy); 959 }) 960 .minScalar(0, S32); 961 962 if (IsStore) 963 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 964 965 // TODO: Need a bitcast lower option? 966 Actions 967 .legalIf([=](const LegalityQuery &Query) { 968 const LLT Ty0 = Query.Types[0]; 969 unsigned Size = Ty0.getSizeInBits(); 970 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 971 unsigned Align = Query.MMODescrs[0].AlignInBits; 972 973 // FIXME: Widening store from alignment not valid. 974 if (MemSize < Size) 975 MemSize = std::max(MemSize, Align); 976 977 // No extending vector loads. 
978 if (Size > MemSize && Ty0.isVector()) 979 return false; 980 981 switch (MemSize) { 982 case 8: 983 case 16: 984 return Size == 32; 985 case 32: 986 case 64: 987 case 128: 988 return true; 989 case 96: 990 return ST.hasDwordx3LoadStores(); 991 case 256: 992 case 512: 993 return true; 994 default: 995 return false; 996 } 997 }) 998 .widenScalarToNextPow2(0) 999 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1000 } 1001 1002 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1003 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1004 {S32, GlobalPtr, 16, 2 * 8}, 1005 {S32, LocalPtr, 8, 8}, 1006 {S32, LocalPtr, 16, 16}, 1007 {S32, PrivatePtr, 8, 8}, 1008 {S32, PrivatePtr, 16, 16}, 1009 {S32, ConstantPtr, 8, 8}, 1010 {S32, ConstantPtr, 16, 2 * 8}}); 1011 if (ST.hasFlatAddressSpace()) { 1012 ExtLoads.legalForTypesWithMemDesc( 1013 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1014 } 1015 1016 ExtLoads.clampScalar(0, S32, S32) 1017 .widenScalarToNextPow2(0) 1018 .unsupportedIfMemSizeNotPow2() 1019 .lower(); 1020 1021 auto &Atomics = getActionDefinitionsBuilder( 1022 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1023 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1024 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1025 G_ATOMICRMW_UMIN}) 1026 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1027 {S64, GlobalPtr}, {S64, LocalPtr}}); 1028 if (ST.hasFlatAddressSpace()) { 1029 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1030 } 1031 1032 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1033 .legalFor({{S32, LocalPtr}}); 1034 1035 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1036 // demarshalling 1037 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1038 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1039 {S32, FlatPtr}, {S64, FlatPtr}}) 1040 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1041 {S32, RegionPtr}, {S64, RegionPtr}}); 1042 // TODO: Pointer types, any 32-bit or 64-bit 
vector 1043 1044 // Condition should be s32 for scalar, s1 for vector. 1045 getActionDefinitionsBuilder(G_SELECT) 1046 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1047 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1048 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1049 .clampScalar(0, S16, S64) 1050 .scalarize(1) 1051 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1052 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1053 .clampMaxNumElements(0, S32, 2) 1054 .clampMaxNumElements(0, LocalPtr, 2) 1055 .clampMaxNumElements(0, PrivatePtr, 2) 1056 .scalarize(0) 1057 .widenScalarToNextPow2(0) 1058 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1059 1060 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1061 // be more flexible with the shift amount type. 1062 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1063 .legalFor({{S32, S32}, {S64, S32}}); 1064 if (ST.has16BitInsts()) { 1065 if (ST.hasVOP3PInsts()) { 1066 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1067 .clampMaxNumElements(0, S16, 2); 1068 } else 1069 Shifts.legalFor({{S16, S16}}); 1070 1071 // TODO: Support 16-bit shift amounts for all types 1072 Shifts.widenScalarIf( 1073 [=](const LegalityQuery &Query) { 1074 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1075 // 32-bit amount. 1076 const LLT ValTy = Query.Types[0]; 1077 const LLT AmountTy = Query.Types[1]; 1078 return ValTy.getSizeInBits() <= 16 && 1079 AmountTy.getSizeInBits() < 16; 1080 }, changeTo(1, S16)); 1081 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1082 Shifts.clampScalar(1, S32, S32); 1083 Shifts.clampScalar(0, S16, S64); 1084 Shifts.widenScalarToNextPow2(0, 16); 1085 } else { 1086 // Make sure we legalize the shift amount type first, as the general 1087 // expansion for the shifted type will produce much worse code if it hasn't 1088 // been truncated already. 
1089 Shifts.clampScalar(1, S32, S32); 1090 Shifts.clampScalar(0, S32, S64); 1091 Shifts.widenScalarToNextPow2(0, 32); 1092 } 1093 Shifts.scalarize(0); 1094 1095 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1096 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1097 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1098 unsigned IdxTypeIdx = 2; 1099 1100 getActionDefinitionsBuilder(Op) 1101 .customIf([=](const LegalityQuery &Query) { 1102 const LLT EltTy = Query.Types[EltTypeIdx]; 1103 const LLT VecTy = Query.Types[VecTypeIdx]; 1104 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1105 return (EltTy.getSizeInBits() == 16 || 1106 EltTy.getSizeInBits() % 32 == 0) && 1107 VecTy.getSizeInBits() % 32 == 0 && 1108 VecTy.getSizeInBits() <= 1024 && 1109 IdxTy.getSizeInBits() == 32; 1110 }) 1111 .clampScalar(EltTypeIdx, S32, S64) 1112 .clampScalar(VecTypeIdx, S32, S64) 1113 .clampScalar(IdxTypeIdx, S32, S32); 1114 } 1115 1116 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1117 .unsupportedIf([=](const LegalityQuery &Query) { 1118 const LLT &EltTy = Query.Types[1].getElementType(); 1119 return Query.Types[0] != EltTy; 1120 }); 1121 1122 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1123 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1124 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1125 1126 // FIXME: Doesn't handle extract of illegal sizes. 1127 getActionDefinitionsBuilder(Op) 1128 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1129 // FIXME: Multiples of 16 should not be legal. 
1130 .legalIf([=](const LegalityQuery &Query) { 1131 const LLT BigTy = Query.Types[BigTyIdx]; 1132 const LLT LitTy = Query.Types[LitTyIdx]; 1133 return (BigTy.getSizeInBits() % 32 == 0) && 1134 (LitTy.getSizeInBits() % 16 == 0); 1135 }) 1136 .widenScalarIf( 1137 [=](const LegalityQuery &Query) { 1138 const LLT BigTy = Query.Types[BigTyIdx]; 1139 return (BigTy.getScalarSizeInBits() < 16); 1140 }, 1141 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1142 .widenScalarIf( 1143 [=](const LegalityQuery &Query) { 1144 const LLT LitTy = Query.Types[LitTyIdx]; 1145 return (LitTy.getScalarSizeInBits() < 16); 1146 }, 1147 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1148 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1149 .widenScalarToNextPow2(BigTyIdx, 32); 1150 1151 } 1152 1153 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1154 .legalForCartesianProduct(AllS32Vectors, {S32}) 1155 .legalForCartesianProduct(AllS64Vectors, {S64}) 1156 .clampNumElements(0, V16S32, V32S32) 1157 .clampNumElements(0, V2S64, V16S64) 1158 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1159 1160 if (ST.hasScalarPackInsts()) { 1161 BuildVector 1162 // FIXME: Should probably widen s1 vectors straight to s32 1163 .minScalarOrElt(0, S16) 1164 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 1165 .minScalar(1, S32); 1166 1167 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1168 .legalFor({V2S16, S32}) 1169 .lower(); 1170 BuildVector.minScalarOrElt(0, S32); 1171 } else { 1172 BuildVector.customFor({V2S16, S16}); 1173 BuildVector.minScalarOrElt(0, S32); 1174 1175 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1176 .customFor({V2S16, S32}) 1177 .lower(); 1178 } 1179 1180 BuildVector.legalIf(isRegisterType(0)); 1181 1182 // FIXME: Clamp maximum size 1183 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1184 .legalIf(isRegisterType(0)); 1185 1186 // TODO: Don't fully scalarize v2s16 pieces? 
Or combine out thosse 1187 // pre-legalize. 1188 if (ST.hasVOP3PInsts()) { 1189 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1190 .customFor({V2S16, V2S16}) 1191 .lower(); 1192 } else 1193 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1194 1195 // Merge/Unmerge 1196 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1197 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1198 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1199 1200 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1201 const LLT Ty = Query.Types[TypeIdx]; 1202 if (Ty.isVector()) { 1203 const LLT &EltTy = Ty.getElementType(); 1204 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1205 return true; 1206 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1207 return true; 1208 } 1209 return false; 1210 }; 1211 1212 auto &Builder = getActionDefinitionsBuilder(Op) 1213 .lowerFor({{S16, V2S16}}) 1214 .lowerIf([=](const LegalityQuery &Query) { 1215 const LLT BigTy = Query.Types[BigTyIdx]; 1216 return BigTy.getSizeInBits() == 32; 1217 }) 1218 // Try to widen to s16 first for small types. 1219 // TODO: Only do this on targets with legal s16 shifts 1220 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1221 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1222 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1223 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1224 elementTypeIs(1, S16)), 1225 changeTo(1, V2S16)) 1226 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1227 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1228 // valid. 
1229 .clampScalar(LitTyIdx, S32, S512) 1230 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1231 // Break up vectors with weird elements into scalars 1232 .fewerElementsIf( 1233 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1234 scalarize(0)) 1235 .fewerElementsIf( 1236 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1237 scalarize(1)) 1238 .clampScalar(BigTyIdx, S32, S1024); 1239 1240 if (Op == G_MERGE_VALUES) { 1241 Builder.widenScalarIf( 1242 // TODO: Use 16-bit shifts if legal for 8-bit values? 1243 [=](const LegalityQuery &Query) { 1244 const LLT Ty = Query.Types[LitTyIdx]; 1245 return Ty.getSizeInBits() < 32; 1246 }, 1247 changeTo(LitTyIdx, S32)); 1248 } 1249 1250 Builder.widenScalarIf( 1251 [=](const LegalityQuery &Query) { 1252 const LLT Ty = Query.Types[BigTyIdx]; 1253 return !isPowerOf2_32(Ty.getSizeInBits()) && 1254 Ty.getSizeInBits() % 16 != 0; 1255 }, 1256 [=](const LegalityQuery &Query) { 1257 // Pick the next power of 2, or a multiple of 64 over 128. 1258 // Whichever is smaller. 1259 const LLT &Ty = Query.Types[BigTyIdx]; 1260 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1261 if (NewSizeInBits >= 256) { 1262 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1263 if (RoundedTo < NewSizeInBits) 1264 NewSizeInBits = RoundedTo; 1265 } 1266 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1267 }) 1268 .legalIf([=](const LegalityQuery &Query) { 1269 const LLT &BigTy = Query.Types[BigTyIdx]; 1270 const LLT &LitTy = Query.Types[LitTyIdx]; 1271 1272 if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 1273 return false; 1274 if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 1275 return false; 1276 1277 return BigTy.getSizeInBits() % 16 == 0 && 1278 LitTy.getSizeInBits() % 16 == 0 && 1279 BigTy.getSizeInBits() <= 1024; 1280 }) 1281 // Any vectors left are the wrong size. Scalarize them. 
1282 .scalarize(0) 1283 .scalarize(1); 1284 } 1285 1286 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1287 // RegBankSelect. 1288 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1289 .legalFor({{S32}, {S64}}); 1290 1291 if (ST.hasVOP3PInsts()) { 1292 SextInReg.lowerFor({{V2S16}}) 1293 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1294 // get more vector shift opportunities, since we'll get those when 1295 // expanded. 1296 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1297 } else if (ST.has16BitInsts()) { 1298 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1299 } else { 1300 // Prefer to promote to s32 before lowering if we don't have 16-bit 1301 // shifts. This avoid a lot of intermediate truncate and extend operations. 1302 SextInReg.lowerFor({{S32}, {S64}}); 1303 } 1304 1305 SextInReg 1306 .scalarize(0) 1307 .clampScalar(0, S32, S64) 1308 .lower(); 1309 1310 getActionDefinitionsBuilder(G_FSHR) 1311 .legalFor({{S32, S32}}) 1312 .scalarize(0) 1313 .lower(); 1314 1315 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1316 .legalFor({S64}); 1317 1318 getActionDefinitionsBuilder({ 1319 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1320 G_FCOPYSIGN, 1321 1322 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1323 G_READ_REGISTER, 1324 G_WRITE_REGISTER, 1325 1326 G_SADDO, G_SSUBO, 1327 1328 // TODO: Implement 1329 G_FMINIMUM, G_FMAXIMUM, 1330 G_FSHL 1331 }).lower(); 1332 1333 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1334 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1335 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1336 .unsupported(); 1337 1338 computeTables(); 1339 verify(*ST.getInstrInfo()); 1340 } 1341 1342 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1343 MachineRegisterInfo &MRI, 1344 MachineIRBuilder &B, 1345 GISelChangeObserver &Observer) const { 1346 switch (MI.getOpcode()) { 1347 case TargetOpcode::G_ADDRSPACE_CAST: 1348 return 
legalizeAddrSpaceCast(MI, MRI, B); 1349 case TargetOpcode::G_FRINT: 1350 return legalizeFrint(MI, MRI, B); 1351 case TargetOpcode::G_FCEIL: 1352 return legalizeFceil(MI, MRI, B); 1353 case TargetOpcode::G_INTRINSIC_TRUNC: 1354 return legalizeIntrinsicTrunc(MI, MRI, B); 1355 case TargetOpcode::G_SITOFP: 1356 return legalizeITOFP(MI, MRI, B, true); 1357 case TargetOpcode::G_UITOFP: 1358 return legalizeITOFP(MI, MRI, B, false); 1359 case TargetOpcode::G_FPTOSI: 1360 return legalizeFPTOI(MI, MRI, B, true); 1361 case TargetOpcode::G_FPTOUI: 1362 return legalizeFPTOI(MI, MRI, B, false); 1363 case TargetOpcode::G_FMINNUM: 1364 case TargetOpcode::G_FMAXNUM: 1365 case TargetOpcode::G_FMINNUM_IEEE: 1366 case TargetOpcode::G_FMAXNUM_IEEE: 1367 return legalizeMinNumMaxNum(MI, MRI, B); 1368 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1369 return legalizeExtractVectorElt(MI, MRI, B); 1370 case TargetOpcode::G_INSERT_VECTOR_ELT: 1371 return legalizeInsertVectorElt(MI, MRI, B); 1372 case TargetOpcode::G_SHUFFLE_VECTOR: 1373 return legalizeShuffleVector(MI, MRI, B); 1374 case TargetOpcode::G_FSIN: 1375 case TargetOpcode::G_FCOS: 1376 return legalizeSinCos(MI, MRI, B); 1377 case TargetOpcode::G_GLOBAL_VALUE: 1378 return legalizeGlobalValue(MI, MRI, B); 1379 case TargetOpcode::G_LOAD: 1380 return legalizeLoad(MI, MRI, B, Observer); 1381 case TargetOpcode::G_FMAD: 1382 return legalizeFMad(MI, MRI, B); 1383 case TargetOpcode::G_FDIV: 1384 return legalizeFDIV(MI, MRI, B); 1385 case TargetOpcode::G_UDIV: 1386 case TargetOpcode::G_UREM: 1387 return legalizeUDIV_UREM(MI, MRI, B); 1388 case TargetOpcode::G_SDIV: 1389 case TargetOpcode::G_SREM: 1390 return legalizeSDIV_SREM(MI, MRI, B); 1391 case TargetOpcode::G_ATOMIC_CMPXCHG: 1392 return legalizeAtomicCmpXChg(MI, MRI, B); 1393 case TargetOpcode::G_FLOG: 1394 return legalizeFlog(MI, B, 1.0f / numbers::log2ef); 1395 case TargetOpcode::G_FLOG10: 1396 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1397 case TargetOpcode::G_FEXP: 1398 
return legalizeFExp(MI, B); 1399 case TargetOpcode::G_FPOW: 1400 return legalizeFPow(MI, B); 1401 case TargetOpcode::G_FFLOOR: 1402 return legalizeFFloor(MI, MRI, B); 1403 case TargetOpcode::G_BUILD_VECTOR: 1404 return legalizeBuildVector(MI, MRI, B); 1405 default: 1406 return false; 1407 } 1408 1409 llvm_unreachable("expected switch to return"); 1410 } 1411 1412 Register AMDGPULegalizerInfo::getSegmentAperture( 1413 unsigned AS, 1414 MachineRegisterInfo &MRI, 1415 MachineIRBuilder &B) const { 1416 MachineFunction &MF = B.getMF(); 1417 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1418 const LLT S32 = LLT::scalar(32); 1419 1420 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1421 1422 if (ST.hasApertureRegs()) { 1423 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1424 // getreg. 1425 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1426 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1427 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1428 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 
1429 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1430 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1431 unsigned Encoding = 1432 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1433 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1434 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1435 1436 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1437 1438 B.buildInstr(AMDGPU::S_GETREG_B32) 1439 .addDef(GetReg) 1440 .addImm(Encoding); 1441 MRI.setType(GetReg, S32); 1442 1443 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1444 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1445 } 1446 1447 Register QueuePtr = MRI.createGenericVirtualRegister( 1448 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1449 1450 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1451 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1452 return Register(); 1453 1454 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1455 // private_segment_aperture_base_hi. 1456 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1457 1458 // TODO: can we be smarter about machine pointer info? 
1459 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1460 MachineMemOperand *MMO = MF.getMachineMemOperand( 1461 PtrInfo, 1462 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1463 MachineMemOperand::MOInvariant, 1464 4, commonAlignment(Align(64), StructOffset)); 1465 1466 Register LoadAddr; 1467 1468 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1469 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1470 } 1471 1472 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1473 MachineInstr &MI, MachineRegisterInfo &MRI, 1474 MachineIRBuilder &B) const { 1475 MachineFunction &MF = B.getMF(); 1476 1477 B.setInstr(MI); 1478 1479 const LLT S32 = LLT::scalar(32); 1480 Register Dst = MI.getOperand(0).getReg(); 1481 Register Src = MI.getOperand(1).getReg(); 1482 1483 LLT DstTy = MRI.getType(Dst); 1484 LLT SrcTy = MRI.getType(Src); 1485 unsigned DestAS = DstTy.getAddressSpace(); 1486 unsigned SrcAS = SrcTy.getAddressSpace(); 1487 1488 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1489 // vector element. 1490 assert(!DstTy.isVector()); 1491 1492 const AMDGPUTargetMachine &TM 1493 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1494 1495 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1496 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1497 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1498 return true; 1499 } 1500 1501 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1502 // Truncate. 1503 B.buildExtract(Dst, Src, 0); 1504 MI.eraseFromParent(); 1505 return true; 1506 } 1507 1508 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1509 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1510 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1511 1512 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1513 // another. 
Merge operands are required to be the same type, but creating an 1514 // extra ptrtoint would be kind of pointless. 1515 auto HighAddr = B.buildConstant( 1516 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1517 B.buildMerge(Dst, {Src, HighAddr}); 1518 MI.eraseFromParent(); 1519 return true; 1520 } 1521 1522 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1523 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1524 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1525 unsigned NullVal = TM.getNullPointerValue(DestAS); 1526 1527 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1528 auto FlatNull = B.buildConstant(SrcTy, 0); 1529 1530 // Extract low 32-bits of the pointer. 1531 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1532 1533 auto CmpRes = 1534 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1535 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1536 1537 MI.eraseFromParent(); 1538 return true; 1539 } 1540 1541 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1542 return false; 1543 1544 if (!ST.hasFlatAddressSpace()) 1545 return false; 1546 1547 auto SegmentNull = 1548 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1549 auto FlatNull = 1550 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1551 1552 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1553 if (!ApertureReg.isValid()) 1554 return false; 1555 1556 auto CmpRes = 1557 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1558 1559 // Coerce the type of the low half of the result so we can use merge_values. 1560 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1561 1562 // TODO: Should we allow mismatched types but matching sizes in merges to 1563 // avoid the ptrtoint? 
1564 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1565 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1566 1567 MI.eraseFromParent(); 1568 return true; 1569 } 1570 1571 bool AMDGPULegalizerInfo::legalizeFrint( 1572 MachineInstr &MI, MachineRegisterInfo &MRI, 1573 MachineIRBuilder &B) const { 1574 B.setInstr(MI); 1575 1576 Register Src = MI.getOperand(1).getReg(); 1577 LLT Ty = MRI.getType(Src); 1578 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1579 1580 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1581 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1582 1583 auto C1 = B.buildFConstant(Ty, C1Val); 1584 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1585 1586 // TODO: Should this propagate fast-math-flags? 1587 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1588 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1589 1590 auto C2 = B.buildFConstant(Ty, C2Val); 1591 auto Fabs = B.buildFAbs(Ty, Src); 1592 1593 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1594 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1595 return true; 1596 } 1597 1598 bool AMDGPULegalizerInfo::legalizeFceil( 1599 MachineInstr &MI, MachineRegisterInfo &MRI, 1600 MachineIRBuilder &B) const { 1601 B.setInstr(MI); 1602 1603 const LLT S1 = LLT::scalar(1); 1604 const LLT S64 = LLT::scalar(64); 1605 1606 Register Src = MI.getOperand(1).getReg(); 1607 assert(MRI.getType(Src) == S64); 1608 1609 // result = trunc(src) 1610 // if (src > 0.0 && src != result) 1611 // result += 1.0 1612 1613 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1614 1615 const auto Zero = B.buildFConstant(S64, 0.0); 1616 const auto One = B.buildFConstant(S64, 1.0); 1617 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1618 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1619 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1620 auto Add = B.buildSelect(S64, And, One, Zero); 1621 1622 // TODO: Should this propagate fast-math-flags? 
1623 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1624 return true; 1625 } 1626 1627 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1628 MachineIRBuilder &B) { 1629 const unsigned FractBits = 52; 1630 const unsigned ExpBits = 11; 1631 LLT S32 = LLT::scalar(32); 1632 1633 auto Const0 = B.buildConstant(S32, FractBits - 32); 1634 auto Const1 = B.buildConstant(S32, ExpBits); 1635 1636 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1637 .addUse(Const0.getReg(0)) 1638 .addUse(Const1.getReg(0)); 1639 1640 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1641 } 1642 1643 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1644 MachineInstr &MI, MachineRegisterInfo &MRI, 1645 MachineIRBuilder &B) const { 1646 B.setInstr(MI); 1647 1648 const LLT S1 = LLT::scalar(1); 1649 const LLT S32 = LLT::scalar(32); 1650 const LLT S64 = LLT::scalar(64); 1651 1652 Register Src = MI.getOperand(1).getReg(); 1653 assert(MRI.getType(Src) == S64); 1654 1655 // TODO: Should this use extract since the low half is unused? 1656 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1657 Register Hi = Unmerge.getReg(1); 1658 1659 // Extract the upper half, since this is where we will find the sign and 1660 // exponent. 1661 auto Exp = extractF64Exponent(Hi, B); 1662 1663 const unsigned FractBits = 52; 1664 1665 // Extract the sign bit. 1666 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1667 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1668 1669 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1670 1671 const auto Zero32 = B.buildConstant(S32, 0); 1672 1673 // Extend back to 64-bits. 
1674 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1675 1676 auto Shr = B.buildAShr(S64, FractMask, Exp); 1677 auto Not = B.buildNot(S64, Shr); 1678 auto Tmp0 = B.buildAnd(S64, Src, Not); 1679 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1680 1681 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1682 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1683 1684 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1685 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1686 return true; 1687 } 1688 1689 bool AMDGPULegalizerInfo::legalizeITOFP( 1690 MachineInstr &MI, MachineRegisterInfo &MRI, 1691 MachineIRBuilder &B, bool Signed) const { 1692 B.setInstr(MI); 1693 1694 Register Dst = MI.getOperand(0).getReg(); 1695 Register Src = MI.getOperand(1).getReg(); 1696 1697 const LLT S64 = LLT::scalar(64); 1698 const LLT S32 = LLT::scalar(32); 1699 1700 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1701 1702 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1703 1704 auto CvtHi = Signed ? 1705 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1706 B.buildUITOFP(S64, Unmerge.getReg(1)); 1707 1708 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1709 1710 auto ThirtyTwo = B.buildConstant(S32, 32); 1711 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1712 .addUse(CvtHi.getReg(0)) 1713 .addUse(ThirtyTwo.getReg(0)); 1714 1715 // TODO: Should this propagate fast-math-flags? 1716 B.buildFAdd(Dst, LdExp, CvtLo); 1717 MI.eraseFromParent(); 1718 return true; 1719 } 1720 1721 // TODO: Copied from DAG implementation. Verify logic and document how this 1722 // actually works. 
1723 bool AMDGPULegalizerInfo::legalizeFPTOI( 1724 MachineInstr &MI, MachineRegisterInfo &MRI, 1725 MachineIRBuilder &B, bool Signed) const { 1726 B.setInstr(MI); 1727 1728 Register Dst = MI.getOperand(0).getReg(); 1729 Register Src = MI.getOperand(1).getReg(); 1730 1731 const LLT S64 = LLT::scalar(64); 1732 const LLT S32 = LLT::scalar(32); 1733 1734 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1735 1736 unsigned Flags = MI.getFlags(); 1737 1738 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1739 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1740 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1741 1742 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1743 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1744 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1745 1746 auto Hi = Signed ? 1747 B.buildFPTOSI(S32, FloorMul) : 1748 B.buildFPTOUI(S32, FloorMul); 1749 auto Lo = B.buildFPTOUI(S32, Fma); 1750 1751 B.buildMerge(Dst, { Lo, Hi }); 1752 MI.eraseFromParent(); 1753 1754 return true; 1755 } 1756 1757 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1758 MachineInstr &MI, MachineRegisterInfo &MRI, 1759 MachineIRBuilder &B) const { 1760 MachineFunction &MF = B.getMF(); 1761 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1762 1763 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1764 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1765 1766 // With ieee_mode disabled, the instructions have the correct behavior 1767 // already for G_FMINNUM/G_FMAXNUM 1768 if (!MFI->getMode().IEEE) 1769 return !IsIEEEOp; 1770 1771 if (IsIEEEOp) 1772 return true; 1773 1774 MachineIRBuilder HelperBuilder(MI); 1775 GISelObserverWrapper DummyObserver; 1776 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1777 HelperBuilder.setInstr(MI); 1778 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1779 } 1780 1781 bool 
AMDGPULegalizerInfo::legalizeExtractVectorElt( 1782 MachineInstr &MI, MachineRegisterInfo &MRI, 1783 MachineIRBuilder &B) const { 1784 // TODO: Should move some of this into LegalizerHelper. 1785 1786 // TODO: Promote dynamic indexing of s16 to s32 1787 1788 // FIXME: Artifact combiner probably should have replaced the truncated 1789 // constant before this, so we shouldn't need 1790 // getConstantVRegValWithLookThrough. 1791 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1792 MI.getOperand(2).getReg(), MRI); 1793 if (!IdxVal) // Dynamic case will be selected to register indexing. 1794 return true; 1795 1796 Register Dst = MI.getOperand(0).getReg(); 1797 Register Vec = MI.getOperand(1).getReg(); 1798 1799 LLT VecTy = MRI.getType(Vec); 1800 LLT EltTy = VecTy.getElementType(); 1801 assert(EltTy == MRI.getType(Dst)); 1802 1803 B.setInstr(MI); 1804 1805 if (IdxVal->Value < VecTy.getNumElements()) 1806 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1807 else 1808 B.buildUndef(Dst); 1809 1810 MI.eraseFromParent(); 1811 return true; 1812 } 1813 1814 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1815 MachineInstr &MI, MachineRegisterInfo &MRI, 1816 MachineIRBuilder &B) const { 1817 // TODO: Should move some of this into LegalizerHelper. 1818 1819 // TODO: Promote dynamic indexing of s16 to s32 1820 1821 // FIXME: Artifact combiner probably should have replaced the truncated 1822 // constant before this, so we shouldn't need 1823 // getConstantVRegValWithLookThrough. 1824 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1825 MI.getOperand(3).getReg(), MRI); 1826 if (!IdxVal) // Dynamic case will be selected to register indexing. 
return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  // An in-range constant index becomes an insert at the element's bit offset;
  // an out-of-range index produces an undefined result.
  if (IdxVal->Value < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

// Custom legalization for a shuffle vector. A v2s16 -> v2s16 shuffle whose
// mask is accepted by AMDGPU::isLegalVOP3PShuffleMask is left untouched;
// everything else is handed to LegalizerHelper::lowerShuffleVector.
// Returns true if the instruction was left legal or successfully lowered.
bool AMDGPULegalizerInfo::legalizeShuffleVector(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT V2S16 = LLT::vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src0);

  if (SrcTy == V2S16 && DstTy == V2S16 &&
      AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
    return true;

  // Lower with a private builder and a dummy observer rather than the
  // caller's builder.
  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
}

// Expand G_FSIN / G_FCOS into the amdgcn.sin / amdgcn.cos intrinsics. The
// source is first multiplied by 1/(2*pi); on subtargets reporting
// hasTrigReducedRange() the product is additionally passed through
// amdgcn.fract. The original instruction is erased. Always returns true.
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  // The opcode of the instruction being replaced picks the intrinsic.
  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

// Emit SI_PC_ADD_REL_OFFSET to materialize the pc-relative address of \p GV
// (plus \p Offset) into \p DstReg. \p GAFlags is the target flag placed on the
// low-half symbol operand; when it is not MO_NONE a second symbol operand with
// GAFlags + 1 is added for the high half, otherwise an immediate 0 is used.
// For a 32-bit \p PtrTy the 64-bit result is computed into a temporary and the
// low 32 bits are extracted into \p DstReg. Always returns true.
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  // Write straight into DstReg unless the requested pointer is 32 bits wide,
  // in which case a 64-bit temporary receives the full address.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

// Custom legalization for a global value address.
//  - LOCAL/REGION address space: a use outside an entry function emits a
//    warning diagnostic, a trap and an undef result. A global without a defined
//    initializer is either tagged MO_ABS32_LO and left in place, or replaced by
//    the constant returned from allocateLDSGlobal. A global with an initializer
//    emits an "unsupported initializer" diagnostic.
//  - Other address spaces: a pc-relative fixup or relocation when the target
//    lowering asks for one, otherwise a load of the address from the GOT.
// Returns true on every path.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    // A defined initializer is reported as unsupported; the instruction is
    // left in place.
    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise compute the address of the GOT entry pc-relatively and load the
  // 8-byte global address from it.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

// Rewrite the pointer operand of a load in place: an address space cast to a
// 64-bit CONSTANT_ADDRESS pointer is inserted before \p MI and operand 1 is
// replaced by its result. \p Observer is notified around the mutation.
// Always returns true.
bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

// Custom legalization for a scalar fmad. The instruction is kept as-is for
// s32 when the function mode does not have all FP32 denormals enabled, and
// for s16 when it does not have all FP64/FP16 denormals enabled. Otherwise it
// is lowered through LegalizerHelper::lowerFMad.
bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

// Replace an atomic compare-exchange with G_AMDGPU_ATOMIC_CMPXCHG, which takes
// the new value and the compare value packed into one two-element vector (new
// value first). The memory operands of \p MI are carried over and \p MI is
// erased. Always returns true.
bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(SITargetLowering::isFlatGlobalAddrSpace(
           MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

  B.setInstr(MI);
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Expand a logarithm as log2(Src) * Log2BaseInverted, where the caller
// supplies the scaling constant \p Log2BaseInverted. The flags of \p MI are
// propagated to the new instructions and \p MI is erased. Always returns true.
bool AMDGPULegalizerInfo::legalizeFlog(
  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();
  B.setInstr(MI);

  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);

  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  MI.eraseFromParent();
  return true;
}

// Expand exp(Src) as exp2(Src * log2(e)). The flags of \p MI are propagated
// and \p MI is erased. Always returns true.
bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  B.setInstr(MI);

  auto K = B.buildFConstant(Ty, numbers::log2e);
  auto Mul = B.buildFMul(Ty, Src, K, Flags);
  B.buildFExp2(Dst, Mul, Flags);
  MI.eraseFromParent();
  return true;
}

// Expand pow(Src0, Src1) as exp2(fmul_legacy(log2(Src0), Src1)). Only s32 and
// s16 results are handled; s16 performs the multiply in s32 and truncates the
// product. Returns false, leaving \p MI in place, for any other type.
bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  B.setInstr(MI);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  if (Ty == S32) {
    auto Log = B.buildFLog2(S32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Log.getReg(0))
      .addUse(Src1)
      .setMIFlags(Flags);
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == S16) {
    // There's no f16 fmul_legacy, so we need to convert for it.
    auto Log = B.buildFLog2(S16, Src0, Flags);
    auto Ext0 = B.buildFPExt(S32, Log, Flags);
    auto Ext1 = B.buildFPExt(S32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Ext0.getReg(0))
      .addUse(Ext1.getReg(0))
      .setMIFlags(Flags);

    B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
  } else
    return false;

  MI.eraseFromParent();
  return true;
}

// Find a source register, ignoring any possible source modifiers.
2171 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2172 Register ModSrc = OrigSrc; 2173 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2174 ModSrc = SrcFNeg->getOperand(1).getReg(); 2175 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2176 ModSrc = SrcFAbs->getOperand(1).getReg(); 2177 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2178 ModSrc = SrcFAbs->getOperand(1).getReg(); 2179 return ModSrc; 2180 } 2181 2182 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2183 MachineRegisterInfo &MRI, 2184 MachineIRBuilder &B) const { 2185 B.setInstr(MI); 2186 2187 const LLT S1 = LLT::scalar(1); 2188 const LLT S64 = LLT::scalar(64); 2189 Register Dst = MI.getOperand(0).getReg(); 2190 Register OrigSrc = MI.getOperand(1).getReg(); 2191 unsigned Flags = MI.getFlags(); 2192 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2193 "this should not have been custom lowered"); 2194 2195 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2196 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2197 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2198 // V_FRACT bug is: 2199 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2200 // 2201 // Convert floor(x) to (x - fract(x)) 2202 2203 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2204 .addUse(OrigSrc) 2205 .setMIFlags(Flags); 2206 2207 // Give source modifier matching some assistance before obscuring a foldable 2208 // pattern. 2209 2210 // TODO: We can avoid the neg on the fract? The input sign to fract 2211 // shouldn't matter? 
2212 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2213 2214 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2215 2216 Register Min = MRI.createGenericVirtualRegister(S64); 2217 2218 // We don't need to concern ourselves with the snan handling difference, so 2219 // use the one which will directly select. 2220 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2221 if (MFI->getMode().IEEE) 2222 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2223 else 2224 B.buildFMinNum(Min, Fract, Const, Flags); 2225 2226 Register CorrectedFract = Min; 2227 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2228 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2229 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2230 } 2231 2232 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2233 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2234 2235 MI.eraseFromParent(); 2236 return true; 2237 } 2238 2239 // Turn an illegal packed v2s16 build vector into bit operations. 2240 // TODO: This should probably be a bitcast action in LegalizerHelper. 2241 bool AMDGPULegalizerInfo::legalizeBuildVector( 2242 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2243 Register Dst = MI.getOperand(0).getReg(); 2244 const LLT S32 = LLT::scalar(32); 2245 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2246 2247 Register Src0 = MI.getOperand(1).getReg(); 2248 Register Src1 = MI.getOperand(2).getReg(); 2249 assert(MRI.getType(Src0) == LLT::scalar(16)); 2250 2251 B.setInstr(MI); 2252 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2253 B.buildBitcast(Dst, Merge); 2254 2255 MI.eraseFromParent(); 2256 return true; 2257 } 2258 2259 // Return the use branch instruction, otherwise null if the usage is invalid. 
2260 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2261 MachineRegisterInfo &MRI, 2262 MachineInstr *&Br, 2263 MachineBasicBlock *&UncondBrTarget) { 2264 Register CondDef = MI.getOperand(0).getReg(); 2265 if (!MRI.hasOneNonDBGUse(CondDef)) 2266 return nullptr; 2267 2268 MachineBasicBlock *Parent = MI.getParent(); 2269 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2270 if (UseMI.getParent() != Parent || 2271 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2272 return nullptr; 2273 2274 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2275 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2276 if (Next == Parent->end()) { 2277 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2278 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2279 return nullptr; 2280 UncondBrTarget = &*NextMBB; 2281 } else { 2282 if (Next->getOpcode() != AMDGPU::G_BR) 2283 return nullptr; 2284 Br = &*Next; 2285 UncondBrTarget = Br->getOperand(0).getMBB(); 2286 } 2287 2288 return &UseMI; 2289 } 2290 2291 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2292 MachineRegisterInfo &MRI, 2293 Register LiveIn, 2294 Register PhyReg) const { 2295 assert(PhyReg.isPhysical() && "Physical register expected"); 2296 2297 // Insert the live-in copy, if required, by defining destination virtual 2298 // register. 2299 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
2300 if (!MRI.getVRegDef(LiveIn)) { 2301 // FIXME: Should have scoped insert pt 2302 MachineBasicBlock &OrigInsBB = B.getMBB(); 2303 auto OrigInsPt = B.getInsertPt(); 2304 2305 MachineBasicBlock &EntryMBB = B.getMF().front(); 2306 EntryMBB.addLiveIn(PhyReg); 2307 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2308 B.buildCopy(LiveIn, PhyReg); 2309 2310 B.setInsertPt(OrigInsBB, OrigInsPt); 2311 } 2312 2313 return LiveIn; 2314 } 2315 2316 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, 2317 MachineRegisterInfo &MRI, 2318 Register PhyReg, LLT Ty, 2319 bool InsertLiveInCopy) const { 2320 assert(PhyReg.isPhysical() && "Physical register expected"); 2321 2322 // Get or create virtual live-in regester 2323 Register LiveIn = MRI.getLiveInVirtReg(PhyReg); 2324 if (!LiveIn) { 2325 LiveIn = MRI.createGenericVirtualRegister(Ty); 2326 MRI.addLiveIn(PhyReg, LiveIn); 2327 } 2328 2329 // When the actual true copy required is from virtual register to physical 2330 // register (to be inserted later), live-in copy insertion from physical 2331 // to register virtual register is not required 2332 if (!InsertLiveInCopy) 2333 return LiveIn; 2334 2335 return insertLiveInCopy(B, MRI, LiveIn, PhyReg); 2336 } 2337 2338 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor( 2339 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2340 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2341 const ArgDescriptor *Arg; 2342 const TargetRegisterClass *RC; 2343 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2344 if (!Arg) { 2345 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2346 return nullptr; 2347 } 2348 return Arg; 2349 } 2350 2351 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2352 const ArgDescriptor *Arg) const { 2353 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2354 return false; // TODO: Handle these 2355 2356 Register SrcReg = Arg->getRegister(); 2357 
assert(SrcReg.isPhysical() && "Physical register expected"); 2358 assert(DstReg.isVirtual() && "Virtual register expected"); 2359 2360 MachineRegisterInfo &MRI = *B.getMRI(); 2361 2362 LLT Ty = MRI.getType(DstReg); 2363 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty); 2364 2365 if (Arg->isMasked()) { 2366 // TODO: Should we try to emit this once in the entry block? 2367 const LLT S32 = LLT::scalar(32); 2368 const unsigned Mask = Arg->getMask(); 2369 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2370 2371 Register AndMaskSrc = LiveIn; 2372 2373 if (Shift != 0) { 2374 auto ShiftAmt = B.buildConstant(S32, Shift); 2375 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2376 } 2377 2378 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2379 } else { 2380 B.buildCopy(DstReg, LiveIn); 2381 } 2382 2383 return true; 2384 } 2385 2386 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2387 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2388 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2389 B.setInstr(MI); 2390 2391 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2392 if (!Arg) 2393 return false; 2394 2395 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2396 return false; 2397 2398 MI.eraseFromParent(); 2399 return true; 2400 } 2401 2402 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2403 MachineRegisterInfo &MRI, 2404 MachineIRBuilder &B) const { 2405 B.setInstr(MI); 2406 Register Dst = MI.getOperand(0).getReg(); 2407 LLT DstTy = MRI.getType(Dst); 2408 LLT S16 = LLT::scalar(16); 2409 LLT S32 = LLT::scalar(32); 2410 LLT S64 = LLT::scalar(64); 2411 2412 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2413 return true; 2414 2415 if (DstTy == S16) 2416 return legalizeFDIV16(MI, MRI, B); 2417 if (DstTy == S32) 2418 return legalizeFDIV32(MI, MRI, B); 2419 if (DstTy == S64) 2420 return legalizeFDIV64(MI, MRI, B); 2421 2422 return false; 2423 } 2424 2425 static Register 
buildDivRCP(MachineIRBuilder &B, Register Src) { 2426 const LLT S32 = LLT::scalar(32); 2427 2428 auto Cvt0 = B.buildUITOFP(S32, Src); 2429 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2430 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2431 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2432 return B.buildFPTOUI(S32, Mul).getReg(0); 2433 } 2434 2435 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2436 Register DstReg, 2437 Register Num, 2438 Register Den, 2439 bool IsRem) const { 2440 const LLT S1 = LLT::scalar(1); 2441 const LLT S32 = LLT::scalar(32); 2442 2443 // RCP = URECIP(Den) = 2^32 / Den + e 2444 // e is rounding error. 2445 auto RCP = buildDivRCP(B, Den); 2446 2447 // RCP_LO = mul(RCP, Den) 2448 auto RCP_LO = B.buildMul(S32, RCP, Den); 2449 2450 // RCP_HI = mulhu (RCP, Den) */ 2451 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2452 2453 // NEG_RCP_LO = -RCP_LO 2454 auto Zero = B.buildConstant(S32, 0); 2455 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2456 2457 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2458 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2459 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2460 2461 // Calculate the rounding error from the URECIP instruction 2462 // E = mulhu(ABS_RCP_LO, RCP) 2463 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2464 2465 // RCP_A_E = RCP + E 2466 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2467 2468 // RCP_S_E = RCP - E 2469 auto RCP_S_E = B.buildSub(S32, RCP, E); 2470 2471 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_SUB_E) 2472 auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E); 2473 2474 // Quotient = mulhu(Tmp0, Num)stmp 2475 auto Quotient = B.buildUMulH(S32, Tmp0, Num); 2476 2477 // Num_S_Remainder = Quotient * Den 2478 auto Num_S_Remainder = B.buildMul(S32, Quotient, Den); 2479 2480 // Remainder = Num - Num_S_Remainder 2481 auto Remainder = B.buildSub(S32, Num, Num_S_Remainder); 2482 2483 // Remainder_GE_Den = Remainder >= Den 2484 auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den); 2485 2486 // Remainder_GE_Zero = Num >= Num_S_Remainder; 2487 auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1, 2488 Num, Num_S_Remainder); 2489 2490 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 2491 auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero); 2492 2493 // Calculate Division result: 2494 2495 // Quotient_A_One = Quotient + 1 2496 auto One = B.buildConstant(S32, 1); 2497 auto Quotient_A_One = B.buildAdd(S32, Quotient, One); 2498 2499 // Quotient_S_One = Quotient - 1 2500 auto Quotient_S_One = B.buildSub(S32, Quotient, One); 2501 2502 // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient) 2503 auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One); 2504 2505 // Div = (Remainder_GE_Zero ? Div : Quotient_S_One) 2506 if (IsRem) { 2507 Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One); 2508 2509 // Calculate Rem result: 2510 auto Remainder_S_Den = B.buildSub(S32, Remainder, Den); 2511 2512 // Remainder_A_Den = Remainder + Den 2513 auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den); 2514 2515 // Rem = (Tmp1 ? Remainder_S_Den : Remainder) 2516 auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder); 2517 2518 // Rem = (Remainder_GE_Zero ? 
Rem : Remainder_A_Den) 2519 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den); 2520 } else { 2521 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One); 2522 } 2523 } 2524 2525 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2526 MachineRegisterInfo &MRI, 2527 MachineIRBuilder &B) const { 2528 B.setInstr(MI); 2529 const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM; 2530 Register DstReg = MI.getOperand(0).getReg(); 2531 Register Num = MI.getOperand(1).getReg(); 2532 Register Den = MI.getOperand(2).getReg(); 2533 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem); 2534 MI.eraseFromParent(); 2535 return true; 2536 } 2537 2538 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2539 // 2540 // Return lo, hi of result 2541 // 2542 // %cvt.lo = G_UITOFP Val.lo 2543 // %cvt.hi = G_UITOFP Val.hi 2544 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2545 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2546 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2547 // %mul2 = G_FMUL %mul1, 2**(-32) 2548 // %trunc = G_INTRINSIC_TRUNC %mul2 2549 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2550 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2551 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2552 Register Val) { 2553 const LLT S32 = LLT::scalar(32); 2554 auto Unmerge = B.buildUnmerge(S32, Val); 2555 2556 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2557 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2558 2559 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2560 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2561 2562 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2563 auto Mul1 = 2564 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2565 2566 // 2**(-32) 2567 auto Mul2 = 2568 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2569 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2570 2571 // -(2**32) 2572 auto Mad2 = B.buildFMAD(S32, Trunc, 2573 B.buildFConstant(S32, 
BitsToFloat(0xcf800000)), Mul1); 2574 2575 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2576 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2577 2578 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2579 } 2580 2581 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI, 2582 MachineRegisterInfo &MRI, 2583 MachineIRBuilder &B) const { 2584 B.setInstr(MI); 2585 2586 const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV; 2587 const LLT S32 = LLT::scalar(32); 2588 const LLT S64 = LLT::scalar(64); 2589 const LLT S1 = LLT::scalar(1); 2590 Register Numer = MI.getOperand(1).getReg(); 2591 Register Denom = MI.getOperand(2).getReg(); 2592 Register RcpLo, RcpHi; 2593 2594 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2595 2596 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2597 2598 auto Zero64 = B.buildConstant(S64, 0); 2599 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2600 2601 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2602 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2603 2604 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2605 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2606 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2607 2608 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2609 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2610 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2611 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2612 2613 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2614 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2615 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2616 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2617 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2618 2619 auto Zero32 = B.buildConstant(S32, 0); 2620 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2621 auto Add2_HiC = 2622 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2623 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2624 auto Add2 = B.buildMerge(S64, 
{Add2_Lo, Add2_Hi}); 2625 2626 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2627 Register NumerLo = UnmergeNumer.getReg(0); 2628 Register NumerHi = UnmergeNumer.getReg(1); 2629 2630 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2631 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2632 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2633 Register Mul3_Lo = UnmergeMul3.getReg(0); 2634 Register Mul3_Hi = UnmergeMul3.getReg(1); 2635 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2636 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2637 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2638 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2639 2640 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2641 Register DenomLo = UnmergeDenom.getReg(0); 2642 Register DenomHi = UnmergeDenom.getReg(1); 2643 2644 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2645 auto C1 = B.buildSExt(S32, CmpHi); 2646 2647 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2648 auto C2 = B.buildSExt(S32, CmpLo); 2649 2650 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2651 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2652 2653 // TODO: Here and below portions of the code can be enclosed into if/endif. 2654 // Currently control flow is unconditional and we have 4 selects after 2655 // potential endif to substitute PHIs. 2656 2657 // if C3 != 0 ... 
2658 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2659 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2660 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2661 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2662 2663 auto One64 = B.buildConstant(S64, 1); 2664 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2665 2666 auto C4 = 2667 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2668 auto C5 = 2669 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2670 auto C6 = B.buildSelect( 2671 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2672 2673 // if (C6 != 0) 2674 auto Add4 = B.buildAdd(S64, Add3, One64); 2675 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2676 2677 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2678 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2679 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2680 2681 // endif C6 2682 // endif C3 2683 2684 if (IsDiv) { 2685 auto Sel1 = B.buildSelect( 2686 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2687 B.buildSelect(MI.getOperand(0), 2688 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2689 } else { 2690 auto Sel2 = B.buildSelect( 2691 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2692 B.buildSelect(MI.getOperand(0), 2693 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2694 } 2695 2696 MI.eraseFromParent(); 2697 return true; 2698 } 2699 2700 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2701 MachineRegisterInfo &MRI, 2702 MachineIRBuilder &B) const { 2703 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2704 if (Ty == LLT::scalar(32)) 2705 return legalizeUDIV_UREM32(MI, MRI, B); 2706 if (Ty == LLT::scalar(64)) 2707 return legalizeUDIV_UREM64(MI, MRI, B); 2708 return false; 2709 } 2710 2711 bool 
AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI, 2712 MachineRegisterInfo &MRI, 2713 MachineIRBuilder &B) const { 2714 B.setInstr(MI); 2715 const LLT S32 = LLT::scalar(32); 2716 2717 const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM; 2718 Register DstReg = MI.getOperand(0).getReg(); 2719 Register LHS = MI.getOperand(1).getReg(); 2720 Register RHS = MI.getOperand(2).getReg(); 2721 2722 auto ThirtyOne = B.buildConstant(S32, 31); 2723 auto LHSign = B.buildAShr(S32, LHS, ThirtyOne); 2724 auto RHSign = B.buildAShr(S32, LHS, ThirtyOne); 2725 2726 LHS = B.buildAdd(S32, LHS, LHSign).getReg(0); 2727 RHS = B.buildAdd(S32, RHS, RHSign).getReg(0); 2728 2729 LHS = B.buildXor(S32, LHS, LHSign).getReg(0); 2730 RHS = B.buildXor(S32, RHS, RHSign).getReg(0); 2731 2732 Register UDivRem = MRI.createGenericVirtualRegister(S32); 2733 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem); 2734 2735 if (IsRem) { 2736 auto RSign = LHSign; // Remainder sign is the same as LHS 2737 UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0); 2738 B.buildSub(DstReg, UDivRem, RSign); 2739 } else { 2740 auto DSign = B.buildXor(S32, LHSign, RHSign); 2741 UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0); 2742 B.buildSub(DstReg, UDivRem, DSign); 2743 } 2744 2745 MI.eraseFromParent(); 2746 return true; 2747 } 2748 2749 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2750 MachineRegisterInfo &MRI, 2751 MachineIRBuilder &B) const { 2752 if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2753 return legalizeSDIV_SREM32(MI, MRI, B); 2754 return false; 2755 } 2756 2757 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2758 MachineRegisterInfo &MRI, 2759 MachineIRBuilder &B) const { 2760 Register Res = MI.getOperand(0).getReg(); 2761 Register LHS = MI.getOperand(1).getReg(); 2762 Register RHS = MI.getOperand(2).getReg(); 2763 2764 uint16_t Flags = MI.getFlags(); 2765 2766 LLT ResTy = MRI.getType(Res); 2767 LLT S32 = LLT::scalar(32); 2768 LLT S64 = 
LLT::scalar(64); 2769 2770 const MachineFunction &MF = B.getMF(); 2771 bool Unsafe = 2772 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2773 2774 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2775 return false; 2776 2777 if (!Unsafe && ResTy == S32 && 2778 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2779 return false; 2780 2781 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2782 // 1 / x -> RCP(x) 2783 if (CLHS->isExactlyValue(1.0)) { 2784 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2785 .addUse(RHS) 2786 .setMIFlags(Flags); 2787 2788 MI.eraseFromParent(); 2789 return true; 2790 } 2791 2792 // -1 / x -> RCP( FNEG(x) ) 2793 if (CLHS->isExactlyValue(-1.0)) { 2794 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2795 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2796 .addUse(FNeg.getReg(0)) 2797 .setMIFlags(Flags); 2798 2799 MI.eraseFromParent(); 2800 return true; 2801 } 2802 } 2803 2804 // x / y -> x * (1.0 / y) 2805 if (Unsafe) { 2806 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2807 .addUse(RHS) 2808 .setMIFlags(Flags); 2809 B.buildFMul(Res, LHS, RCP, Flags); 2810 2811 MI.eraseFromParent(); 2812 return true; 2813 } 2814 2815 return false; 2816 } 2817 2818 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2819 MachineRegisterInfo &MRI, 2820 MachineIRBuilder &B) const { 2821 B.setInstr(MI); 2822 Register Res = MI.getOperand(0).getReg(); 2823 Register LHS = MI.getOperand(1).getReg(); 2824 Register RHS = MI.getOperand(2).getReg(); 2825 2826 uint16_t Flags = MI.getFlags(); 2827 2828 LLT S16 = LLT::scalar(16); 2829 LLT S32 = LLT::scalar(32); 2830 2831 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2832 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2833 2834 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2835 .addUse(RHSExt.getReg(0)) 2836 .setMIFlags(Flags); 2837 2838 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2839 auto RDst = B.buildFPTrunc(S16, 
QUOT, Flags); 2840 2841 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2842 .addUse(RDst.getReg(0)) 2843 .addUse(RHS) 2844 .addUse(LHS) 2845 .setMIFlags(Flags); 2846 2847 MI.eraseFromParent(); 2848 return true; 2849 } 2850 2851 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2852 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2853 static void toggleSPDenormMode(bool Enable, 2854 MachineIRBuilder &B, 2855 const GCNSubtarget &ST, 2856 AMDGPU::SIModeRegisterDefaults Mode) { 2857 // Set SP denorm mode to this value. 2858 unsigned SPDenormMode = 2859 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2860 2861 if (ST.hasDenormModeInst()) { 2862 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2863 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2864 2865 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2866 B.buildInstr(AMDGPU::S_DENORM_MODE) 2867 .addImm(NewDenormModeValue); 2868 2869 } else { 2870 // Select FP32 bit field in mode register. 
2871 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2872 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2873 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2874 2875 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2876 .addImm(SPDenormMode) 2877 .addImm(SPDenormModeBitField); 2878 } 2879 } 2880 2881 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2882 MachineRegisterInfo &MRI, 2883 MachineIRBuilder &B) const { 2884 B.setInstr(MI); 2885 Register Res = MI.getOperand(0).getReg(); 2886 Register LHS = MI.getOperand(1).getReg(); 2887 Register RHS = MI.getOperand(2).getReg(); 2888 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2889 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2890 2891 uint16_t Flags = MI.getFlags(); 2892 2893 LLT S32 = LLT::scalar(32); 2894 LLT S1 = LLT::scalar(1); 2895 2896 auto One = B.buildFConstant(S32, 1.0f); 2897 2898 auto DenominatorScaled = 2899 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2900 .addUse(LHS) 2901 .addUse(RHS) 2902 .addImm(0) 2903 .setMIFlags(Flags); 2904 auto NumeratorScaled = 2905 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2906 .addUse(LHS) 2907 .addUse(RHS) 2908 .addImm(1) 2909 .setMIFlags(Flags); 2910 2911 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2912 .addUse(DenominatorScaled.getReg(0)) 2913 .setMIFlags(Flags); 2914 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2915 2916 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2917 // aren't modeled as reading it. 
2918 if (!Mode.allFP32Denormals()) 2919 toggleSPDenormMode(true, B, ST, Mode); 2920 2921 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2922 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2923 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2924 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2925 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2926 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2927 2928 if (!Mode.allFP32Denormals()) 2929 toggleSPDenormMode(false, B, ST, Mode); 2930 2931 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2932 .addUse(Fma4.getReg(0)) 2933 .addUse(Fma1.getReg(0)) 2934 .addUse(Fma3.getReg(0)) 2935 .addUse(NumeratorScaled.getReg(1)) 2936 .setMIFlags(Flags); 2937 2938 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2939 .addUse(Fmas.getReg(0)) 2940 .addUse(RHS) 2941 .addUse(LHS) 2942 .setMIFlags(Flags); 2943 2944 MI.eraseFromParent(); 2945 return true; 2946 } 2947 2948 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2949 MachineRegisterInfo &MRI, 2950 MachineIRBuilder &B) const { 2951 B.setInstr(MI); 2952 Register Res = MI.getOperand(0).getReg(); 2953 Register LHS = MI.getOperand(1).getReg(); 2954 Register RHS = MI.getOperand(2).getReg(); 2955 2956 uint16_t Flags = MI.getFlags(); 2957 2958 LLT S64 = LLT::scalar(64); 2959 LLT S1 = LLT::scalar(1); 2960 2961 auto One = B.buildFConstant(S64, 1.0); 2962 2963 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2964 .addUse(LHS) 2965 .addUse(RHS) 2966 .addImm(0) 2967 .setMIFlags(Flags); 2968 2969 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2970 2971 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2972 .addUse(DivScale0.getReg(0)) 2973 .setMIFlags(Flags); 2974 2975 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2976 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 2977 
auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 2978 2979 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2980 .addUse(LHS) 2981 .addUse(RHS) 2982 .addImm(1) 2983 .setMIFlags(Flags); 2984 2985 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 2986 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 2987 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 2988 2989 Register Scale; 2990 if (!ST.hasUsableDivScaleConditionOutput()) { 2991 // Workaround a hardware bug on SI where the condition output from div_scale 2992 // is not usable. 2993 2994 LLT S32 = LLT::scalar(32); 2995 2996 auto NumUnmerge = B.buildUnmerge(S32, LHS); 2997 auto DenUnmerge = B.buildUnmerge(S32, RHS); 2998 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2999 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3000 3001 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3002 Scale1Unmerge.getReg(1)); 3003 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3004 Scale0Unmerge.getReg(1)); 3005 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3006 } else { 3007 Scale = DivScale1.getReg(1); 3008 } 3009 3010 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3011 .addUse(Fma4.getReg(0)) 3012 .addUse(Fma3.getReg(0)) 3013 .addUse(Mul.getReg(0)) 3014 .addUse(Scale) 3015 .setMIFlags(Flags); 3016 3017 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3018 .addUse(Fmas.getReg(0)) 3019 .addUse(RHS) 3020 .addUse(LHS) 3021 .setMIFlags(Flags); 3022 3023 MI.eraseFromParent(); 3024 return true; 3025 } 3026 3027 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3028 MachineRegisterInfo &MRI, 3029 MachineIRBuilder &B) const { 3030 B.setInstr(MI); 3031 Register Res = MI.getOperand(0).getReg(); 3032 Register LHS = MI.getOperand(2).getReg(); 3033 Register RHS = MI.getOperand(3).getReg(); 3034 uint16_t Flags = MI.getFlags(); 3035 
3036 LLT S32 = LLT::scalar(32); 3037 LLT S1 = LLT::scalar(1); 3038 3039 auto Abs = B.buildFAbs(S32, RHS, Flags); 3040 const APFloat C0Val(1.0f); 3041 3042 auto C0 = B.buildConstant(S32, 0x6f800000); 3043 auto C1 = B.buildConstant(S32, 0x2f800000); 3044 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3045 3046 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3047 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3048 3049 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3050 3051 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3052 .addUse(Mul0.getReg(0)) 3053 .setMIFlags(Flags); 3054 3055 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3056 3057 B.buildFMul(Res, Sel, Mul1, Flags); 3058 3059 MI.eraseFromParent(); 3060 return true; 3061 } 3062 3063 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3064 MachineRegisterInfo &MRI, 3065 MachineIRBuilder &B) const { 3066 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3067 if (!MFI->isEntryFunction()) { 3068 return legalizePreloadedArgIntrin(MI, MRI, B, 3069 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3070 } 3071 3072 B.setInstr(MI); 3073 3074 uint64_t Offset = 3075 ST.getTargetLowering()->getImplicitParameterOffset( 3076 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3077 Register DstReg = MI.getOperand(0).getReg(); 3078 LLT DstTy = MRI.getType(DstReg); 3079 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3080 3081 const ArgDescriptor *Arg; 3082 const TargetRegisterClass *RC; 3083 std::tie(Arg, RC) 3084 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3085 if (!Arg) 3086 return false; 3087 3088 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3089 if (!loadInputValue(KernargPtrReg, B, Arg)) 3090 return false; 3091 3092 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3093 MI.eraseFromParent(); 3094 return true; 3095 } 3096 3097 bool 
AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3098 MachineRegisterInfo &MRI, 3099 MachineIRBuilder &B, 3100 unsigned AddrSpace) const { 3101 B.setInstr(MI); 3102 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3103 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3104 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3105 MI.eraseFromParent(); 3106 return true; 3107 } 3108 3109 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3110 // offset (the offset that is included in bounds checking and swizzling, to be 3111 // split between the instruction's voffset and immoffset fields) and soffset 3112 // (the offset that is excluded from bounds checking and swizzling, to go in 3113 // the instruction's soffset field). This function takes the first kind of 3114 // offset and figures out how to split it between voffset and immoffset. 3115 std::tuple<Register, unsigned, unsigned> 3116 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3117 Register OrigOffset) const { 3118 const unsigned MaxImm = 4095; 3119 Register BaseReg; 3120 unsigned TotalConstOffset; 3121 MachineInstr *OffsetDef; 3122 const LLT S32 = LLT::scalar(32); 3123 3124 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3125 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3126 3127 unsigned ImmOffset = TotalConstOffset; 3128 3129 // If the immediate value is too big for the immoffset field, put the value 3130 // and -4096 into the immoffset field so that the value that is copied/added 3131 // for the voffset field is a multiple of 4096, and it stands more chance 3132 // of being CSEd with the copy/add for another similar load/store. 3133 // However, do not do that rounding down to a multiple of 4096 if that is a 3134 // negative number, as it appears to be illegal to have a negative offset 3135 // in the vgpr, even if adding the immediate offset makes it positive. 
3136 unsigned Overflow = ImmOffset & ~MaxImm; 3137 ImmOffset -= Overflow; 3138 if ((int32_t)Overflow < 0) { 3139 Overflow += ImmOffset; 3140 ImmOffset = 0; 3141 } 3142 3143 if (Overflow != 0) { 3144 if (!BaseReg) { 3145 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3146 } else { 3147 auto OverflowVal = B.buildConstant(S32, Overflow); 3148 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3149 } 3150 } 3151 3152 if (!BaseReg) 3153 BaseReg = B.buildConstant(S32, 0).getReg(0); 3154 3155 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3156 } 3157 3158 /// Handle register layout difference for f16 images for some subtargets. 3159 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3160 MachineRegisterInfo &MRI, 3161 Register Reg) const { 3162 if (!ST.hasUnpackedD16VMem()) 3163 return Reg; 3164 3165 const LLT S16 = LLT::scalar(16); 3166 const LLT S32 = LLT::scalar(32); 3167 LLT StoreVT = MRI.getType(Reg); 3168 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3169 3170 auto Unmerge = B.buildUnmerge(S16, Reg); 3171 3172 SmallVector<Register, 4> WideRegs; 3173 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3174 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3175 3176 int NumElts = StoreVT.getNumElements(); 3177 3178 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3179 } 3180 3181 Register AMDGPULegalizerInfo::fixStoreSourceType( 3182 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3183 MachineRegisterInfo *MRI = B.getMRI(); 3184 LLT Ty = MRI->getType(VData); 3185 3186 const LLT S16 = LLT::scalar(16); 3187 3188 // Fixup illegal register types for i8 stores. 
3189 if (Ty == LLT::scalar(8) || Ty == S16) { 3190 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3191 return AnyExt; 3192 } 3193 3194 if (Ty.isVector()) { 3195 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3196 if (IsFormat) 3197 return handleD16VData(B, *MRI, VData); 3198 } 3199 } 3200 3201 return VData; 3202 } 3203 3204 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3205 MachineRegisterInfo &MRI, 3206 MachineIRBuilder &B, 3207 bool IsTyped, 3208 bool IsFormat) const { 3209 B.setInstr(MI); 3210 3211 Register VData = MI.getOperand(1).getReg(); 3212 LLT Ty = MRI.getType(VData); 3213 LLT EltTy = Ty.getScalarType(); 3214 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3215 const LLT S32 = LLT::scalar(32); 3216 3217 VData = fixStoreSourceType(B, VData, IsFormat); 3218 Register RSrc = MI.getOperand(2).getReg(); 3219 3220 MachineMemOperand *MMO = *MI.memoperands_begin(); 3221 const int MemSize = MMO->getSize(); 3222 3223 unsigned ImmOffset; 3224 unsigned TotalOffset; 3225 3226 // The typed intrinsics add an immediate after the registers. 3227 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3228 3229 // The struct intrinsic variants add one additional operand over raw. 
3230 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3231 Register VIndex; 3232 int OpOffset = 0; 3233 if (HasVIndex) { 3234 VIndex = MI.getOperand(3).getReg(); 3235 OpOffset = 1; 3236 } 3237 3238 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3239 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3240 3241 unsigned Format = 0; 3242 if (IsTyped) { 3243 Format = MI.getOperand(5 + OpOffset).getImm(); 3244 ++OpOffset; 3245 } 3246 3247 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3248 3249 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3250 if (TotalOffset != 0) 3251 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3252 3253 unsigned Opc; 3254 if (IsTyped) { 3255 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3256 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3257 } else if (IsFormat) { 3258 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3259 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3260 } else { 3261 switch (MemSize) { 3262 case 1: 3263 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3264 break; 3265 case 2: 3266 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3267 break; 3268 default: 3269 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3270 break; 3271 } 3272 } 3273 3274 if (!VIndex) 3275 VIndex = B.buildConstant(S32, 0).getReg(0); 3276 3277 auto MIB = B.buildInstr(Opc) 3278 .addUse(VData) // vdata 3279 .addUse(RSrc) // rsrc 3280 .addUse(VIndex) // vindex 3281 .addUse(VOffset) // voffset 3282 .addUse(SOffset) // soffset 3283 .addImm(ImmOffset); // offset(imm) 3284 3285 if (IsTyped) 3286 MIB.addImm(Format); 3287 3288 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3289 .addImm(HasVIndex ? 
-1 : 0) // idxen(imm) 3290 .addMemOperand(MMO); 3291 3292 MI.eraseFromParent(); 3293 return true; 3294 } 3295 3296 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3297 MachineRegisterInfo &MRI, 3298 MachineIRBuilder &B, 3299 bool IsFormat, 3300 bool IsTyped) const { 3301 B.setInstr(MI); 3302 3303 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3304 MachineMemOperand *MMO = *MI.memoperands_begin(); 3305 const int MemSize = MMO->getSize(); 3306 const LLT S32 = LLT::scalar(32); 3307 3308 Register Dst = MI.getOperand(0).getReg(); 3309 Register RSrc = MI.getOperand(2).getReg(); 3310 3311 // The typed intrinsics add an immediate after the registers. 3312 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3313 3314 // The struct intrinsic variants add one additional operand over raw. 3315 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3316 Register VIndex; 3317 int OpOffset = 0; 3318 if (HasVIndex) { 3319 VIndex = MI.getOperand(3).getReg(); 3320 OpOffset = 1; 3321 } 3322 3323 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3324 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3325 3326 unsigned Format = 0; 3327 if (IsTyped) { 3328 Format = MI.getOperand(5 + OpOffset).getImm(); 3329 ++OpOffset; 3330 } 3331 3332 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3333 unsigned ImmOffset; 3334 unsigned TotalOffset; 3335 3336 LLT Ty = MRI.getType(Dst); 3337 LLT EltTy = Ty.getScalarType(); 3338 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3339 const bool Unpacked = ST.hasUnpackedD16VMem(); 3340 3341 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3342 if (TotalOffset != 0) 3343 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3344 3345 unsigned Opc; 3346 3347 if (IsTyped) { 3348 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3349 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3350 } else if (IsFormat) { 3351 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3352 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3353 } else { 3354 switch (MemSize) { 3355 case 1: 3356 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3357 break; 3358 case 2: 3359 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3360 break; 3361 default: 3362 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3363 break; 3364 } 3365 } 3366 3367 Register LoadDstReg; 3368 3369 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3370 LLT UnpackedTy = Ty.changeElementSize(32); 3371 3372 if (IsExtLoad) 3373 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3374 else if (Unpacked && IsD16 && Ty.isVector()) 3375 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3376 else 3377 LoadDstReg = Dst; 3378 3379 if (!VIndex) 3380 VIndex = B.buildConstant(S32, 0).getReg(0); 3381 3382 auto MIB = B.buildInstr(Opc) 3383 .addDef(LoadDstReg) // vdata 3384 .addUse(RSrc) // rsrc 3385 .addUse(VIndex) // vindex 3386 .addUse(VOffset) // voffset 3387 .addUse(SOffset) // soffset 3388 .addImm(ImmOffset); // offset(imm) 3389 3390 if (IsTyped) 3391 MIB.addImm(Format); 3392 3393 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3394 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3395 .addMemOperand(MMO); 3396 3397 if (LoadDstReg != Dst) { 3398 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3399 3400 // Widen result for extending loads was widened. 
3401 if (IsExtLoad) 3402 B.buildTrunc(Dst, LoadDstReg); 3403 else { 3404 // Repack to original 16-bit vector result 3405 // FIXME: G_TRUNC should work, but legalization currently fails 3406 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3407 SmallVector<Register, 4> Repack; 3408 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3409 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3410 B.buildMerge(Dst, Repack); 3411 } 3412 } 3413 3414 MI.eraseFromParent(); 3415 return true; 3416 } 3417 3418 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3419 MachineIRBuilder &B, 3420 bool IsInc) const { 3421 B.setInstr(MI); 3422 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3423 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3424 B.buildInstr(Opc) 3425 .addDef(MI.getOperand(0).getReg()) 3426 .addUse(MI.getOperand(2).getReg()) 3427 .addUse(MI.getOperand(3).getReg()) 3428 .cloneMemRefs(MI); 3429 MI.eraseFromParent(); 3430 return true; 3431 } 3432 3433 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3434 switch (IntrID) { 3435 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3436 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3437 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3438 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3439 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3440 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3441 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3442 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3443 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3444 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3445 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3446 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3447 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3448 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3449 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3450 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3451 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3452 return 
AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3453 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3454 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3455 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3456 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3457 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3458 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3459 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3460 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3461 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3462 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3463 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3464 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3465 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3466 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3467 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3468 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3469 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3470 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3471 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3472 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3473 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3474 default: 3475 llvm_unreachable("unhandled atomic opcode"); 3476 } 3477 } 3478 3479 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3480 MachineIRBuilder &B, 3481 Intrinsic::ID IID) const { 3482 B.setInstr(MI); 3483 3484 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3485 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3486 3487 Register Dst = MI.getOperand(0).getReg(); 3488 Register VData = MI.getOperand(2).getReg(); 3489 3490 Register CmpVal; 3491 int OpOffset = 0; 3492 3493 if (IsCmpSwap) { 3494 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3495 ++OpOffset; 3496 } 3497 3498 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3499 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3500 3501 // The struct intrinsic variants add one additional operand over raw. 
3502 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3503 Register VIndex; 3504 if (HasVIndex) { 3505 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3506 ++OpOffset; 3507 } 3508 3509 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3510 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3511 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3512 3513 MachineMemOperand *MMO = *MI.memoperands_begin(); 3514 3515 unsigned ImmOffset; 3516 unsigned TotalOffset; 3517 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3518 if (TotalOffset != 0) 3519 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3520 3521 if (!VIndex) 3522 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3523 3524 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3525 .addDef(Dst) 3526 .addUse(VData); // vdata 3527 3528 if (IsCmpSwap) 3529 MIB.addReg(CmpVal); 3530 3531 MIB.addUse(RSrc) // rsrc 3532 .addUse(VIndex) // vindex 3533 .addUse(VOffset) // voffset 3534 .addUse(SOffset) // soffset 3535 .addImm(ImmOffset) // offset(imm) 3536 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3537 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3538 .addMemOperand(MMO); 3539 3540 MI.eraseFromParent(); 3541 return true; 3542 } 3543 3544 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3545 /// vector with s16 typed elements. 3546 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3547 SmallVectorImpl<Register> &PackedAddrs, 3548 int AddrIdx, int DimIdx, int NumVAddrs, 3549 int NumGradients) { 3550 const LLT S16 = LLT::scalar(16); 3551 const LLT V2S16 = LLT::vector(2, 16); 3552 3553 for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) { 3554 MachineOperand &SrcOp = MI.getOperand(I); 3555 if (!SrcOp.isReg()) 3556 continue; // _L to _LZ may have eliminated this. 
3557 3558 Register AddrReg = SrcOp.getReg(); 3559 3560 if (I < DimIdx) { 3561 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3562 PackedAddrs.push_back(AddrReg); 3563 } else { 3564 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3565 // derivatives dx/dh and dx/dv are packed with undef. 3566 if (((I + 1) >= (AddrIdx + NumVAddrs)) || 3567 ((NumGradients / 2) % 2 == 1 && 3568 (I == DimIdx + (NumGradients / 2) - 1 || 3569 I == DimIdx + NumGradients - 1)) || 3570 // Check for _L to _LZ optimization 3571 !MI.getOperand(I + 1).isReg()) { 3572 PackedAddrs.push_back( 3573 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3574 .getReg(0)); 3575 } else { 3576 PackedAddrs.push_back( 3577 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3578 .getReg(0)); 3579 ++I; 3580 } 3581 } 3582 } 3583 } 3584 3585 /// Convert from separate vaddr components to a single vector address register, 3586 /// and replace the remaining operands with $noreg. 3587 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3588 int DimIdx, int NumVAddrs) { 3589 const LLT S32 = LLT::scalar(32); 3590 3591 SmallVector<Register, 8> AddrRegs; 3592 for (int I = 0; I != NumVAddrs; ++I) { 3593 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3594 if (SrcOp.isReg()) { 3595 AddrRegs.push_back(SrcOp.getReg()); 3596 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3597 } 3598 } 3599 3600 int NumAddrRegs = AddrRegs.size(); 3601 if (NumAddrRegs != 1) { 3602 // Round up to 8 elements for v5-v7 3603 // FIXME: Missing intermediate sized register classes and instructions. 
3604 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 3605 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 3606 auto Undef = B.buildUndef(S32); 3607 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 3608 NumAddrRegs = RoundedNumRegs; 3609 } 3610 3611 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 3612 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 3613 } 3614 3615 for (int I = 1; I != NumVAddrs; ++I) { 3616 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3617 if (SrcOp.isReg()) 3618 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 3619 } 3620 } 3621 3622 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 3623 /// 3624 /// Depending on the subtarget, load/store with 16-bit element data need to be 3625 /// rewritten to use the low half of 32-bit registers, or directly use a packed 3626 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 3627 /// registers. 3628 /// 3629 /// We don't want to directly select image instructions just yet, but also want 3630 /// to exposes all register repacking to the legalizer/combiners. We also don't 3631 /// want a selected instrution entering RegBankSelect. In order to avoid 3632 /// defining a multitude of intermediate image instructions, directly hack on 3633 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 3634 /// now unnecessary arguments with $noreg. 3635 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3636 MachineInstr &MI, MachineIRBuilder &B, 3637 GISelChangeObserver &Observer, 3638 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3639 B.setInstr(MI); 3640 3641 const int NumDefs = MI.getNumExplicitDefs(); 3642 bool IsTFE = NumDefs == 2; 3643 // We are only processing the operands of d16 image operations on subtargets 3644 // that use the unpacked register layout, or need to repack the TFE result. 3645 3646 // TODO: Do we need to guard against already legalized intrinsics? 
3647 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3648 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3649 3650 MachineRegisterInfo *MRI = B.getMRI(); 3651 const LLT S32 = LLT::scalar(32); 3652 const LLT S16 = LLT::scalar(16); 3653 const LLT V2S16 = LLT::vector(2, 16); 3654 3655 // Index of first address argument 3656 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 3657 3658 // Check for 16 bit addresses and pack if true. 3659 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 3660 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 3661 const bool IsA16 = AddrTy == S16; 3662 3663 int NumVAddrs, NumGradients; 3664 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 3665 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 3666 getDMaskIdx(BaseOpcode, NumDefs); 3667 unsigned DMask = 0; 3668 3669 int DMaskLanes = 0; 3670 if (!BaseOpcode->Atomic) { 3671 DMask = MI.getOperand(DMaskIdx).getImm(); 3672 if (BaseOpcode->Gather4) { 3673 DMaskLanes = 4; 3674 } else if (DMask != 0) { 3675 DMaskLanes = countPopulation(DMask); 3676 } else if (!IsTFE && !BaseOpcode->Store) { 3677 // If dmask is 0, this is a no-op load. This can be eliminated. 3678 B.buildUndef(MI.getOperand(0)); 3679 MI.eraseFromParent(); 3680 return true; 3681 } 3682 } 3683 3684 Observer.changingInstr(MI); 3685 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 3686 3687 unsigned NewOpcode = NumDefs == 0 ? 
3688 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 3689 3690 // Track that we legalized this 3691 MI.setDesc(B.getTII().get(NewOpcode)); 3692 3693 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 3694 // dmask to be at least 1 otherwise the instruction will fail 3695 if (IsTFE && DMask == 0) { 3696 DMask = 0x1; 3697 DMaskLanes = 1; 3698 MI.getOperand(DMaskIdx).setImm(DMask); 3699 } 3700 3701 if (BaseOpcode->Atomic) { 3702 Register VData0 = MI.getOperand(2).getReg(); 3703 LLT Ty = MRI->getType(VData0); 3704 3705 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 3706 if (Ty.isVector()) 3707 return false; 3708 3709 if (BaseOpcode->AtomicX2) { 3710 Register VData1 = MI.getOperand(3).getReg(); 3711 // The two values are packed in one register. 3712 LLT PackedTy = LLT::vector(2, Ty); 3713 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 3714 MI.getOperand(2).setReg(Concat.getReg(0)); 3715 MI.getOperand(3).setReg(AMDGPU::NoRegister); 3716 } 3717 } 3718 3719 int CorrectedNumVAddrs = NumVAddrs; 3720 3721 // Optimize _L to _LZ when _L is zero 3722 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 3723 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 3724 const ConstantFP *ConstantLod; 3725 const int LodIdx = AddrIdx + NumVAddrs - 1; 3726 3727 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 3728 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 3729 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 3730 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 3731 LZMappingInfo->LZ, ImageDimIntr->Dim); 3732 3733 // The starting indexes should remain in the same place. 
3734 --NumVAddrs; 3735 --CorrectedNumVAddrs; 3736 3737 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 3738 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 3739 MI.RemoveOperand(LodIdx); 3740 } 3741 } 3742 } 3743 3744 // Optimize _mip away, when 'lod' is zero 3745 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 3746 int64_t ConstantLod; 3747 const int LodIdx = AddrIdx + NumVAddrs - 1; 3748 3749 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 3750 if (ConstantLod == 0) { 3751 // TODO: Change intrinsic opcode and remove operand instead or replacing 3752 // it with 0, as the _L to _LZ handling is done above. 3753 MI.getOperand(LodIdx).ChangeToImmediate(0); 3754 --CorrectedNumVAddrs; 3755 } 3756 } 3757 } 3758 3759 // If the register allocator cannot place the address registers contiguously 3760 // without introducing moves, then using the non-sequential address encoding 3761 // is always preferable, since it saves VALU instructions and is usually a 3762 // wash in terms of code size or even better. 3763 // 3764 // However, we currently have no way of hinting to the register allocator 3765 // that MIMG addresses should be placed contiguously when it is possible to 3766 // do so, so force non-NSA for the common 2-address case as a heuristic. 3767 // 3768 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3769 // allocation when possible. 3770 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3771 3772 // Rewrite the addressing register layout before doing anything else. 3773 if (IsA16) { 3774 // FIXME: this feature is missing from gfx10. When that is fixed, this check 3775 // should be introduced. 
3776 if (!ST.hasR128A16() && !ST.hasGFX10A16()) 3777 return false; 3778 3779 if (NumVAddrs > 1) { 3780 SmallVector<Register, 4> PackedRegs; 3781 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs, 3782 NumGradients); 3783 3784 if (!UseNSA && PackedRegs.size() > 1) { 3785 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3786 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3787 PackedRegs[0] = Concat.getReg(0); 3788 PackedRegs.resize(1); 3789 } 3790 3791 const int NumPacked = PackedRegs.size(); 3792 for (int I = 0; I != NumVAddrs; ++I) { 3793 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3794 if (!SrcOp.isReg()) { 3795 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3796 continue; 3797 } 3798 3799 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3800 3801 if (I < NumPacked) 3802 SrcOp.setReg(PackedRegs[I]); 3803 else 3804 SrcOp.setReg(AMDGPU::NoRegister); 3805 } 3806 } 3807 } else if (!UseNSA && NumVAddrs > 1) { 3808 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3809 } 3810 3811 3812 if (BaseOpcode->Store) { // No TFE for stores? 3813 // TODO: Handle dmask trim 3814 Register VData = MI.getOperand(1).getReg(); 3815 LLT Ty = MRI->getType(VData); 3816 if (!Ty.isVector() || Ty.getElementType() != S16) 3817 return true; 3818 3819 B.setInstr(MI); 3820 3821 Register RepackedReg = handleD16VData(B, *MRI, VData); 3822 if (RepackedReg != VData) { 3823 MI.getOperand(1).setReg(RepackedReg); 3824 } 3825 3826 return true; 3827 } 3828 3829 Register DstReg = MI.getOperand(0).getReg(); 3830 LLT Ty = MRI->getType(DstReg); 3831 const LLT EltTy = Ty.getScalarType(); 3832 const bool IsD16 = Ty.getScalarType() == S16; 3833 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3834 3835 // Confirm that the return type is large enough for the dmask specified 3836 if (NumElts < DMaskLanes) 3837 return false; 3838 3839 if (NumElts > 4 || DMaskLanes > 4) 3840 return false; 3841 3842 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 
1 : DMaskLanes; 3843 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3844 3845 // The raw dword aligned data component of the load. The only legal cases 3846 // where this matters should be when using the packed D16 format, for 3847 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3848 LLT RoundedTy; 3849 3850 // S32 vector to to cover all data, plus TFE result element. 3851 LLT TFETy; 3852 3853 // Register type to use for each loaded component. Will be S32 or V2S16. 3854 LLT RegTy; 3855 3856 if (IsD16 && ST.hasUnpackedD16VMem()) { 3857 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 3858 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 3859 RegTy = S32; 3860 } else { 3861 unsigned EltSize = EltTy.getSizeInBits(); 3862 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 3863 unsigned RoundedSize = 32 * RoundedElts; 3864 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3865 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3866 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 3867 } 3868 3869 // The return type does not need adjustment. 3870 // TODO: Should we change s16 case to s32 or <2 x s16>? 3871 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 3872 return true; 3873 3874 Register Dst1Reg; 3875 3876 // Insert after the instruction. 3877 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3878 3879 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 3880 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 3881 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 3882 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 3883 3884 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 3885 3886 MI.getOperand(0).setReg(NewResultReg); 3887 3888 // In the IR, TFE is supposed to be used with a 2 element struct return 3889 // type. 
The intruction really returns these two values in one contiguous 3890 // register, with one additional dword beyond the loaded data. Rewrite the 3891 // return type to use a single register result. 3892 3893 if (IsTFE) { 3894 Dst1Reg = MI.getOperand(1).getReg(); 3895 if (MRI->getType(Dst1Reg) != S32) 3896 return false; 3897 3898 // TODO: Make sure the TFE operand bit is set. 3899 MI.RemoveOperand(1); 3900 3901 // Handle the easy case that requires no repack instructions. 3902 if (Ty == S32) { 3903 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 3904 return true; 3905 } 3906 } 3907 3908 // Now figure out how to copy the new result register back into the old 3909 // result. 3910 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 3911 3912 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 3913 3914 if (ResultNumRegs == 1) { 3915 assert(!IsTFE); 3916 ResultRegs[0] = NewResultReg; 3917 } else { 3918 // We have to repack into a new vector of some kind. 3919 for (int I = 0; I != NumDataRegs; ++I) 3920 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 3921 B.buildUnmerge(ResultRegs, NewResultReg); 3922 3923 // Drop the final TFE element to get the data part. The TFE result is 3924 // directly written to the right place already. 3925 if (IsTFE) 3926 ResultRegs.resize(NumDataRegs); 3927 } 3928 3929 // For an s16 scalar result, we form an s32 result with a truncate regardless 3930 // of packed vs. unpacked. 3931 if (IsD16 && !Ty.isVector()) { 3932 B.buildTrunc(DstReg, ResultRegs[0]); 3933 return true; 3934 } 3935 3936 // Avoid a build/concat_vector of 1 entry. 3937 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 3938 B.buildBitcast(DstReg, ResultRegs[0]); 3939 return true; 3940 } 3941 3942 assert(Ty.isVector()); 3943 3944 if (IsD16) { 3945 // For packed D16 results with TFE enabled, all the data components are 3946 // S32. Cast back to the expected type. 3947 // 3948 // TODO: We don't really need to use load s32 elements. 
We would only need one 3949 // cast for the TFE result if a multiple of v2s16 was used. 3950 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 3951 for (Register &Reg : ResultRegs) 3952 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 3953 } else if (ST.hasUnpackedD16VMem()) { 3954 for (Register &Reg : ResultRegs) 3955 Reg = B.buildTrunc(S16, Reg).getReg(0); 3956 } 3957 } 3958 3959 auto padWithUndef = [&](LLT Ty, int NumElts) { 3960 if (NumElts == 0) 3961 return; 3962 Register Undef = B.buildUndef(Ty).getReg(0); 3963 for (int I = 0; I != NumElts; ++I) 3964 ResultRegs.push_back(Undef); 3965 }; 3966 3967 // Pad out any elements eliminated due to the dmask. 3968 LLT ResTy = MRI->getType(ResultRegs[0]); 3969 if (!ResTy.isVector()) { 3970 padWithUndef(ResTy, NumElts - ResultRegs.size()); 3971 B.buildBuildVector(DstReg, ResultRegs); 3972 return true; 3973 } 3974 3975 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 3976 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 3977 3978 // Deal with the one annoying legal case. 3979 const LLT V3S16 = LLT::vector(3, 16); 3980 if (Ty == V3S16) { 3981 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 3982 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 3983 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 3984 return true; 3985 } 3986 3987 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 3988 B.buildConcatVectors(DstReg, ResultRegs); 3989 return true; 3990 } 3991 3992 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 3993 MachineInstr &MI, MachineIRBuilder &B, 3994 GISelChangeObserver &Observer) const { 3995 Register Dst = MI.getOperand(0).getReg(); 3996 LLT Ty = B.getMRI()->getType(Dst); 3997 unsigned Size = Ty.getSizeInBits(); 3998 MachineFunction &MF = B.getMF(); 3999 4000 Observer.changingInstr(MI); 4001 4002 // FIXME: We don't really need this intermediate instruction. The intrinsic 4003 // should be fixed to have a memory operand. 
Since it's readnone, we're not 4004 // allowed to add one. 4005 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4006 MI.RemoveOperand(1); // Remove intrinsic ID 4007 4008 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4009 // TODO: Should this use datalayout alignment? 4010 const unsigned MemSize = (Size + 7) / 8; 4011 const Align MemAlign(4); 4012 MachineMemOperand *MMO = MF.getMachineMemOperand( 4013 MachinePointerInfo(), 4014 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4015 MachineMemOperand::MOInvariant, 4016 MemSize, MemAlign); 4017 MI.addMemOperand(MF, MMO); 4018 4019 // There are no 96-bit result scalar loads, but widening to 128-bit should 4020 // always be legal. We may need to restore this to a 96-bit result if it turns 4021 // out this needs to be converted to a vector load during RegBankSelect. 4022 if (!isPowerOf2_32(Size)) { 4023 LegalizerHelper Helper(MF, *this, Observer, B); 4024 B.setInstr(MI); 4025 4026 if (Ty.isVector()) 4027 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4028 else 4029 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4030 } 4031 4032 Observer.changedInstr(MI); 4033 return true; 4034 } 4035 4036 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4037 MachineRegisterInfo &MRI, 4038 MachineIRBuilder &B) const { 4039 B.setInstr(MI); 4040 4041 // Is non-HSA path or trap-handler disabled? 
then, insert s_endpgm instruction 4042 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4043 !ST.isTrapHandlerEnabled()) { 4044 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4045 } else { 4046 // Pass queue pointer to trap handler as input, and insert trap instruction 4047 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4048 const ArgDescriptor *Arg = 4049 getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR); 4050 if (!Arg) 4051 return false; 4052 MachineRegisterInfo &MRI = *B.getMRI(); 4053 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4054 Register LiveIn = getLiveInRegister( 4055 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), 4056 /*InsertLiveInCopy=*/false); 4057 if (!loadInputValue(LiveIn, B, Arg)) 4058 return false; 4059 B.buildCopy(SGPR01, LiveIn); 4060 B.buildInstr(AMDGPU::S_TRAP) 4061 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4062 .addReg(SGPR01, RegState::Implicit); 4063 } 4064 4065 MI.eraseFromParent(); 4066 return true; 4067 } 4068 4069 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4070 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4071 B.setInstr(MI); 4072 4073 // Is non-HSA path or trap-handler disabled? 
then, report a warning 4074 // accordingly 4075 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4076 !ST.isTrapHandlerEnabled()) { 4077 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4078 "debugtrap handler not supported", 4079 MI.getDebugLoc(), DS_Warning); 4080 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4081 Ctx.diagnose(NoTrap); 4082 } else { 4083 // Insert debug-trap instruction 4084 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4085 } 4086 4087 MI.eraseFromParent(); 4088 return true; 4089 } 4090 4091 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 4092 MachineIRBuilder &B, 4093 GISelChangeObserver &Observer) const { 4094 MachineRegisterInfo &MRI = *B.getMRI(); 4095 4096 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4097 auto IntrID = MI.getIntrinsicID(); 4098 switch (IntrID) { 4099 case Intrinsic::amdgcn_if: 4100 case Intrinsic::amdgcn_else: { 4101 MachineInstr *Br = nullptr; 4102 MachineBasicBlock *UncondBrTarget = nullptr; 4103 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4104 const SIRegisterInfo *TRI 4105 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4106 4107 B.setInstr(*BrCond); 4108 Register Def = MI.getOperand(1).getReg(); 4109 Register Use = MI.getOperand(3).getReg(); 4110 4111 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4112 if (IntrID == Intrinsic::amdgcn_if) { 4113 B.buildInstr(AMDGPU::SI_IF) 4114 .addDef(Def) 4115 .addUse(Use) 4116 .addMBB(UncondBrTarget); 4117 } else { 4118 B.buildInstr(AMDGPU::SI_ELSE) 4119 .addDef(Def) 4120 .addUse(Use) 4121 .addMBB(UncondBrTarget) 4122 .addImm(0); 4123 } 4124 4125 if (Br) { 4126 Br->getOperand(0).setMBB(CondBrTarget); 4127 } else { 4128 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4129 // since we're swapping branch targets it needs to be reinserted. 
4130 // FIXME: IRTranslator should probably not do this 4131 B.buildBr(*CondBrTarget); 4132 } 4133 4134 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4135 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4136 MI.eraseFromParent(); 4137 BrCond->eraseFromParent(); 4138 return true; 4139 } 4140 4141 return false; 4142 } 4143 case Intrinsic::amdgcn_loop: { 4144 MachineInstr *Br = nullptr; 4145 MachineBasicBlock *UncondBrTarget = nullptr; 4146 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4147 const SIRegisterInfo *TRI 4148 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4149 4150 B.setInstr(*BrCond); 4151 4152 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4153 Register Reg = MI.getOperand(2).getReg(); 4154 B.buildInstr(AMDGPU::SI_LOOP) 4155 .addUse(Reg) 4156 .addMBB(UncondBrTarget); 4157 4158 if (Br) 4159 Br->getOperand(0).setMBB(CondBrTarget); 4160 else 4161 B.buildBr(*CondBrTarget); 4162 4163 MI.eraseFromParent(); 4164 BrCond->eraseFromParent(); 4165 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4166 return true; 4167 } 4168 4169 return false; 4170 } 4171 case Intrinsic::amdgcn_kernarg_segment_ptr: 4172 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4173 B.setInstr(MI); 4174 // This only makes sense to call in a kernel, so just lower to null. 
4175 B.buildConstant(MI.getOperand(0).getReg(), 0); 4176 MI.eraseFromParent(); 4177 return true; 4178 } 4179 4180 return legalizePreloadedArgIntrin( 4181 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4182 case Intrinsic::amdgcn_implicitarg_ptr: 4183 return legalizeImplicitArgPtr(MI, MRI, B); 4184 case Intrinsic::amdgcn_workitem_id_x: 4185 return legalizePreloadedArgIntrin(MI, MRI, B, 4186 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4187 case Intrinsic::amdgcn_workitem_id_y: 4188 return legalizePreloadedArgIntrin(MI, MRI, B, 4189 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4190 case Intrinsic::amdgcn_workitem_id_z: 4191 return legalizePreloadedArgIntrin(MI, MRI, B, 4192 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4193 case Intrinsic::amdgcn_workgroup_id_x: 4194 return legalizePreloadedArgIntrin(MI, MRI, B, 4195 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4196 case Intrinsic::amdgcn_workgroup_id_y: 4197 return legalizePreloadedArgIntrin(MI, MRI, B, 4198 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4199 case Intrinsic::amdgcn_workgroup_id_z: 4200 return legalizePreloadedArgIntrin(MI, MRI, B, 4201 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4202 case Intrinsic::amdgcn_dispatch_ptr: 4203 return legalizePreloadedArgIntrin(MI, MRI, B, 4204 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4205 case Intrinsic::amdgcn_queue_ptr: 4206 return legalizePreloadedArgIntrin(MI, MRI, B, 4207 AMDGPUFunctionArgInfo::QUEUE_PTR); 4208 case Intrinsic::amdgcn_implicit_buffer_ptr: 4209 return legalizePreloadedArgIntrin( 4210 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4211 case Intrinsic::amdgcn_dispatch_id: 4212 return legalizePreloadedArgIntrin(MI, MRI, B, 4213 AMDGPUFunctionArgInfo::DISPATCH_ID); 4214 case Intrinsic::amdgcn_fdiv_fast: 4215 return legalizeFDIVFastIntrin(MI, MRI, B); 4216 case Intrinsic::amdgcn_is_shared: 4217 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4218 case Intrinsic::amdgcn_is_private: 4219 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 
4220 case Intrinsic::amdgcn_wavefrontsize: { 4221 B.setInstr(MI); 4222 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4223 MI.eraseFromParent(); 4224 return true; 4225 } 4226 case Intrinsic::amdgcn_s_buffer_load: 4227 return legalizeSBufferLoad(MI, B, Observer); 4228 case Intrinsic::amdgcn_raw_buffer_store: 4229 case Intrinsic::amdgcn_struct_buffer_store: 4230 return legalizeBufferStore(MI, MRI, B, false, false); 4231 case Intrinsic::amdgcn_raw_buffer_store_format: 4232 case Intrinsic::amdgcn_struct_buffer_store_format: 4233 return legalizeBufferStore(MI, MRI, B, false, true); 4234 case Intrinsic::amdgcn_raw_tbuffer_store: 4235 case Intrinsic::amdgcn_struct_tbuffer_store: 4236 return legalizeBufferStore(MI, MRI, B, true, true); 4237 case Intrinsic::amdgcn_raw_buffer_load: 4238 case Intrinsic::amdgcn_struct_buffer_load: 4239 return legalizeBufferLoad(MI, MRI, B, false, false); 4240 case Intrinsic::amdgcn_raw_buffer_load_format: 4241 case Intrinsic::amdgcn_struct_buffer_load_format: 4242 return legalizeBufferLoad(MI, MRI, B, true, false); 4243 case Intrinsic::amdgcn_raw_tbuffer_load: 4244 case Intrinsic::amdgcn_struct_tbuffer_load: 4245 return legalizeBufferLoad(MI, MRI, B, true, true); 4246 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4247 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4248 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4249 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4250 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4251 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4252 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4253 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4254 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4255 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4256 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4257 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4258 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4259 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4260 case 
Intrinsic::amdgcn_raw_buffer_atomic_and: 4261 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4262 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4263 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4264 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4265 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4266 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4267 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4268 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4269 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4270 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4271 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4272 return legalizeBufferAtomic(MI, B, IntrID); 4273 case Intrinsic::amdgcn_atomic_inc: 4274 return legalizeAtomicIncDec(MI, B, true); 4275 case Intrinsic::amdgcn_atomic_dec: 4276 return legalizeAtomicIncDec(MI, B, false); 4277 case Intrinsic::trap: 4278 return legalizeTrapIntrinsic(MI, MRI, B); 4279 case Intrinsic::debugtrap: 4280 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4281 default: { 4282 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4283 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4284 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 4285 return true; 4286 } 4287 } 4288 4289 return true; 4290 } 4291