//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
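    // Break the vector into roughly 64-bit pieces: count how many 64-bit
    // chunks the full type spans and split the element count across them.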
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getElementType() == Type;
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() <
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx0].getSizeInBits() >
           Query.Types[TypeIdx1].getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ?
    S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ?
                 S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // FIXME: Clamp offset operand.
  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(isPointer(0))
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(typeInSet(1, {S64, S32}))
    .minScalar(1, S32)
    .maxScalarIf(sizeIs(0, 32), 1, S32)
    .maxScalarIf(sizeIs(0, 64), 1, S64)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(narrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return IsLoad ? 512 : 128;
    default:
      return 128;
    }
  };

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
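    // Round the memory size up to the number of 32-bit registers it occupies.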
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(AddrSpace, true))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
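        // (shouldWidenLoadResult only allows this when the alignment is at
        // least the rounded-up size, so the wider access stays within the same
        // naturally aligned block.)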
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
          [=](const LegalityQuery &Query) -> bool {
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            const unsigned DstSize = DstTy.getSizeInBits();
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;

            // Split extloads.
            if (DstSize > MemSize)
              return std::make_pair(0, LLT::scalar(MemSize));

            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(0, LLT::scalar(FloorSize));
            }

            if (DstSize > 32 && (DstSize % 32 != 0)) {
              // FIXME: Need a way to specify non-extload of larger size if
              // suitably aligned.
              return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
            }

            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);
            if (MemSize > MaxSize)
              return std::make_pair(0, LLT::scalar(MaxSize));

            unsigned Align = Query.MMODescrs[0].AlignInBits;
            return std::make_pair(0, LLT::scalar(Align));
          })
        .fewerElementsIf(
          [=](const LegalityQuery &Query) -> bool {
            return Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            LLT EltTy = DstTy.getElementType();
            unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);

            // FIXME: Handle widened to power of 2 results better. This ends
            // up scalarizing.
            // FIXME: 3 element stores scalarized on SI

            // Split if it's too large for the address space.
            if (Query.MMODescrs[0].SizeInBits > MaxSize) {
              unsigned NumElts = DstTy.getNumElements();
              unsigned EltSize = EltTy.getSizeInBits();

              if (MaxSize % EltSize == 0) {
                return std::make_pair(
                  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
              }

              unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

              // FIXME: Refine when odd breakdowns handled
              // The scalars will need to be re-legalized.
              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::make_pair(0, EltTy);

              return std::make_pair(0,
                                    LLT::vector(NumElts / NumPieces, EltTy));
            }

            // FIXME: We could probably handle weird extending loads better.
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;
            if (DstTy.getSizeInBits() > MemSize)
              return std::make_pair(0, EltTy);

            unsigned EltSize = EltTy.getSizeInBits();
            unsigned DstSize = DstTy.getSizeInBits();
            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(
                0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
            }

            // Need to split because of alignment.
            unsigned Align = Query.MMODescrs[0].AlignInBits;
            if (EltSize > Align &&
                (EltSize / Align < DstTy.getNumElements())) {
              return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
            }

            // May need relegalization for the scalars.
            return std::make_pair(0, EltTy);
          })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
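  // As with the compares above, the s32 condition form is for results produced
  // in SCC and the s1 form for VCC; RegBankSelect resolves which one applies.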
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register
AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
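    // A 32-bit constant pointer is just the low 32 bits of the 64-bit constant
    // pointer, so extracting bits [31:0] implements the cast.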
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
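  // Adding and then subtracting a copy-signed 2^52 forces the value to be
  // rounded to an integer, since doubles with magnitude >= 2^52 have no
  // fractional bits. Inputs already at least that large are passed through
  // unchanged by the select below.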
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
1695 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1696 1697 auto Shr = B.buildAShr(S64, FractMask, Exp); 1698 auto Not = B.buildNot(S64, Shr); 1699 auto Tmp0 = B.buildAnd(S64, Src, Not); 1700 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1701 1702 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1703 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1704 1705 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1706 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1707 return true; 1708 } 1709 1710 bool AMDGPULegalizerInfo::legalizeITOFP( 1711 MachineInstr &MI, MachineRegisterInfo &MRI, 1712 MachineIRBuilder &B, bool Signed) const { 1713 B.setInstr(MI); 1714 1715 Register Dst = MI.getOperand(0).getReg(); 1716 Register Src = MI.getOperand(1).getReg(); 1717 1718 const LLT S64 = LLT::scalar(64); 1719 const LLT S32 = LLT::scalar(32); 1720 1721 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1722 1723 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1724 1725 auto CvtHi = Signed ? 1726 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1727 B.buildUITOFP(S64, Unmerge.getReg(1)); 1728 1729 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1730 1731 auto ThirtyTwo = B.buildConstant(S32, 32); 1732 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1733 .addUse(CvtHi.getReg(0)) 1734 .addUse(ThirtyTwo.getReg(0)); 1735 1736 // TODO: Should this propagate fast-math-flags? 1737 B.buildFAdd(Dst, LdExp, CvtLo); 1738 MI.eraseFromParent(); 1739 return true; 1740 } 1741 1742 // TODO: Copied from DAG implementation. Verify logic and document how this 1743 // actually works. 1744 bool AMDGPULegalizerInfo::legalizeFPTOI( 1745 MachineInstr &MI, MachineRegisterInfo &MRI, 1746 MachineIRBuilder &B, bool Signed) const { 1747 B.setInstr(MI); 1748 1749 Register Dst = MI.getOperand(0).getReg(); 1750 Register Src = MI.getOperand(1).getReg(); 1751 1752 const LLT S64 = LLT::scalar(64); 1753 const LLT S32 = LLT::scalar(32); 1754 1755 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1756 1757 unsigned Flags = MI.getFlags(); 1758 1759 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1760 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1761 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1762 1763 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1764 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1765 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1766 1767 auto Hi = Signed ? 
1768 B.buildFPTOSI(S32, FloorMul) : 1769 B.buildFPTOUI(S32, FloorMul); 1770 auto Lo = B.buildFPTOUI(S32, Fma); 1771 1772 B.buildMerge(Dst, { Lo, Hi }); 1773 MI.eraseFromParent(); 1774 1775 return true; 1776 } 1777 1778 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1779 MachineInstr &MI, MachineRegisterInfo &MRI, 1780 MachineIRBuilder &B) const { 1781 MachineFunction &MF = B.getMF(); 1782 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1783 1784 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1785 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1786 1787 // With ieee_mode disabled, the instructions have the correct behavior 1788 // already for G_FMINNUM/G_FMAXNUM 1789 if (!MFI->getMode().IEEE) 1790 return !IsIEEEOp; 1791 1792 if (IsIEEEOp) 1793 return true; 1794 1795 MachineIRBuilder HelperBuilder(MI); 1796 GISelObserverWrapper DummyObserver; 1797 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1798 HelperBuilder.setInstr(MI); 1799 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1800 } 1801 1802 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1803 MachineInstr &MI, MachineRegisterInfo &MRI, 1804 MachineIRBuilder &B) const { 1805 // TODO: Should move some of this into LegalizerHelper. 1806 1807 // TODO: Promote dynamic indexing of s16 to s32 1808 1809 // FIXME: Artifact combiner probably should have replaced the truncated 1810 // constant before this, so we shouldn't need 1811 // getConstantVRegValWithLookThrough. 1812 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1813 MI.getOperand(2).getReg(), MRI); 1814 if (!IdxVal) // Dynamic case will be selected to register indexing. 1815 return true; 1816 1817 Register Dst = MI.getOperand(0).getReg(); 1818 Register Vec = MI.getOperand(1).getReg(); 1819 1820 LLT VecTy = MRI.getType(Vec); 1821 LLT EltTy = VecTy.getElementType(); 1822 assert(EltTy == MRI.getType(Dst)); 1823 1824 B.setInstr(MI); 1825 1826 if (IdxVal->Value < VecTy.getNumElements()) 1827 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1828 else 1829 B.buildUndef(Dst); 1830 1831 MI.eraseFromParent(); 1832 return true; 1833 } 1834 1835 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1836 MachineInstr &MI, MachineRegisterInfo &MRI, 1837 MachineIRBuilder &B) const { 1838 // TODO: Should move some of this into LegalizerHelper. 1839 1840 // TODO: Promote dynamic indexing of s16 to s32 1841 1842 // FIXME: Artifact combiner probably should have replaced the truncated 1843 // constant before this, so we shouldn't need 1844 // getConstantVRegValWithLookThrough. 1845 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1846 MI.getOperand(3).getReg(), MRI); 1847 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1848 return true; 1849 1850 Register Dst = MI.getOperand(0).getReg(); 1851 Register Vec = MI.getOperand(1).getReg(); 1852 Register Ins = MI.getOperand(2).getReg(); 1853 1854 LLT VecTy = MRI.getType(Vec); 1855 LLT EltTy = VecTy.getElementType(); 1856 assert(EltTy == MRI.getType(Ins)); 1857 1858 B.setInstr(MI); 1859 1860 if (IdxVal->Value < VecTy.getNumElements()) 1861 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1862 else 1863 B.buildUndef(Dst); 1864 1865 MI.eraseFromParent(); 1866 return true; 1867 } 1868 1869 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1870 MachineInstr &MI, MachineRegisterInfo &MRI, 1871 MachineIRBuilder &B) const { 1872 const LLT V2S16 = LLT::vector(2, 16); 1873 1874 Register Dst = MI.getOperand(0).getReg(); 1875 Register Src0 = MI.getOperand(1).getReg(); 1876 LLT DstTy = MRI.getType(Dst); 1877 LLT SrcTy = MRI.getType(Src0); 1878 1879 if (SrcTy == V2S16 && DstTy == V2S16 && 1880 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1881 return true; 1882 1883 MachineIRBuilder HelperBuilder(MI); 1884 GISelObserverWrapper DummyObserver; 1885 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1886 HelperBuilder.setInstr(MI); 1887 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1888 } 1889 1890 bool AMDGPULegalizerInfo::legalizeSinCos( 1891 MachineInstr &MI, MachineRegisterInfo &MRI, 1892 MachineIRBuilder &B) const { 1893 B.setInstr(MI); 1894 1895 Register DstReg = MI.getOperand(0).getReg(); 1896 Register SrcReg = MI.getOperand(1).getReg(); 1897 LLT Ty = MRI.getType(DstReg); 1898 unsigned Flags = MI.getFlags(); 1899 1900 Register TrigVal; 1901 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1902 if (ST.hasTrigReducedRange()) { 1903 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1904 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1905 .addUse(MulVal.getReg(0)) 1906 .setMIFlags(Flags).getReg(0); 1907 } else 1908 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1909 1910 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1911 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1912 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1913 .addUse(TrigVal) 1914 .setMIFlags(Flags); 1915 MI.eraseFromParent(); 1916 return true; 1917 } 1918 1919 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1920 Register DstReg, LLT PtrTy, 1921 MachineIRBuilder &B, const GlobalValue *GV, 1922 unsigned Offset, unsigned GAFlags) const { 1923 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1924 // to the following code sequence: 1925 // 1926 // For constant address space: 1927 // s_getpc_b64 s[0:1] 1928 // s_add_u32 s0, s0, $symbol 1929 // s_addc_u32 s1, s1, 0 1930 // 1931 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1932 // a fixup or relocation is emitted to replace $symbol with a literal 1933 // constant, which is a pc-relative offset from the encoding of the $symbol 1934 // operand to the global variable. 
1935 // 1936 // For global address space: 1937 // s_getpc_b64 s[0:1] 1938 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1939 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1940 // 1941 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1942 // fixups or relocations are emitted to replace $symbol@*@lo and 1943 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1944 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1945 // operand to the global variable. 1946 // 1947 // What we want here is an offset from the value returned by s_getpc 1948 // (which is the address of the s_add_u32 instruction) to the global 1949 // variable, but since the encoding of $symbol starts 4 bytes after the start 1950 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1951 // small. This requires us to add 4 to the global variable offset in order to 1952 // compute the correct address. 1953 1954 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1955 1956 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1957 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1958 1959 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1960 .addDef(PCReg); 1961 1962 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1963 if (GAFlags == SIInstrInfo::MO_NONE) 1964 MIB.addImm(0); 1965 else 1966 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1967 1968 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1969 1970 if (PtrTy.getSizeInBits() == 32) 1971 B.buildExtract(DstReg, PCReg, 0); 1972 return true; 1973 } 1974 1975 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1976 MachineInstr &MI, MachineRegisterInfo &MRI, 1977 MachineIRBuilder &B) const { 1978 Register DstReg = MI.getOperand(0).getReg(); 1979 LLT Ty = MRI.getType(DstReg); 1980 unsigned AS = Ty.getAddressSpace(); 1981 1982 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1983 MachineFunction &MF = B.getMF(); 1984 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1985 B.setInstr(MI); 1986 1987 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1988 if (!MFI->isEntryFunction()) { 1989 const Function &Fn = MF.getFunction(); 1990 DiagnosticInfoUnsupported BadLDSDecl( 1991 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 1992 DS_Warning); 1993 Fn.getContext().diagnose(BadLDSDecl); 1994 1995 // We currently don't have a way to correctly allocate LDS objects that 1996 // aren't directly associated with a kernel. We do force inlining of 1997 // functions that use local objects. However, if these dead functions are 1998 // not eliminated, we don't want a compile time error. Just emit a warning 1999 // and a trap, since there should be no callable path here. 2000 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2001 B.buildUndef(DstReg); 2002 MI.eraseFromParent(); 2003 return true; 2004 } 2005 2006 // TODO: We could emit code to handle the initialization somewhere. 
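    // Roughly: an LDS global without a defined initializer is lowered either
    // to an absolute 32-bit symbol (when the target prefers not to use a
    // constant LDS address) or to a constant byte offset into the kernel's
    // LDS allocation; globals with initializers are diagnosed below since
    // nothing emits the initialization code yet.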
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
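  // Keep G_FMAD legal only when denormals for this type are flushed,
  // presumably because the mad/mac instructions it selects to do not produce
  // denormal results; otherwise fall back to the generic fmul+fadd lowering
  // via LegalizerHelper below.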
2085 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2086 return true; 2087 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2088 return true; 2089 2090 MachineIRBuilder HelperBuilder(MI); 2091 GISelObserverWrapper DummyObserver; 2092 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2093 HelperBuilder.setInstr(MI); 2094 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2095 } 2096 2097 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2098 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2099 Register DstReg = MI.getOperand(0).getReg(); 2100 Register PtrReg = MI.getOperand(1).getReg(); 2101 Register CmpVal = MI.getOperand(2).getReg(); 2102 Register NewVal = MI.getOperand(3).getReg(); 2103 2104 assert(SITargetLowering::isFlatGlobalAddrSpace( 2105 MRI.getType(PtrReg).getAddressSpace()) && 2106 "this should not have been custom lowered"); 2107 2108 LLT ValTy = MRI.getType(CmpVal); 2109 LLT VecTy = LLT::vector(2, ValTy); 2110 2111 B.setInstr(MI); 2112 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2113 2114 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2115 .addDef(DstReg) 2116 .addUse(PtrReg) 2117 .addUse(PackedVal) 2118 .setMemRefs(MI.memoperands()); 2119 2120 MI.eraseFromParent(); 2121 return true; 2122 } 2123 2124 bool AMDGPULegalizerInfo::legalizeFlog( 2125 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2126 Register Dst = MI.getOperand(0).getReg(); 2127 Register Src = MI.getOperand(1).getReg(); 2128 LLT Ty = B.getMRI()->getType(Dst); 2129 unsigned Flags = MI.getFlags(); 2130 B.setInstr(MI); 2131 2132 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2133 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2134 2135 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2136 MI.eraseFromParent(); 2137 return true; 2138 } 2139 2140 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2141 MachineIRBuilder &B) const { 2142 Register Dst = MI.getOperand(0).getReg(); 2143 Register Src = MI.getOperand(1).getReg(); 2144 unsigned Flags = MI.getFlags(); 2145 LLT Ty = B.getMRI()->getType(Dst); 2146 B.setInstr(MI); 2147 2148 auto K = B.buildFConstant(Ty, numbers::log2e); 2149 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2150 B.buildFExp2(Dst, Mul, Flags); 2151 MI.eraseFromParent(); 2152 return true; 2153 } 2154 2155 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2156 MachineIRBuilder &B) const { 2157 Register Dst = MI.getOperand(0).getReg(); 2158 Register Src0 = MI.getOperand(1).getReg(); 2159 Register Src1 = MI.getOperand(2).getReg(); 2160 unsigned Flags = MI.getFlags(); 2161 LLT Ty = B.getMRI()->getType(Dst); 2162 B.setInstr(MI); 2163 const LLT S16 = LLT::scalar(16); 2164 const LLT S32 = LLT::scalar(32); 2165 2166 if (Ty == S32) { 2167 auto Log = B.buildFLog2(S32, Src0, Flags); 2168 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2169 .addUse(Log.getReg(0)) 2170 .addUse(Src1) 2171 .setMIFlags(Flags); 2172 B.buildFExp2(Dst, Mul, Flags); 2173 } else if (Ty == S16) { 2174 // There's no f16 fmul_legacy, so we need to convert for it. 
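    // Same expansion as the f32 case above, pow(x, y) = exp2(y * log2(x)),
    // except the multiply is done in f32 via fpext/fptrunc since fmul_legacy
    // only exists for f32.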
2175 auto Log = B.buildFLog2(S16, Src0, Flags); 2176 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2177 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2178 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2179 .addUse(Ext0.getReg(0)) 2180 .addUse(Ext1.getReg(0)) 2181 .setMIFlags(Flags); 2182 2183 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2184 } else 2185 return false; 2186 2187 MI.eraseFromParent(); 2188 return true; 2189 } 2190 2191 // Find a source register, ignoring any possible source modifiers. 2192 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2193 Register ModSrc = OrigSrc; 2194 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2195 ModSrc = SrcFNeg->getOperand(1).getReg(); 2196 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2197 ModSrc = SrcFAbs->getOperand(1).getReg(); 2198 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2199 ModSrc = SrcFAbs->getOperand(1).getReg(); 2200 return ModSrc; 2201 } 2202 2203 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2204 MachineRegisterInfo &MRI, 2205 MachineIRBuilder &B) const { 2206 B.setInstr(MI); 2207 2208 const LLT S1 = LLT::scalar(1); 2209 const LLT S64 = LLT::scalar(64); 2210 Register Dst = MI.getOperand(0).getReg(); 2211 Register OrigSrc = MI.getOperand(1).getReg(); 2212 unsigned Flags = MI.getFlags(); 2213 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2214 "this should not have been custom lowered"); 2215 2216 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2217 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2218 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2219 // V_FRACT bug is: 2220 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2221 // 2222 // Convert floor(x) to (x - fract(x)) 2223 2224 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2225 .addUse(OrigSrc) 2226 .setMIFlags(Flags); 2227 2228 // Give source modifier matching some assistance before obscuring a foldable 2229 // pattern. 2230 2231 // TODO: We can avoid the neg on the fract? The input sign to fract 2232 // shouldn't matter? 2233 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2234 2235 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2236 2237 Register Min = MRI.createGenericVirtualRegister(S64); 2238 2239 // We don't need to concern ourselves with the snan handling difference, so 2240 // use the one which will directly select. 2241 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2242 if (MFI->getMode().IEEE) 2243 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2244 else 2245 B.buildFMinNum(Min, Fract, Const, Flags); 2246 2247 Register CorrectedFract = Min; 2248 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2249 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2250 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2251 } 2252 2253 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2254 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2255 2256 MI.eraseFromParent(); 2257 return true; 2258 } 2259 2260 // Turn an illegal packed v2s16 build vector into bit operations. 2261 // TODO: This should probably be a bitcast action in LegalizerHelper. 
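// The lowering below is simply a 32-bit merge of the two s16 sources followed
// by a bitcast, roughly:
//   %merge = G_MERGE_VALUES %src0(s16), %src1(s16)
//   %dst(<2 x s16>) = G_BITCAST %merge(s32)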
2262 bool AMDGPULegalizerInfo::legalizeBuildVector( 2263 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2264 Register Dst = MI.getOperand(0).getReg(); 2265 const LLT S32 = LLT::scalar(32); 2266 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2267 2268 Register Src0 = MI.getOperand(1).getReg(); 2269 Register Src1 = MI.getOperand(2).getReg(); 2270 assert(MRI.getType(Src0) == LLT::scalar(16)); 2271 2272 B.setInstr(MI); 2273 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2274 B.buildBitcast(Dst, Merge); 2275 2276 MI.eraseFromParent(); 2277 return true; 2278 } 2279 2280 // Return the use branch instruction, otherwise null if the usage is invalid. 2281 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2282 MachineRegisterInfo &MRI, 2283 MachineInstr *&Br, 2284 MachineBasicBlock *&UncondBrTarget) { 2285 Register CondDef = MI.getOperand(0).getReg(); 2286 if (!MRI.hasOneNonDBGUse(CondDef)) 2287 return nullptr; 2288 2289 MachineBasicBlock *Parent = MI.getParent(); 2290 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2291 if (UseMI.getParent() != Parent || 2292 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2293 return nullptr; 2294 2295 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2296 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2297 if (Next == Parent->end()) { 2298 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2299 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2300 return nullptr; 2301 UncondBrTarget = &*NextMBB; 2302 } else { 2303 if (Next->getOpcode() != AMDGPU::G_BR) 2304 return nullptr; 2305 Br = &*Next; 2306 UncondBrTarget = Br->getOperand(0).getMBB(); 2307 } 2308 2309 return &UseMI; 2310 } 2311 2312 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2313 MachineRegisterInfo &MRI, 2314 Register LiveIn, 2315 Register PhyReg) const { 2316 assert(PhyReg.isPhysical() && "Physical register expected"); 2317 2318 // Insert the live-in copy, if required, by defining destination virtual 2319 // register. 2320 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
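  // Only emit the copy from the physical register once, at the start of the
  // entry block; later queries for the same live-in reuse the existing
  // virtual register definition.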
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(PhyReg);
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, PhyReg);

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return LiveIn;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register PhyReg, LLT Ty,
                                                bool InsertLiveInCopy) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Get or create the virtual live-in register.
  Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
  if (!LiveIn) {
    LiveIn = MRI.createGenericVirtualRegister(Ty);
    MRI.addLiveIn(PhyReg, LiveIn);
  }

  // When the actual true copy required is from virtual register to physical
  // register (to be inserted later), the live-in copy insertion from physical
  // register to virtual register is not required.
  if (!InsertLiveInCopy)
    return LiveIn;

  return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
}

const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
    MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return nullptr;
  }
  return Arg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  Register SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
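    // A masked argument shares its register with other packed values; extract
    // it as (Reg >> Shift) & (Mask >> Shift), where Shift is the number of
    // trailing zero bits in the mask.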
2388 const LLT S32 = LLT::scalar(32); 2389 const unsigned Mask = Arg->getMask(); 2390 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2391 2392 Register AndMaskSrc = LiveIn; 2393 2394 if (Shift != 0) { 2395 auto ShiftAmt = B.buildConstant(S32, Shift); 2396 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2397 } 2398 2399 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2400 } else { 2401 B.buildCopy(DstReg, LiveIn); 2402 } 2403 2404 return true; 2405 } 2406 2407 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2408 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2409 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2410 B.setInstr(MI); 2411 2412 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2413 if (!Arg) 2414 return false; 2415 2416 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2417 return false; 2418 2419 MI.eraseFromParent(); 2420 return true; 2421 } 2422 2423 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2424 MachineRegisterInfo &MRI, 2425 MachineIRBuilder &B) const { 2426 B.setInstr(MI); 2427 Register Dst = MI.getOperand(0).getReg(); 2428 LLT DstTy = MRI.getType(Dst); 2429 LLT S16 = LLT::scalar(16); 2430 LLT S32 = LLT::scalar(32); 2431 LLT S64 = LLT::scalar(64); 2432 2433 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2434 return true; 2435 2436 if (DstTy == S16) 2437 return legalizeFDIV16(MI, MRI, B); 2438 if (DstTy == S32) 2439 return legalizeFDIV32(MI, MRI, B); 2440 if (DstTy == S64) 2441 return legalizeFDIV64(MI, MRI, B); 2442 2443 return false; 2444 } 2445 2446 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2447 const LLT S32 = LLT::scalar(32); 2448 2449 auto Cvt0 = B.buildUITOFP(S32, Src); 2450 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2451 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2452 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2453 return B.buildFPTOUI(S32, Mul).getReg(0); 2454 } 2455 2456 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2457 Register DstReg, 2458 Register Num, 2459 Register Den, 2460 bool IsRem) const { 2461 const LLT S1 = LLT::scalar(1); 2462 const LLT S32 = LLT::scalar(32); 2463 2464 // RCP = URECIP(Den) = 2^32 / Den + e 2465 // e is rounding error. 2466 auto RCP = buildDivRCP(B, Den); 2467 2468 // RCP_LO = mul(RCP, Den) 2469 auto RCP_LO = B.buildMul(S32, RCP, Den); 2470 2471 // RCP_HI = mulhu (RCP, Den) */ 2472 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2473 2474 // NEG_RCP_LO = -RCP_LO 2475 auto Zero = B.buildConstant(S32, 0); 2476 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2477 2478 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 2479 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2480 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2481 2482 // Calculate the rounding error from the URECIP instruction 2483 // E = mulhu(ABS_RCP_LO, RCP) 2484 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2485 2486 // RCP_A_E = RCP + E 2487 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2488 2489 // RCP_S_E = RCP - E 2490 auto RCP_S_E = B.buildSub(S32, RCP, E); 2491 2492 // Tmp0 = (RCP_HI == 0 ? 
RCP_A_E : RCP_SUB_E)
  auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  auto Quotient = B.buildUMulH(S32, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);

  // Remainder_GE_Den = Remainder >= Den
  auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);

  // Remainder_GE_Zero = Num >= Num_S_Remainder
  auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
                                       Num, Num_S_Remainder);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  auto One = B.buildConstant(S32, 1);
  auto Quotient_A_One = B.buildAdd(S32, Quotient, One);

  // Quotient_S_One = Quotient - 1
  auto Quotient_S_One = B.buildSub(S32, Quotient, One);

  // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
  auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);

  // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
  if (IsRem) {
    Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);

    // Calculate Rem result:
    auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);

    // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
    auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);

    // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
    B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
  } else {
    B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
  }
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register Num = MI.getOperand(1).getReg();
  Register Den = MI.getOperand(2).getReg();
  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
  MI.eraseFromParent();
  return true;
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
    B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
    B.buildFMul(S32, Mul1,
B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2590 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2591 2592 // -(2**32) 2593 auto Mad2 = B.buildFMAD(S32, Trunc, 2594 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2595 2596 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2597 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2598 2599 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2600 } 2601 2602 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI, 2603 MachineRegisterInfo &MRI, 2604 MachineIRBuilder &B) const { 2605 B.setInstr(MI); 2606 2607 const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV; 2608 const LLT S32 = LLT::scalar(32); 2609 const LLT S64 = LLT::scalar(64); 2610 const LLT S1 = LLT::scalar(1); 2611 Register Numer = MI.getOperand(1).getReg(); 2612 Register Denom = MI.getOperand(2).getReg(); 2613 Register RcpLo, RcpHi; 2614 2615 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2616 2617 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2618 2619 auto Zero64 = B.buildConstant(S64, 0); 2620 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2621 2622 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2623 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2624 2625 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2626 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2627 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2628 2629 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2630 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2631 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2632 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2633 2634 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2635 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2636 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2637 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2638 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2639 2640 auto Zero32 = B.buildConstant(S32, 0); 2641 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2642 auto Add2_HiC = 2643 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2644 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2645 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2646 2647 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2648 Register NumerLo = UnmergeNumer.getReg(0); 2649 Register NumerHi = UnmergeNumer.getReg(1); 2650 2651 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2652 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2653 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2654 Register Mul3_Lo = UnmergeMul3.getReg(0); 2655 Register Mul3_Hi = UnmergeMul3.getReg(1); 2656 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2657 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2658 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2659 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2660 2661 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2662 Register DenomLo = UnmergeDenom.getReg(0); 2663 Register DenomHi = UnmergeDenom.getReg(1); 2664 2665 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2666 auto C1 = B.buildSExt(S32, CmpHi); 2667 2668 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2669 auto C2 = B.buildSExt(S32, CmpLo); 2670 2671 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2672 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2673 2674 // TODO: Here and below portions of the code can be enclosed into if/endif. 
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C4 =
    B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
    B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
    S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (IsDiv) {
    auto Sel1 = B.buildSelect(
      S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(MI.getOperand(0),
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
  } else {
    auto Sel2 = B.buildSelect(
      S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(MI.getOperand(0),
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty == LLT::scalar(32))
    return legalizeUDIV_UREM32(MI, MRI, B);
  if (Ty == LLT::scalar(64))
    return legalizeUDIV_UREM64(MI, MRI, B);
  return false;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);
  const LLT S32 = LLT::scalar(32);

  const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
  Register DstReg = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);

  LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);

  LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
  RHS = B.buildXor(S32, RHS, RHSign).getReg(0);

  Register UDivRem = MRI.createGenericVirtualRegister(S32);
  legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);

  if (IsRem) {
    auto RSign = LHSign; // Remainder sign is the same as LHS
    UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
    B.buildSub(DstReg, UDivRem, RSign);
  } else {
    auto DSign = B.buildXor(S32, LHSign, RHSign);
    UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
    B.buildSub(DstReg, UDivRem, DSign);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  if
(MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32)) 2774 return legalizeSDIV_SREM32(MI, MRI, B); 2775 return false; 2776 } 2777 2778 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2779 MachineRegisterInfo &MRI, 2780 MachineIRBuilder &B) const { 2781 Register Res = MI.getOperand(0).getReg(); 2782 Register LHS = MI.getOperand(1).getReg(); 2783 Register RHS = MI.getOperand(2).getReg(); 2784 2785 uint16_t Flags = MI.getFlags(); 2786 2787 LLT ResTy = MRI.getType(Res); 2788 LLT S32 = LLT::scalar(32); 2789 LLT S64 = LLT::scalar(64); 2790 2791 const MachineFunction &MF = B.getMF(); 2792 bool Unsafe = 2793 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2794 2795 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2796 return false; 2797 2798 if (!Unsafe && ResTy == S32 && 2799 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2800 return false; 2801 2802 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2803 // 1 / x -> RCP(x) 2804 if (CLHS->isExactlyValue(1.0)) { 2805 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2806 .addUse(RHS) 2807 .setMIFlags(Flags); 2808 2809 MI.eraseFromParent(); 2810 return true; 2811 } 2812 2813 // -1 / x -> RCP( FNEG(x) ) 2814 if (CLHS->isExactlyValue(-1.0)) { 2815 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2816 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2817 .addUse(FNeg.getReg(0)) 2818 .setMIFlags(Flags); 2819 2820 MI.eraseFromParent(); 2821 return true; 2822 } 2823 } 2824 2825 // x / y -> x * (1.0 / y) 2826 if (Unsafe) { 2827 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2828 .addUse(RHS) 2829 .setMIFlags(Flags); 2830 B.buildFMul(Res, LHS, RCP, Flags); 2831 2832 MI.eraseFromParent(); 2833 return true; 2834 } 2835 2836 return false; 2837 } 2838 2839 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2840 MachineRegisterInfo &MRI, 2841 MachineIRBuilder &B) const { 2842 B.setInstr(MI); 2843 Register Res = MI.getOperand(0).getReg(); 2844 Register LHS = MI.getOperand(1).getReg(); 2845 Register RHS = MI.getOperand(2).getReg(); 2846 2847 uint16_t Flags = MI.getFlags(); 2848 2849 LLT S16 = LLT::scalar(16); 2850 LLT S32 = LLT::scalar(32); 2851 2852 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2853 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2854 2855 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2856 .addUse(RHSExt.getReg(0)) 2857 .setMIFlags(Flags); 2858 2859 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2860 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2861 2862 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2863 .addUse(RDst.getReg(0)) 2864 .addUse(RHS) 2865 .addUse(LHS) 2866 .setMIFlags(Flags); 2867 2868 MI.eraseFromParent(); 2869 return true; 2870 } 2871 2872 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2873 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2874 static void toggleSPDenormMode(bool Enable, 2875 MachineIRBuilder &B, 2876 const GCNSubtarget &ST, 2877 AMDGPU::SIModeRegisterDefaults Mode) { 2878 // Set SP denorm mode to this value. 2879 unsigned SPDenormMode = 2880 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2881 2882 if (ST.hasDenormModeInst()) { 2883 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
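    // The s_denorm_mode immediate packs the FP32 field in bits [1:0] and the
    // FP64/FP16 field in bits [3:2], hence the shift by 2 below.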
2884 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2885 2886 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2887 B.buildInstr(AMDGPU::S_DENORM_MODE) 2888 .addImm(NewDenormModeValue); 2889 2890 } else { 2891 // Select FP32 bit field in mode register. 2892 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2893 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2894 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2895 2896 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2897 .addImm(SPDenormMode) 2898 .addImm(SPDenormModeBitField); 2899 } 2900 } 2901 2902 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2903 MachineRegisterInfo &MRI, 2904 MachineIRBuilder &B) const { 2905 B.setInstr(MI); 2906 Register Res = MI.getOperand(0).getReg(); 2907 Register LHS = MI.getOperand(1).getReg(); 2908 Register RHS = MI.getOperand(2).getReg(); 2909 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2910 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2911 2912 uint16_t Flags = MI.getFlags(); 2913 2914 LLT S32 = LLT::scalar(32); 2915 LLT S1 = LLT::scalar(1); 2916 2917 auto One = B.buildFConstant(S32, 1.0f); 2918 2919 auto DenominatorScaled = 2920 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2921 .addUse(LHS) 2922 .addUse(RHS) 2923 .addImm(0) 2924 .setMIFlags(Flags); 2925 auto NumeratorScaled = 2926 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2927 .addUse(LHS) 2928 .addUse(RHS) 2929 .addImm(1) 2930 .setMIFlags(Flags); 2931 2932 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2933 .addUse(DenominatorScaled.getReg(0)) 2934 .setMIFlags(Flags); 2935 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2936 2937 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2938 // aren't modeled as reading it. 
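  // The sequence below follows the usual AMDGPU fdiv expansion: refine an
  // rcp approximation of the scaled denominator with a chain of FMAs, then
  // have div_fmas/div_fixup produce the final quotient. FP32 denormals are
  // temporarily enabled around the FMA chain (when not already on) so the
  // intermediate values are not flushed.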
2939 if (!Mode.allFP32Denormals()) 2940 toggleSPDenormMode(true, B, ST, Mode); 2941 2942 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2943 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2944 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2945 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2946 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2947 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2948 2949 if (!Mode.allFP32Denormals()) 2950 toggleSPDenormMode(false, B, ST, Mode); 2951 2952 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2953 .addUse(Fma4.getReg(0)) 2954 .addUse(Fma1.getReg(0)) 2955 .addUse(Fma3.getReg(0)) 2956 .addUse(NumeratorScaled.getReg(1)) 2957 .setMIFlags(Flags); 2958 2959 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2960 .addUse(Fmas.getReg(0)) 2961 .addUse(RHS) 2962 .addUse(LHS) 2963 .setMIFlags(Flags); 2964 2965 MI.eraseFromParent(); 2966 return true; 2967 } 2968 2969 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2970 MachineRegisterInfo &MRI, 2971 MachineIRBuilder &B) const { 2972 B.setInstr(MI); 2973 Register Res = MI.getOperand(0).getReg(); 2974 Register LHS = MI.getOperand(1).getReg(); 2975 Register RHS = MI.getOperand(2).getReg(); 2976 2977 uint16_t Flags = MI.getFlags(); 2978 2979 LLT S64 = LLT::scalar(64); 2980 LLT S1 = LLT::scalar(1); 2981 2982 auto One = B.buildFConstant(S64, 1.0); 2983 2984 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2985 .addUse(LHS) 2986 .addUse(RHS) 2987 .addImm(0) 2988 .setMIFlags(Flags); 2989 2990 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2991 2992 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2993 .addUse(DivScale0.getReg(0)) 2994 .setMIFlags(Flags); 2995 2996 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2997 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 2998 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 2999 3000 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3001 .addUse(LHS) 3002 .addUse(RHS) 3003 .addImm(1) 3004 .setMIFlags(Flags); 3005 3006 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3007 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3008 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3009 3010 Register Scale; 3011 if (!ST.hasUsableDivScaleConditionOutput()) { 3012 // Workaround a hardware bug on SI where the condition output from div_scale 3013 // is not usable. 
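    // Manually recompute the condition div_scale should have produced:
    // compare the high halves of the scaled values against the original
    // operands to work out which operand was scaled, and combine the two
    // tests with xor.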
3014 3015 LLT S32 = LLT::scalar(32); 3016 3017 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3018 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3019 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3020 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3021 3022 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3023 Scale1Unmerge.getReg(1)); 3024 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3025 Scale0Unmerge.getReg(1)); 3026 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3027 } else { 3028 Scale = DivScale1.getReg(1); 3029 } 3030 3031 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3032 .addUse(Fma4.getReg(0)) 3033 .addUse(Fma3.getReg(0)) 3034 .addUse(Mul.getReg(0)) 3035 .addUse(Scale) 3036 .setMIFlags(Flags); 3037 3038 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3039 .addUse(Fmas.getReg(0)) 3040 .addUse(RHS) 3041 .addUse(LHS) 3042 .setMIFlags(Flags); 3043 3044 MI.eraseFromParent(); 3045 return true; 3046 } 3047 3048 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3049 MachineRegisterInfo &MRI, 3050 MachineIRBuilder &B) const { 3051 B.setInstr(MI); 3052 Register Res = MI.getOperand(0).getReg(); 3053 Register LHS = MI.getOperand(2).getReg(); 3054 Register RHS = MI.getOperand(3).getReg(); 3055 uint16_t Flags = MI.getFlags(); 3056 3057 LLT S32 = LLT::scalar(32); 3058 LLT S1 = LLT::scalar(1); 3059 3060 auto Abs = B.buildFAbs(S32, RHS, Flags); 3061 const APFloat C0Val(1.0f); 3062 3063 auto C0 = B.buildConstant(S32, 0x6f800000); 3064 auto C1 = B.buildConstant(S32, 0x2f800000); 3065 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3066 3067 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3068 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3069 3070 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3071 3072 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3073 .addUse(Mul0.getReg(0)) 3074 .setMIFlags(Flags); 3075 3076 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3077 3078 B.buildFMul(Res, Sel, Mul1, Flags); 3079 3080 MI.eraseFromParent(); 3081 return true; 3082 } 3083 3084 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3085 MachineRegisterInfo &MRI, 3086 MachineIRBuilder &B) const { 3087 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3088 if (!MFI->isEntryFunction()) { 3089 return legalizePreloadedArgIntrin(MI, MRI, B, 3090 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3091 } 3092 3093 B.setInstr(MI); 3094 3095 uint64_t Offset = 3096 ST.getTargetLowering()->getImplicitParameterOffset( 3097 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3098 Register DstReg = MI.getOperand(0).getReg(); 3099 LLT DstTy = MRI.getType(DstReg); 3100 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3101 3102 const ArgDescriptor *Arg; 3103 const TargetRegisterClass *RC; 3104 std::tie(Arg, RC) 3105 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3106 if (!Arg) 3107 return false; 3108 3109 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3110 if (!loadInputValue(KernargPtrReg, B, Arg)) 3111 return false; 3112 3113 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3114 MI.eraseFromParent(); 3115 return true; 3116 } 3117 3118 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3119 MachineRegisterInfo &MRI, 3120 MachineIRBuilder &B, 3121 unsigned AddrSpace) const { 3122 B.setInstr(MI); 3123 Register ApertureReg = 
getSegmentAperture(AddrSpace, MRI, B); 3124 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3125 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3126 MI.eraseFromParent(); 3127 return true; 3128 } 3129 3130 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3131 // offset (the offset that is included in bounds checking and swizzling, to be 3132 // split between the instruction's voffset and immoffset fields) and soffset 3133 // (the offset that is excluded from bounds checking and swizzling, to go in 3134 // the instruction's soffset field). This function takes the first kind of 3135 // offset and figures out how to split it between voffset and immoffset. 3136 std::tuple<Register, unsigned, unsigned> 3137 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3138 Register OrigOffset) const { 3139 const unsigned MaxImm = 4095; 3140 Register BaseReg; 3141 unsigned TotalConstOffset; 3142 MachineInstr *OffsetDef; 3143 const LLT S32 = LLT::scalar(32); 3144 3145 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3146 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3147 3148 unsigned ImmOffset = TotalConstOffset; 3149 3150 // If the immediate value is too big for the immoffset field, put the value 3151 // and -4096 into the immoffset field so that the value that is copied/added 3152 // for the voffset field is a multiple of 4096, and it stands more chance 3153 // of being CSEd with the copy/add for another similar load/store. 3154 // However, do not do that rounding down to a multiple of 4096 if that is a 3155 // negative number, as it appears to be illegal to have a negative offset 3156 // in the vgpr, even if adding the immediate offset makes it positive. 3157 unsigned Overflow = ImmOffset & ~MaxImm; 3158 ImmOffset -= Overflow; 3159 if ((int32_t)Overflow < 0) { 3160 Overflow += ImmOffset; 3161 ImmOffset = 0; 3162 } 3163 3164 if (Overflow != 0) { 3165 if (!BaseReg) { 3166 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3167 } else { 3168 auto OverflowVal = B.buildConstant(S32, Overflow); 3169 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3170 } 3171 } 3172 3173 if (!BaseReg) 3174 BaseReg = B.buildConstant(S32, 0).getReg(0); 3175 3176 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3177 } 3178 3179 /// Handle register layout difference for f16 images for some subtargets. 3180 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3181 MachineRegisterInfo &MRI, 3182 Register Reg) const { 3183 if (!ST.hasUnpackedD16VMem()) 3184 return Reg; 3185 3186 const LLT S16 = LLT::scalar(16); 3187 const LLT S32 = LLT::scalar(32); 3188 LLT StoreVT = MRI.getType(Reg); 3189 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3190 3191 auto Unmerge = B.buildUnmerge(S16, Reg); 3192 3193 SmallVector<Register, 4> WideRegs; 3194 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3195 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3196 3197 int NumElts = StoreVT.getNumElements(); 3198 3199 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3200 } 3201 3202 Register AMDGPULegalizerInfo::fixStoreSourceType( 3203 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3204 MachineRegisterInfo *MRI = B.getMRI(); 3205 LLT Ty = MRI->getType(VData); 3206 3207 const LLT S16 = LLT::scalar(16); 3208 3209 // Fixup illegal register types for i8 stores. 
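  // 8-bit and 16-bit scalar sources are any-extended to 32 bits here; the
  // store still writes the original width because the buffer opcode is later
  // chosen from the memory size of the MMO, not from the register type.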
3210 if (Ty == LLT::scalar(8) || Ty == S16) { 3211 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3212 return AnyExt; 3213 } 3214 3215 if (Ty.isVector()) { 3216 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3217 if (IsFormat) 3218 return handleD16VData(B, *MRI, VData); 3219 } 3220 } 3221 3222 return VData; 3223 } 3224 3225 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3226 MachineRegisterInfo &MRI, 3227 MachineIRBuilder &B, 3228 bool IsTyped, 3229 bool IsFormat) const { 3230 B.setInstr(MI); 3231 3232 Register VData = MI.getOperand(1).getReg(); 3233 LLT Ty = MRI.getType(VData); 3234 LLT EltTy = Ty.getScalarType(); 3235 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3236 const LLT S32 = LLT::scalar(32); 3237 3238 VData = fixStoreSourceType(B, VData, IsFormat); 3239 Register RSrc = MI.getOperand(2).getReg(); 3240 3241 MachineMemOperand *MMO = *MI.memoperands_begin(); 3242 const int MemSize = MMO->getSize(); 3243 3244 unsigned ImmOffset; 3245 unsigned TotalOffset; 3246 3247 // The typed intrinsics add an immediate after the registers. 3248 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3249 3250 // The struct intrinsic variants add one additional operand over raw. 3251 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3252 Register VIndex; 3253 int OpOffset = 0; 3254 if (HasVIndex) { 3255 VIndex = MI.getOperand(3).getReg(); 3256 OpOffset = 1; 3257 } 3258 3259 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3260 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3261 3262 unsigned Format = 0; 3263 if (IsTyped) { 3264 Format = MI.getOperand(5 + OpOffset).getImm(); 3265 ++OpOffset; 3266 } 3267 3268 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3269 3270 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3271 if (TotalOffset != 0) 3272 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3273 3274 unsigned Opc; 3275 if (IsTyped) { 3276 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3277 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3278 } else if (IsFormat) { 3279 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3280 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3281 } else { 3282 switch (MemSize) { 3283 case 1: 3284 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3285 break; 3286 case 2: 3287 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3288 break; 3289 default: 3290 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3291 break; 3292 } 3293 } 3294 3295 if (!VIndex) 3296 VIndex = B.buildConstant(S32, 0).getReg(0); 3297 3298 auto MIB = B.buildInstr(Opc) 3299 .addUse(VData) // vdata 3300 .addUse(RSrc) // rsrc 3301 .addUse(VIndex) // vindex 3302 .addUse(VOffset) // voffset 3303 .addUse(SOffset) // soffset 3304 .addImm(ImmOffset); // offset(imm) 3305 3306 if (IsTyped) 3307 MIB.addImm(Format); 3308 3309 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3310 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3311 .addMemOperand(MMO); 3312 3313 MI.eraseFromParent(); 3314 return true; 3315 } 3316 3317 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3318 MachineRegisterInfo &MRI, 3319 MachineIRBuilder &B, 3320 bool IsFormat, 3321 bool IsTyped) const { 3322 B.setInstr(MI); 3323 3324 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
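  // Operand layout: dst, intrinsic ID, rsrc, [vindex for struct variants],
  // voffset, soffset, [format for typed variants], auxiliary data. The code
  // below decodes these operands, splits the offset, and rewrites the
  // intrinsic into the matching buffer-load pseudo.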
3325 MachineMemOperand *MMO = *MI.memoperands_begin(); 3326 const int MemSize = MMO->getSize(); 3327 const LLT S32 = LLT::scalar(32); 3328 3329 Register Dst = MI.getOperand(0).getReg(); 3330 Register RSrc = MI.getOperand(2).getReg(); 3331 3332 // The typed intrinsics add an immediate after the registers. 3333 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3334 3335 // The struct intrinsic variants add one additional operand over raw. 3336 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3337 Register VIndex; 3338 int OpOffset = 0; 3339 if (HasVIndex) { 3340 VIndex = MI.getOperand(3).getReg(); 3341 OpOffset = 1; 3342 } 3343 3344 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3345 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3346 3347 unsigned Format = 0; 3348 if (IsTyped) { 3349 Format = MI.getOperand(5 + OpOffset).getImm(); 3350 ++OpOffset; 3351 } 3352 3353 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3354 unsigned ImmOffset; 3355 unsigned TotalOffset; 3356 3357 LLT Ty = MRI.getType(Dst); 3358 LLT EltTy = Ty.getScalarType(); 3359 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3360 const bool Unpacked = ST.hasUnpackedD16VMem(); 3361 3362 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3363 if (TotalOffset != 0) 3364 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3365 3366 unsigned Opc; 3367 3368 if (IsTyped) { 3369 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3370 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3371 } else if (IsFormat) { 3372 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3373 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3374 } else { 3375 switch (MemSize) { 3376 case 1: 3377 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3378 break; 3379 case 2: 3380 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3381 break; 3382 default: 3383 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3384 break; 3385 } 3386 } 3387 3388 Register LoadDstReg; 3389 3390 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3391 LLT UnpackedTy = Ty.changeElementSize(32); 3392 3393 if (IsExtLoad) 3394 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3395 else if (Unpacked && IsD16 && Ty.isVector()) 3396 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3397 else 3398 LoadDstReg = Dst; 3399 3400 if (!VIndex) 3401 VIndex = B.buildConstant(S32, 0).getReg(0); 3402 3403 auto MIB = B.buildInstr(Opc) 3404 .addDef(LoadDstReg) // vdata 3405 .addUse(RSrc) // rsrc 3406 .addUse(VIndex) // vindex 3407 .addUse(VOffset) // voffset 3408 .addUse(SOffset) // soffset 3409 .addImm(ImmOffset); // offset(imm) 3410 3411 if (IsTyped) 3412 MIB.addImm(Format); 3413 3414 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3415 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3416 .addMemOperand(MMO); 3417 3418 if (LoadDstReg != Dst) { 3419 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3420 3421 // Widen result for extending loads was widened. 
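    // (The destination was widened above: extending loads use a plain s32 that
    // is truncated back here; unpacked D16 vector results are instead repacked
    // into their 16-bit element type below.)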
3422 if (IsExtLoad) 3423 B.buildTrunc(Dst, LoadDstReg); 3424 else { 3425 // Repack to original 16-bit vector result 3426 // FIXME: G_TRUNC should work, but legalization currently fails 3427 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3428 SmallVector<Register, 4> Repack; 3429 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3430 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3431 B.buildMerge(Dst, Repack); 3432 } 3433 } 3434 3435 MI.eraseFromParent(); 3436 return true; 3437 } 3438 3439 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3440 MachineIRBuilder &B, 3441 bool IsInc) const { 3442 B.setInstr(MI); 3443 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3444 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3445 B.buildInstr(Opc) 3446 .addDef(MI.getOperand(0).getReg()) 3447 .addUse(MI.getOperand(2).getReg()) 3448 .addUse(MI.getOperand(3).getReg()) 3449 .cloneMemRefs(MI); 3450 MI.eraseFromParent(); 3451 return true; 3452 } 3453 3454 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3455 switch (IntrID) { 3456 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3457 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3458 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3459 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3460 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3461 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3462 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3463 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3464 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3465 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3466 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3467 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3468 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3469 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3470 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3471 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3472 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3473 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3474 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3475 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3476 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3477 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3478 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3479 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3480 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3481 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3482 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3483 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3484 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3485 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3486 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3487 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3488 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3489 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3490 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3491 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3492 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3493 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3494 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3495 default: 3496 llvm_unreachable("unhandled atomic opcode"); 3497 } 3498 } 3499 3500 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3501 MachineIRBuilder &B, 3502 Intrinsic::ID IID) const { 3503 B.setInstr(MI); 3504 3505 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3506 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3507 3508 Register Dst = MI.getOperand(0).getReg(); 3509 Register VData = 
MI.getOperand(2).getReg(); 3510 3511 Register CmpVal; 3512 int OpOffset = 0; 3513 3514 if (IsCmpSwap) { 3515 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3516 ++OpOffset; 3517 } 3518 3519 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3520 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3521 3522 // The struct intrinsic variants add one additional operand over raw. 3523 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3524 Register VIndex; 3525 if (HasVIndex) { 3526 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3527 ++OpOffset; 3528 } 3529 3530 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3531 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3532 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3533 3534 MachineMemOperand *MMO = *MI.memoperands_begin(); 3535 3536 unsigned ImmOffset; 3537 unsigned TotalOffset; 3538 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3539 if (TotalOffset != 0) 3540 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3541 3542 if (!VIndex) 3543 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3544 3545 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3546 .addDef(Dst) 3547 .addUse(VData); // vdata 3548 3549 if (IsCmpSwap) 3550 MIB.addReg(CmpVal); 3551 3552 MIB.addUse(RSrc) // rsrc 3553 .addUse(VIndex) // vindex 3554 .addUse(VOffset) // voffset 3555 .addUse(SOffset) // soffset 3556 .addImm(ImmOffset) // offset(imm) 3557 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3558 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3559 .addMemOperand(MMO); 3560 3561 MI.eraseFromParent(); 3562 return true; 3563 } 3564 3565 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3566 /// vector with s16 typed elements. 3567 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3568 SmallVectorImpl<Register> &PackedAddrs, 3569 int AddrIdx, int DimIdx, int NumVAddrs, 3570 int NumGradients) { 3571 const LLT S16 = LLT::scalar(16); 3572 const LLT V2S16 = LLT::vector(2, 16); 3573 3574 for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) { 3575 MachineOperand &SrcOp = MI.getOperand(I); 3576 if (!SrcOp.isReg()) 3577 continue; // _L to _LZ may have eliminated this. 3578 3579 Register AddrReg = SrcOp.getReg(); 3580 3581 if (I < DimIdx) { 3582 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3583 PackedAddrs.push_back(AddrReg); 3584 } else { 3585 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3586 // derivatives dx/dh and dx/dv are packed with undef. 3587 if (((I + 1) >= (AddrIdx + NumVAddrs)) || 3588 ((NumGradients / 2) % 2 == 1 && 3589 (I == DimIdx + (NumGradients / 2) - 1 || 3590 I == DimIdx + NumGradients - 1)) || 3591 // Check for _L to _LZ optimization 3592 !MI.getOperand(I + 1).isReg()) { 3593 PackedAddrs.push_back( 3594 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3595 .getReg(0)); 3596 } else { 3597 PackedAddrs.push_back( 3598 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3599 .getReg(0)); 3600 ++I; 3601 } 3602 } 3603 } 3604 } 3605 3606 /// Convert from separate vaddr components to a single vector address register, 3607 /// and replace the remaining operands with $noreg. 
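/// Non-NSA image instructions expect the address components as one contiguous
/// register tuple, so the separate s32 components are rebuilt into a single
/// build_vector; 5 to 7 components are padded with undef up to 8 elements.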
3608 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3609                                      int DimIdx, int NumVAddrs) {
3610   const LLT S32 = LLT::scalar(32);
3611 
3612   SmallVector<Register, 8> AddrRegs;
3613   for (int I = 0; I != NumVAddrs; ++I) {
3614     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3615     if (SrcOp.isReg()) {
3616       AddrRegs.push_back(SrcOp.getReg());
3617       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3618     }
3619   }
3620 
3621   int NumAddrRegs = AddrRegs.size();
3622   if (NumAddrRegs != 1) {
3623     // Round up to 8 elements for v5-v7
3624     // FIXME: Missing intermediate sized register classes and instructions.
3625     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3626       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3627       auto Undef = B.buildUndef(S32);
3628       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3629       NumAddrRegs = RoundedNumRegs;
3630     }
3631 
3632     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3633     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3634   }
3635 
3636   for (int I = 1; I != NumVAddrs; ++I) {
3637     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3638     if (SrcOp.isReg())
3639       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3640   }
3641 }
3642 
3643 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3644 ///
3645 /// Depending on the subtarget, loads/stores with 16-bit element data need to
3646 /// be rewritten to use the low half of 32-bit registers, or directly use a
3647 /// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
3648 /// registers.
3649 ///
3650 /// We don't want to directly select image instructions just yet, but we also
3651 /// want to expose all register repacking to the legalizer/combiners. We also
3652 /// don't want a selected instruction entering RegBankSelect. In order to avoid
3653 /// defining a multitude of intermediate image instructions, directly hack on
3654 /// the intrinsic's arguments. In cases like a16 addresses, this requires
3655 /// padding the now unnecessary arguments with $noreg.
3656 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3657     MachineInstr &MI, MachineIRBuilder &B,
3658     GISelChangeObserver &Observer,
3659     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3660   B.setInstr(MI);
3661 
3662   const int NumDefs = MI.getNumExplicitDefs();
3663   bool IsTFE = NumDefs == 2;
3664   // We are only processing the operands of d16 image operations on subtargets
3665   // that use the unpacked register layout, or need to repack the TFE result.
3666 
3667   // TODO: Do we need to guard against already legalized intrinsics?
3668   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3669       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3670 
3671   MachineRegisterInfo *MRI = B.getMRI();
3672   const LLT S32 = LLT::scalar(32);
3673   const LLT S16 = LLT::scalar(16);
3674   const LLT V2S16 = LLT::vector(2, 16);
3675 
3676   // Index of first address argument
3677   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3678 
3679   // Check for 16 bit addresses and pack if true.
3680   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3681   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3682   const bool IsA16 = AddrTy == S16;
3683 
3684   int NumVAddrs, NumGradients;
3685   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3686   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3687       getDMaskIdx(BaseOpcode, NumDefs);
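  // Note: dmask is a 4-bit mask selecting which of the (up to 4) result
  // components the instruction actually reads or writes; e.g. a dmask of
  // 0b0101 corresponds to two data components.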
3688   unsigned DMask = 0;
3689 
3690   int DMaskLanes = 0;
3691   if (!BaseOpcode->Atomic) {
3692     DMask = MI.getOperand(DMaskIdx).getImm();
3693     if (BaseOpcode->Gather4) {
3694       DMaskLanes = 4;
3695     } else if (DMask != 0) {
3696       DMaskLanes = countPopulation(DMask);
3697     } else if (!IsTFE && !BaseOpcode->Store) {
3698       // If dmask is 0, this is a no-op load. This can be eliminated.
3699       B.buildUndef(MI.getOperand(0));
3700       MI.eraseFromParent();
3701       return true;
3702     }
3703   }
3704 
3705   Observer.changingInstr(MI);
3706   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3707 
3708   unsigned NewOpcode = NumDefs == 0 ?
3709     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3710 
3711   // Track that we legalized this
3712   MI.setDesc(B.getTII().get(NewOpcode));
3713 
3714   // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
3715   // to be at least 1, otherwise the instruction will fail.
3716   if (IsTFE && DMask == 0) {
3717     DMask = 0x1;
3718     DMaskLanes = 1;
3719     MI.getOperand(DMaskIdx).setImm(DMask);
3720   }
3721 
3722   if (BaseOpcode->Atomic) {
3723     Register VData0 = MI.getOperand(2).getReg();
3724     LLT Ty = MRI->getType(VData0);
3725 
3726     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3727     if (Ty.isVector())
3728       return false;
3729 
3730     if (BaseOpcode->AtomicX2) {
3731       Register VData1 = MI.getOperand(3).getReg();
3732       // The two values are packed in one register.
3733       LLT PackedTy = LLT::vector(2, Ty);
3734       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3735       MI.getOperand(2).setReg(Concat.getReg(0));
3736       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3737     }
3738   }
3739 
3740   int CorrectedNumVAddrs = NumVAddrs;
3741 
3742   // Optimize _L to _LZ when _L is zero
3743   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3744         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3745     const ConstantFP *ConstantLod;
3746     const int LodIdx = AddrIdx + NumVAddrs - 1;
3747 
3748     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3749       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3750         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3751         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3752           LZMappingInfo->LZ, ImageDimIntr->Dim);
3753 
3754         // The starting indexes should remain in the same place.
3755         --NumVAddrs;
3756         --CorrectedNumVAddrs;
3757 
3758         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3759           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3760         MI.RemoveOperand(LodIdx);
3761       }
3762     }
3763   }
3764 
3765   // Optimize _mip away when 'lod' is zero.
3766   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3767     int64_t ConstantLod;
3768     const int LodIdx = AddrIdx + NumVAddrs - 1;
3769 
3770     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3771       if (ConstantLod == 0) {
3772         // TODO: Change the intrinsic opcode and remove the operand instead of
3773         // replacing it with 0, as the _L to _LZ handling is done above.
3774         MI.getOperand(LodIdx).ChangeToImmediate(0);
3775         --CorrectedNumVAddrs;
3776       }
3777     }
3778   }
3779 
3780   // If the register allocator cannot place the address registers contiguously
3781   // without introducing moves, then using the non-sequential address encoding
3782   // is always preferable, since it saves VALU instructions and is usually a
3783   // wash in terms of code size or even better.
3784 // 3785 // However, we currently have no way of hinting to the register allocator 3786 // that MIMG addresses should be placed contiguously when it is possible to 3787 // do so, so force non-NSA for the common 2-address case as a heuristic. 3788 // 3789 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3790 // allocation when possible. 3791 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3792 3793 // Rewrite the addressing register layout before doing anything else. 3794 if (IsA16) { 3795 // FIXME: this feature is missing from gfx10. When that is fixed, this check 3796 // should be introduced. 3797 if (!ST.hasR128A16() && !ST.hasGFX10A16()) 3798 return false; 3799 3800 if (NumVAddrs > 1) { 3801 SmallVector<Register, 4> PackedRegs; 3802 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs, 3803 NumGradients); 3804 3805 if (!UseNSA && PackedRegs.size() > 1) { 3806 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3807 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3808 PackedRegs[0] = Concat.getReg(0); 3809 PackedRegs.resize(1); 3810 } 3811 3812 const int NumPacked = PackedRegs.size(); 3813 for (int I = 0; I != NumVAddrs; ++I) { 3814 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3815 if (!SrcOp.isReg()) { 3816 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3817 continue; 3818 } 3819 3820 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3821 3822 if (I < NumPacked) 3823 SrcOp.setReg(PackedRegs[I]); 3824 else 3825 SrcOp.setReg(AMDGPU::NoRegister); 3826 } 3827 } 3828 } else if (!UseNSA && NumVAddrs > 1) { 3829 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3830 } 3831 3832 3833 if (BaseOpcode->Store) { // No TFE for stores? 3834 // TODO: Handle dmask trim 3835 Register VData = MI.getOperand(1).getReg(); 3836 LLT Ty = MRI->getType(VData); 3837 if (!Ty.isVector() || Ty.getElementType() != S16) 3838 return true; 3839 3840 B.setInstr(MI); 3841 3842 Register RepackedReg = handleD16VData(B, *MRI, VData); 3843 if (RepackedReg != VData) { 3844 MI.getOperand(1).setReg(RepackedReg); 3845 } 3846 3847 return true; 3848 } 3849 3850 Register DstReg = MI.getOperand(0).getReg(); 3851 LLT Ty = MRI->getType(DstReg); 3852 const LLT EltTy = Ty.getScalarType(); 3853 const bool IsD16 = Ty.getScalarType() == S16; 3854 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 3855 3856 // Confirm that the return type is large enough for the dmask specified 3857 if (NumElts < DMaskLanes) 3858 return false; 3859 3860 if (NumElts > 4 || DMaskLanes > 4) 3861 return false; 3862 3863 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3864 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3865 3866 // The raw dword aligned data component of the load. The only legal cases 3867 // where this matters should be when using the packed D16 format, for 3868 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3869 LLT RoundedTy; 3870 3871 // S32 vector to to cover all data, plus TFE result element. 3872 LLT TFETy; 3873 3874 // Register type to use for each loaded component. Will be S32 or V2S16. 
3875   LLT RegTy;
3876 
3877   if (IsD16 && ST.hasUnpackedD16VMem()) {
3878     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3879     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3880     RegTy = S32;
3881   } else {
3882     unsigned EltSize = EltTy.getSizeInBits();
3883     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3884     unsigned RoundedSize = 32 * RoundedElts;
3885     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3886     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3887     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3888   }
3889 
3890   // The return type does not need adjustment.
3891   // TODO: Should we change s16 case to s32 or <2 x s16>?
3892   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3893     return true;
3894 
3895   Register Dst1Reg;
3896 
3897   // Insert after the instruction.
3898   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3899 
3900   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3901   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3902   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3903   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3904 
3905   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3906 
3907   MI.getOperand(0).setReg(NewResultReg);
3908 
3909   // In the IR, TFE is supposed to be used with a 2 element struct return
3910   // type. The instruction really returns these two values in one contiguous
3911   // register, with one additional dword beyond the loaded data. Rewrite the
3912   // return type to use a single register result.
3913 
3914   if (IsTFE) {
3915     Dst1Reg = MI.getOperand(1).getReg();
3916     if (MRI->getType(Dst1Reg) != S32)
3917       return false;
3918 
3919     // TODO: Make sure the TFE operand bit is set.
3920     MI.RemoveOperand(1);
3921 
3922     // Handle the easy case that requires no repack instructions.
3923     if (Ty == S32) {
3924       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3925       return true;
3926     }
3927   }
3928 
3929   // Now figure out how to copy the new result register back into the old
3930   // result.
3931   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3932 
3933   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
3934 
3935   if (ResultNumRegs == 1) {
3936     assert(!IsTFE);
3937     ResultRegs[0] = NewResultReg;
3938   } else {
3939     // We have to repack into a new vector of some kind.
3940     for (int I = 0; I != NumDataRegs; ++I)
3941       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3942     B.buildUnmerge(ResultRegs, NewResultReg);
3943 
3944     // Drop the final TFE element to get the data part. The TFE result is
3945     // directly written to the right place already.
3946     if (IsTFE)
3947       ResultRegs.resize(NumDataRegs);
3948   }
3949 
3950   // For an s16 scalar result, the loaded value is held in an s32 register and
3951   // truncated, regardless of packed vs. unpacked.
3952   if (IsD16 && !Ty.isVector()) {
3953     B.buildTrunc(DstReg, ResultRegs[0]);
3954     return true;
3955   }
3956 
3957   // Avoid a build/concat_vector of 1 entry.
3958   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
3959     B.buildBitcast(DstReg, ResultRegs[0]);
3960     return true;
3961   }
3962 
3963   assert(Ty.isVector());
3964 
3965   if (IsD16) {
3966     // For packed D16 results with TFE enabled, all the data components are
3967     // S32. Cast back to the expected type.
3968     //
3969     // TODO: We don't really need to load s32 elements. We would only need one
3970     // cast for the TFE result if a multiple of v2s16 was used.
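    // Two repack forms: on packed subtargets each s32 piece is bitcast back to
    // <2 x s16>; on unpacked subtargets each s32 register holds a single 16-bit
    // value and is simply truncated.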
3971 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 3972 for (Register &Reg : ResultRegs) 3973 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 3974 } else if (ST.hasUnpackedD16VMem()) { 3975 for (Register &Reg : ResultRegs) 3976 Reg = B.buildTrunc(S16, Reg).getReg(0); 3977 } 3978 } 3979 3980 auto padWithUndef = [&](LLT Ty, int NumElts) { 3981 if (NumElts == 0) 3982 return; 3983 Register Undef = B.buildUndef(Ty).getReg(0); 3984 for (int I = 0; I != NumElts; ++I) 3985 ResultRegs.push_back(Undef); 3986 }; 3987 3988 // Pad out any elements eliminated due to the dmask. 3989 LLT ResTy = MRI->getType(ResultRegs[0]); 3990 if (!ResTy.isVector()) { 3991 padWithUndef(ResTy, NumElts - ResultRegs.size()); 3992 B.buildBuildVector(DstReg, ResultRegs); 3993 return true; 3994 } 3995 3996 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 3997 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 3998 3999 // Deal with the one annoying legal case. 4000 const LLT V3S16 = LLT::vector(3, 16); 4001 if (Ty == V3S16) { 4002 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4003 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4004 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4005 return true; 4006 } 4007 4008 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4009 B.buildConcatVectors(DstReg, ResultRegs); 4010 return true; 4011 } 4012 4013 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4014 MachineInstr &MI, MachineIRBuilder &B, 4015 GISelChangeObserver &Observer) const { 4016 Register Dst = MI.getOperand(0).getReg(); 4017 LLT Ty = B.getMRI()->getType(Dst); 4018 unsigned Size = Ty.getSizeInBits(); 4019 MachineFunction &MF = B.getMF(); 4020 4021 Observer.changingInstr(MI); 4022 4023 // FIXME: We don't really need this intermediate instruction. The intrinsic 4024 // should be fixed to have a memory operand. Since it's readnone, we're not 4025 // allowed to add one. 4026 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4027 MI.RemoveOperand(1); // Remove intrinsic ID 4028 4029 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4030 // TODO: Should this use datalayout alignment? 4031 const unsigned MemSize = (Size + 7) / 8; 4032 const Align MemAlign(4); 4033 MachineMemOperand *MMO = MF.getMachineMemOperand( 4034 MachinePointerInfo(), 4035 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4036 MachineMemOperand::MOInvariant, 4037 MemSize, MemAlign); 4038 MI.addMemOperand(MF, MMO); 4039 4040 // There are no 96-bit result scalar loads, but widening to 128-bit should 4041 // always be legal. We may need to restore this to a 96-bit result if it turns 4042 // out this needs to be converted to a vector load during RegBankSelect. 4043 if (!isPowerOf2_32(Size)) { 4044 LegalizerHelper Helper(MF, *this, Observer, B); 4045 B.setInstr(MI); 4046 4047 if (Ty.isVector()) 4048 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4049 else 4050 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4051 } 4052 4053 Observer.changedInstr(MI); 4054 return true; 4055 } 4056 4057 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4058 MachineRegisterInfo &MRI, 4059 MachineIRBuilder &B) const { 4060 B.setInstr(MI); 4061 4062 // Is non-HSA path or trap-handler disabled? 
then, insert s_endpgm instruction 4063 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4064 !ST.isTrapHandlerEnabled()) { 4065 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4066 } else { 4067 // Pass queue pointer to trap handler as input, and insert trap instruction 4068 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4069 const ArgDescriptor *Arg = 4070 getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR); 4071 if (!Arg) 4072 return false; 4073 MachineRegisterInfo &MRI = *B.getMRI(); 4074 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4075 Register LiveIn = getLiveInRegister( 4076 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), 4077 /*InsertLiveInCopy=*/false); 4078 if (!loadInputValue(LiveIn, B, Arg)) 4079 return false; 4080 B.buildCopy(SGPR01, LiveIn); 4081 B.buildInstr(AMDGPU::S_TRAP) 4082 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4083 .addReg(SGPR01, RegState::Implicit); 4084 } 4085 4086 MI.eraseFromParent(); 4087 return true; 4088 } 4089 4090 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4091 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4092 B.setInstr(MI); 4093 4094 // Is non-HSA path or trap-handler disabled? then, report a warning 4095 // accordingly 4096 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4097 !ST.isTrapHandlerEnabled()) { 4098 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4099 "debugtrap handler not supported", 4100 MI.getDebugLoc(), DS_Warning); 4101 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4102 Ctx.diagnose(NoTrap); 4103 } else { 4104 // Insert debug-trap instruction 4105 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4106 } 4107 4108 MI.eraseFromParent(); 4109 return true; 4110 } 4111 4112 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 4113 MachineIRBuilder &B, 4114 GISelChangeObserver &Observer) const { 4115 MachineRegisterInfo &MRI = *B.getMRI(); 4116 4117 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4118 auto IntrID = MI.getIntrinsicID(); 4119 switch (IntrID) { 4120 case Intrinsic::amdgcn_if: 4121 case Intrinsic::amdgcn_else: { 4122 MachineInstr *Br = nullptr; 4123 MachineBasicBlock *UncondBrTarget = nullptr; 4124 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4125 const SIRegisterInfo *TRI 4126 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4127 4128 B.setInstr(*BrCond); 4129 Register Def = MI.getOperand(1).getReg(); 4130 Register Use = MI.getOperand(3).getReg(); 4131 4132 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4133 if (IntrID == Intrinsic::amdgcn_if) { 4134 B.buildInstr(AMDGPU::SI_IF) 4135 .addDef(Def) 4136 .addUse(Use) 4137 .addMBB(UncondBrTarget); 4138 } else { 4139 B.buildInstr(AMDGPU::SI_ELSE) 4140 .addDef(Def) 4141 .addUse(Use) 4142 .addMBB(UncondBrTarget) 4143 .addImm(0); 4144 } 4145 4146 if (Br) { 4147 Br->getOperand(0).setMBB(CondBrTarget); 4148 } else { 4149 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4150 // since we're swapping branch targets it needs to be reinserted. 
4151 // FIXME: IRTranslator should probably not do this 4152 B.buildBr(*CondBrTarget); 4153 } 4154 4155 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4156 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4157 MI.eraseFromParent(); 4158 BrCond->eraseFromParent(); 4159 return true; 4160 } 4161 4162 return false; 4163 } 4164 case Intrinsic::amdgcn_loop: { 4165 MachineInstr *Br = nullptr; 4166 MachineBasicBlock *UncondBrTarget = nullptr; 4167 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4168 const SIRegisterInfo *TRI 4169 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4170 4171 B.setInstr(*BrCond); 4172 4173 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4174 Register Reg = MI.getOperand(2).getReg(); 4175 B.buildInstr(AMDGPU::SI_LOOP) 4176 .addUse(Reg) 4177 .addMBB(UncondBrTarget); 4178 4179 if (Br) 4180 Br->getOperand(0).setMBB(CondBrTarget); 4181 else 4182 B.buildBr(*CondBrTarget); 4183 4184 MI.eraseFromParent(); 4185 BrCond->eraseFromParent(); 4186 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4187 return true; 4188 } 4189 4190 return false; 4191 } 4192 case Intrinsic::amdgcn_kernarg_segment_ptr: 4193 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4194 B.setInstr(MI); 4195 // This only makes sense to call in a kernel, so just lower to null. 4196 B.buildConstant(MI.getOperand(0).getReg(), 0); 4197 MI.eraseFromParent(); 4198 return true; 4199 } 4200 4201 return legalizePreloadedArgIntrin( 4202 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4203 case Intrinsic::amdgcn_implicitarg_ptr: 4204 return legalizeImplicitArgPtr(MI, MRI, B); 4205 case Intrinsic::amdgcn_workitem_id_x: 4206 return legalizePreloadedArgIntrin(MI, MRI, B, 4207 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4208 case Intrinsic::amdgcn_workitem_id_y: 4209 return legalizePreloadedArgIntrin(MI, MRI, B, 4210 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4211 case Intrinsic::amdgcn_workitem_id_z: 4212 return legalizePreloadedArgIntrin(MI, MRI, B, 4213 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4214 case Intrinsic::amdgcn_workgroup_id_x: 4215 return legalizePreloadedArgIntrin(MI, MRI, B, 4216 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4217 case Intrinsic::amdgcn_workgroup_id_y: 4218 return legalizePreloadedArgIntrin(MI, MRI, B, 4219 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4220 case Intrinsic::amdgcn_workgroup_id_z: 4221 return legalizePreloadedArgIntrin(MI, MRI, B, 4222 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4223 case Intrinsic::amdgcn_dispatch_ptr: 4224 return legalizePreloadedArgIntrin(MI, MRI, B, 4225 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4226 case Intrinsic::amdgcn_queue_ptr: 4227 return legalizePreloadedArgIntrin(MI, MRI, B, 4228 AMDGPUFunctionArgInfo::QUEUE_PTR); 4229 case Intrinsic::amdgcn_implicit_buffer_ptr: 4230 return legalizePreloadedArgIntrin( 4231 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4232 case Intrinsic::amdgcn_dispatch_id: 4233 return legalizePreloadedArgIntrin(MI, MRI, B, 4234 AMDGPUFunctionArgInfo::DISPATCH_ID); 4235 case Intrinsic::amdgcn_fdiv_fast: 4236 return legalizeFDIVFastIntrin(MI, MRI, B); 4237 case Intrinsic::amdgcn_is_shared: 4238 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4239 case Intrinsic::amdgcn_is_private: 4240 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4241 case Intrinsic::amdgcn_wavefrontsize: { 4242 B.setInstr(MI); 4243 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4244 MI.eraseFromParent(); 4245 return true; 4246 } 4247 case 
Intrinsic::amdgcn_s_buffer_load: 4248 return legalizeSBufferLoad(MI, B, Observer); 4249 case Intrinsic::amdgcn_raw_buffer_store: 4250 case Intrinsic::amdgcn_struct_buffer_store: 4251 return legalizeBufferStore(MI, MRI, B, false, false); 4252 case Intrinsic::amdgcn_raw_buffer_store_format: 4253 case Intrinsic::amdgcn_struct_buffer_store_format: 4254 return legalizeBufferStore(MI, MRI, B, false, true); 4255 case Intrinsic::amdgcn_raw_tbuffer_store: 4256 case Intrinsic::amdgcn_struct_tbuffer_store: 4257 return legalizeBufferStore(MI, MRI, B, true, true); 4258 case Intrinsic::amdgcn_raw_buffer_load: 4259 case Intrinsic::amdgcn_struct_buffer_load: 4260 return legalizeBufferLoad(MI, MRI, B, false, false); 4261 case Intrinsic::amdgcn_raw_buffer_load_format: 4262 case Intrinsic::amdgcn_struct_buffer_load_format: 4263 return legalizeBufferLoad(MI, MRI, B, true, false); 4264 case Intrinsic::amdgcn_raw_tbuffer_load: 4265 case Intrinsic::amdgcn_struct_tbuffer_load: 4266 return legalizeBufferLoad(MI, MRI, B, true, true); 4267 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4268 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4269 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4270 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4271 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4272 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4273 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4274 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4275 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4276 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4277 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4278 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4279 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4280 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4281 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4282 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4283 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4284 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4285 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4286 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4287 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4288 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4289 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4290 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4291 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4292 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4293 return legalizeBufferAtomic(MI, B, IntrID); 4294 case Intrinsic::amdgcn_atomic_inc: 4295 return legalizeAtomicIncDec(MI, B, true); 4296 case Intrinsic::amdgcn_atomic_dec: 4297 return legalizeAtomicIncDec(MI, B, false); 4298 case Intrinsic::trap: 4299 return legalizeTrapIntrinsic(MI, MRI, B); 4300 case Intrinsic::debugtrap: 4301 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4302 default: { 4303 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4304 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4305 return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr); 4306 return true; 4307 } 4308 } 4309 4310 return true; 4311 } 4312