//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32
// bits.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
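// For example, v2s32, s64, and v2s16 satisfy this predicate, while v3s16 (an
// odd number of 16-bit elements) and s48 do not.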
153 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 154 return [=](const LegalityQuery &Query) { 155 const LLT Ty = Query.Types[TypeIdx]; 156 if (Ty.isVector()) { 157 const int EltSize = Ty.getElementType().getSizeInBits(); 158 return EltSize == 32 || EltSize == 64 || 159 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 160 EltSize == 128 || EltSize == 256; 161 } 162 163 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 164 }; 165 } 166 167 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 168 return [=](const LegalityQuery &Query) { 169 const LLT QueryTy = Query.Types[TypeIdx]; 170 return QueryTy.isVector() && QueryTy.getElementType() == Type; 171 }; 172 } 173 174 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 175 return [=](const LegalityQuery &Query) { 176 const LLT QueryTy = Query.Types[TypeIdx]; 177 if (!QueryTy.isVector()) 178 return false; 179 const LLT EltTy = QueryTy.getElementType(); 180 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 181 }; 182 } 183 184 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 185 return [=](const LegalityQuery &Query) { 186 const LLT Ty = Query.Types[TypeIdx]; 187 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 188 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 189 }; 190 } 191 192 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) { 193 return [=](const LegalityQuery &Query) { 194 return Query.Types[TypeIdx0].getSizeInBits() < 195 Query.Types[TypeIdx1].getSizeInBits(); 196 }; 197 } 198 199 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) { 200 return [=](const LegalityQuery &Query) { 201 return Query.Types[TypeIdx0].getSizeInBits() > 202 Query.Types[TypeIdx1].getSizeInBits(); 203 }; 204 } 205 206 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 207 const GCNTargetMachine &TM) 208 : ST(ST_) { 209 using namespace TargetOpcode; 210 211 auto GetAddrSpacePtr = [&TM](unsigned AS) { 212 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 213 }; 214 215 const LLT S1 = LLT::scalar(1); 216 const LLT S16 = LLT::scalar(16); 217 const LLT S32 = LLT::scalar(32); 218 const LLT S64 = LLT::scalar(64); 219 const LLT S128 = LLT::scalar(128); 220 const LLT S256 = LLT::scalar(256); 221 const LLT S512 = LLT::scalar(512); 222 const LLT S1024 = LLT::scalar(1024); 223 224 const LLT V2S16 = LLT::vector(2, 16); 225 const LLT V4S16 = LLT::vector(4, 16); 226 227 const LLT V2S32 = LLT::vector(2, 32); 228 const LLT V3S32 = LLT::vector(3, 32); 229 const LLT V4S32 = LLT::vector(4, 32); 230 const LLT V5S32 = LLT::vector(5, 32); 231 const LLT V6S32 = LLT::vector(6, 32); 232 const LLT V7S32 = LLT::vector(7, 32); 233 const LLT V8S32 = LLT::vector(8, 32); 234 const LLT V9S32 = LLT::vector(9, 32); 235 const LLT V10S32 = LLT::vector(10, 32); 236 const LLT V11S32 = LLT::vector(11, 32); 237 const LLT V12S32 = LLT::vector(12, 32); 238 const LLT V13S32 = LLT::vector(13, 32); 239 const LLT V14S32 = LLT::vector(14, 32); 240 const LLT V15S32 = LLT::vector(15, 32); 241 const LLT V16S32 = LLT::vector(16, 32); 242 const LLT V32S32 = LLT::vector(32, 32); 243 244 const LLT V2S64 = LLT::vector(2, 64); 245 const LLT V3S64 = LLT::vector(3, 64); 246 const LLT V4S64 = LLT::vector(4, 64); 247 const LLT V5S64 = LLT::vector(5, 64); 248 const LLT V6S64 = LLT::vector(6, 64); 249 const LLT V7S64 = LLT::vector(7, 64); 250 const LLT V8S64 = LLT::vector(8, 64); 251 const LLT V16S64 = LLT::vector(16, 64); 252 253 std::initializer_list<LLT> 
AllS32Vectors = 254 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 255 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 256 std::initializer_list<LLT> AllS64Vectors = 257 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 258 259 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 260 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 261 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 262 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 263 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 264 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 265 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 266 267 const LLT CodePtr = FlatPtr; 268 269 const std::initializer_list<LLT> AddrSpaces64 = { 270 GlobalPtr, ConstantPtr, FlatPtr 271 }; 272 273 const std::initializer_list<LLT> AddrSpaces32 = { 274 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 275 }; 276 277 const std::initializer_list<LLT> FPTypesBase = { 278 S32, S64 279 }; 280 281 const std::initializer_list<LLT> FPTypes16 = { 282 S32, S64, S16 283 }; 284 285 const std::initializer_list<LLT> FPTypesPK16 = { 286 S32, S64, S16, V2S16 287 }; 288 289 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 290 291 setAction({G_BRCOND, S1}, Legal); // VCC branches 292 setAction({G_BRCOND, S32}, Legal); // SCC branches 293 294 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 295 // elements for v3s16 296 getActionDefinitionsBuilder(G_PHI) 297 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 298 .legalFor(AllS32Vectors) 299 .legalFor(AllS64Vectors) 300 .legalFor(AddrSpaces64) 301 .legalFor(AddrSpaces32) 302 .clampScalar(0, S32, S256) 303 .widenScalarToNextPow2(0, 32) 304 .clampMaxNumElements(0, S32, 16) 305 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 306 .legalIf(isPointer(0)); 307 308 if (ST.hasVOP3PInsts()) { 309 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 310 .legalFor({S32, S16, V2S16}) 311 .clampScalar(0, S16, S32) 312 .clampMaxNumElements(0, S16, 2) 313 .scalarize(0) 314 .widenScalarToNextPow2(0, 32); 315 } else if (ST.has16BitInsts()) { 316 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 317 .legalFor({S32, S16}) 318 .clampScalar(0, S16, S32) 319 .scalarize(0) 320 .widenScalarToNextPow2(0, 32); 321 } else { 322 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 323 .legalFor({S32}) 324 .clampScalar(0, S32, S32) 325 .scalarize(0); 326 } 327 328 // FIXME: Not really legal. Placeholder for custom lowering. 329 getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 330 .customFor({S32, S64}) 331 .clampScalar(0, S32, S64) 332 .widenScalarToNextPow2(0, 32) 333 .scalarize(0); 334 335 getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 336 .legalFor({S32}) 337 .clampScalar(0, S32, S32) 338 .scalarize(0); 339 340 // Report legal for any types we can handle anywhere. For the cases only legal 341 // on the SALU, RegBankSelect will be able to re-legalize. 
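  // e.g. an s1 AND of condition values is legal as-is, while a v4s32 AND is
  // first split into 64-bit pieces by the fewerElementsIf rule below.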
342 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 343 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 344 .clampScalar(0, S32, S64) 345 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 346 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 347 .widenScalarToNextPow2(0) 348 .scalarize(0); 349 350 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 351 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 352 .legalFor({{S32, S1}, {S32, S32}}) 353 .minScalar(0, S32) 354 // TODO: .scalarize(0) 355 .lower(); 356 357 getActionDefinitionsBuilder(G_BITCAST) 358 // Don't worry about the size constraint. 359 .legalIf(all(isRegisterType(0), isRegisterType(1))) 360 .lower(); 361 362 363 getActionDefinitionsBuilder(G_CONSTANT) 364 .legalFor({S1, S32, S64, S16, GlobalPtr, 365 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 366 .clampScalar(0, S32, S64) 367 .widenScalarToNextPow2(0) 368 .legalIf(isPointer(0)); 369 370 getActionDefinitionsBuilder(G_FCONSTANT) 371 .legalFor({S32, S64, S16}) 372 .clampScalar(0, S16, S64); 373 374 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 375 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 376 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 377 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 378 .clampScalarOrElt(0, S32, S1024) 379 .legalIf(isMultiple32(0)) 380 .widenScalarToNextPow2(0, 32) 381 .clampMaxNumElements(0, S32, 16); 382 383 setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 384 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 385 .unsupportedFor({PrivatePtr}) 386 .custom(); 387 setAction({G_BLOCK_ADDR, CodePtr}, Legal); 388 389 auto &FPOpActions = getActionDefinitionsBuilder( 390 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 391 .legalFor({S32, S64}); 392 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 393 .customFor({S32, S64}); 394 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 395 .customFor({S32, S64}); 396 397 if (ST.has16BitInsts()) { 398 if (ST.hasVOP3PInsts()) 399 FPOpActions.legalFor({S16, V2S16}); 400 else 401 FPOpActions.legalFor({S16}); 402 403 TrigActions.customFor({S16}); 404 FDIVActions.customFor({S16}); 405 } 406 407 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 408 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 409 410 if (ST.hasVOP3PInsts()) { 411 MinNumMaxNum.customFor(FPTypesPK16) 412 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 413 .clampMaxNumElements(0, S16, 2) 414 .clampScalar(0, S16, S64) 415 .scalarize(0); 416 } else if (ST.has16BitInsts()) { 417 MinNumMaxNum.customFor(FPTypes16) 418 .clampScalar(0, S16, S64) 419 .scalarize(0); 420 } else { 421 MinNumMaxNum.customFor(FPTypesBase) 422 .clampScalar(0, S32, S64) 423 .scalarize(0); 424 } 425 426 if (ST.hasVOP3PInsts()) 427 FPOpActions.clampMaxNumElements(0, S16, 2); 428 429 FPOpActions 430 .scalarize(0) 431 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 432 433 TrigActions 434 .scalarize(0) 435 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 436 437 FDIVActions 438 .scalarize(0) 439 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 440 441 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 442 .legalFor(FPTypesPK16) 443 .clampMaxNumElements(0, S16, 2) 444 .scalarize(0) 445 .clampScalar(0, S16, S64); 446 447 if (ST.has16BitInsts()) { 448 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 449 .legalFor({S32, S64, S16}) 450 .scalarize(0) 451 .clampScalar(0, S16, S64); 452 } else { 453 getActionDefinitionsBuilder(G_FSQRT) 454 .legalFor({S32, S64}) 455 .scalarize(0) 456 .clampScalar(0, S32, S64); 457 458 if (ST.hasFractBug()) { 459 getActionDefinitionsBuilder(G_FFLOOR) 460 .customFor({S64}) 461 .legalFor({S32, S64}) 462 .scalarize(0) 463 .clampScalar(0, S32, S64); 464 } else { 465 getActionDefinitionsBuilder(G_FFLOOR) 466 .legalFor({S32, S64}) 467 .scalarize(0) 468 .clampScalar(0, S32, S64); 469 } 470 } 471 472 getActionDefinitionsBuilder(G_FPTRUNC) 473 .legalFor({{S32, S64}, {S16, S32}}) 474 .scalarize(0) 475 .lower(); 476 477 getActionDefinitionsBuilder(G_FPEXT) 478 .legalFor({{S64, S32}, {S32, S16}}) 479 .lowerFor({{S64, S16}}) // FIXME: Implement 480 .scalarize(0); 481 482 getActionDefinitionsBuilder(G_FSUB) 483 // Use actual fsub instruction 484 .legalFor({S32}) 485 // Must use fadd + fneg 486 .lowerFor({S64, S16, V2S16}) 487 .scalarize(0) 488 .clampScalar(0, S32, S64); 489 490 // Whether this is legal depends on the floating point mode for the function. 491 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 492 if (ST.hasMadF16()) 493 FMad.customFor({S32, S16}); 494 else 495 FMad.customFor({S32}); 496 FMad.scalarize(0) 497 .lower(); 498 499 // TODO: Do we need to clamp maximum bitwidth? 500 getActionDefinitionsBuilder(G_TRUNC) 501 .legalIf(isScalar(0)) 502 .legalFor({{V2S16, V2S32}}) 503 .clampMaxNumElements(0, S16, 2) 504 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 505 // situations (like an invalid implicit use), we don't want to infinite loop 506 // in the legalizer. 507 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 508 .alwaysLegal(); 509 510 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 511 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 512 {S32, S1}, {S64, S1}, {S16, S1}}) 513 .scalarize(0) 514 .clampScalar(0, S32, S64) 515 .widenScalarToNextPow2(1, 32); 516 517 // TODO: Split s1->s64 during regbankselect for VALU. 
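  // Source types narrower than 32 bits (other than s1, which is lowered) are
  // widened to s32 by the clampScalar below.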
518 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 519 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 520 .lowerFor({{S32, S64}}) 521 .lowerIf(typeIs(1, S1)) 522 .customFor({{S64, S64}}); 523 if (ST.has16BitInsts()) 524 IToFP.legalFor({{S16, S16}}); 525 IToFP.clampScalar(1, S32, S64) 526 .scalarize(0) 527 .widenScalarToNextPow2(1); 528 529 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 530 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 531 .customFor({{S64, S64}}); 532 if (ST.has16BitInsts()) 533 FPToI.legalFor({{S16, S16}}); 534 else 535 FPToI.minScalar(1, S32); 536 537 FPToI.minScalar(0, S32) 538 .scalarize(0) 539 .lower(); 540 541 getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 542 .scalarize(0) 543 .lower(); 544 545 if (ST.has16BitInsts()) { 546 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 547 .legalFor({S16, S32, S64}) 548 .clampScalar(0, S16, S64) 549 .scalarize(0); 550 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 551 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 552 .legalFor({S32, S64}) 553 .clampScalar(0, S32, S64) 554 .scalarize(0); 555 } else { 556 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 557 .legalFor({S32}) 558 .customFor({S64}) 559 .clampScalar(0, S32, S64) 560 .scalarize(0); 561 } 562 563 getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK}) 564 .scalarize(0) 565 .alwaysLegal(); 566 567 auto &CmpBuilder = 568 getActionDefinitionsBuilder(G_ICMP) 569 // The compare output type differs based on the register bank of the output, 570 // so make both s1 and s32 legal. 571 // 572 // Scalar compares producing output in scc will be promoted to s32, as that 573 // is the allocatable register type that will be needed for the copy from 574 // scc. This will be promoted during RegBankSelect, and we assume something 575 // before that won't try to use s32 result types. 576 // 577 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 578 // bank. 579 .legalForCartesianProduct( 580 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 581 .legalForCartesianProduct( 582 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 583 if (ST.has16BitInsts()) { 584 CmpBuilder.legalFor({{S1, S16}}); 585 } 586 587 CmpBuilder 588 .widenScalarToNextPow2(1) 589 .clampScalar(1, S32, S64) 590 .scalarize(0) 591 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 592 593 getActionDefinitionsBuilder(G_FCMP) 594 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 595 .widenScalarToNextPow2(1) 596 .clampScalar(1, S32, S64) 597 .scalarize(0); 598 599 // FIXME: fpow has a selection pattern that should move to custom lowering. 600 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 601 if (ST.has16BitInsts()) 602 Exp2Ops.legalFor({S32, S16}); 603 else 604 Exp2Ops.legalFor({S32}); 605 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 606 Exp2Ops.scalarize(0); 607 608 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 609 if (ST.has16BitInsts()) 610 ExpOps.customFor({{S32}, {S16}}); 611 else 612 ExpOps.customFor({S32}); 613 ExpOps.clampScalar(0, MinScalarFPTy, S32) 614 .scalarize(0); 615 616 // The 64-bit versions produce 32-bit results, but only on the SALU. 
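  // e.g. G_CTPOP with an s64 source and an s32 result is legal here; narrower
  // sources are first widened to s32.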
617 getActionDefinitionsBuilder(G_CTPOP) 618 .legalFor({{S32, S32}, {S32, S64}}) 619 .clampScalar(0, S32, S32) 620 .clampScalar(1, S32, S64) 621 .scalarize(0) 622 .widenScalarToNextPow2(0, 32) 623 .widenScalarToNextPow2(1, 32); 624 625 // The hardware instructions return a different result on 0 than the generic 626 // instructions expect. The hardware produces -1, but these produce the 627 // bitwidth. 628 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 629 .scalarize(0) 630 .clampScalar(0, S32, S32) 631 .clampScalar(1, S32, S64) 632 .widenScalarToNextPow2(0, 32) 633 .widenScalarToNextPow2(1, 32) 634 .lower(); 635 636 // The 64-bit versions produce 32-bit results, but only on the SALU. 637 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 638 .legalFor({{S32, S32}, {S32, S64}}) 639 .clampScalar(0, S32, S32) 640 .clampScalar(1, S32, S64) 641 .scalarize(0) 642 .widenScalarToNextPow2(0, 32) 643 .widenScalarToNextPow2(1, 32); 644 645 getActionDefinitionsBuilder(G_BITREVERSE) 646 .legalFor({S32}) 647 .clampScalar(0, S32, S32) 648 .scalarize(0); 649 650 if (ST.has16BitInsts()) { 651 getActionDefinitionsBuilder(G_BSWAP) 652 .legalFor({S16, S32, V2S16}) 653 .clampMaxNumElements(0, S16, 2) 654 // FIXME: Fixing non-power-of-2 before clamp is workaround for 655 // narrowScalar limitation. 656 .widenScalarToNextPow2(0) 657 .clampScalar(0, S16, S32) 658 .scalarize(0); 659 660 if (ST.hasVOP3PInsts()) { 661 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 662 .legalFor({S32, S16, V2S16}) 663 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 664 .clampMaxNumElements(0, S16, 2) 665 .minScalar(0, S16) 666 .widenScalarToNextPow2(0) 667 .scalarize(0) 668 .lower(); 669 } else { 670 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 671 .legalFor({S32, S16}) 672 .widenScalarToNextPow2(0) 673 .minScalar(0, S16) 674 .scalarize(0) 675 .lower(); 676 } 677 } else { 678 // TODO: Should have same legality without v_perm_b32 679 getActionDefinitionsBuilder(G_BSWAP) 680 .legalFor({S32}) 681 .lowerIf(narrowerThan(0, 32)) 682 // FIXME: Fixing non-power-of-2 before clamp is workaround for 683 // narrowScalar limitation. 
684 .widenScalarToNextPow2(0) 685 .maxScalar(0, S32) 686 .scalarize(0) 687 .lower(); 688 689 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 690 .legalFor({S32}) 691 .minScalar(0, S32) 692 .widenScalarToNextPow2(0) 693 .scalarize(0) 694 .lower(); 695 } 696 697 getActionDefinitionsBuilder(G_INTTOPTR) 698 // List the common cases 699 .legalForCartesianProduct(AddrSpaces64, {S64}) 700 .legalForCartesianProduct(AddrSpaces32, {S32}) 701 .scalarize(0) 702 // Accept any address space as long as the size matches 703 .legalIf(sameSize(0, 1)) 704 .widenScalarIf(smallerThan(1, 0), 705 [](const LegalityQuery &Query) { 706 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 707 }) 708 .narrowScalarIf(greaterThan(1, 0), 709 [](const LegalityQuery &Query) { 710 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 711 }); 712 713 getActionDefinitionsBuilder(G_PTRTOINT) 714 // List the common cases 715 .legalForCartesianProduct(AddrSpaces64, {S64}) 716 .legalForCartesianProduct(AddrSpaces32, {S32}) 717 .scalarize(0) 718 // Accept any address space as long as the size matches 719 .legalIf(sameSize(0, 1)) 720 .widenScalarIf(smallerThan(0, 1), 721 [](const LegalityQuery &Query) { 722 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 723 }) 724 .narrowScalarIf( 725 greaterThan(0, 1), 726 [](const LegalityQuery &Query) { 727 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 728 }); 729 730 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 731 .scalarize(0) 732 .custom(); 733 734 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 735 // handle some operations by just promoting the register during 736 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 737 auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned { 738 switch (AS) { 739 // FIXME: Private element size. 740 case AMDGPUAS::PRIVATE_ADDRESS: 741 return 32; 742 // FIXME: Check subtarget 743 case AMDGPUAS::LOCAL_ADDRESS: 744 return ST.useDS128() ? 128 : 64; 745 746 // Treat constant and global as identical. SMRD loads are sometimes usable 747 // for global loads (ideally constant address space should be eliminated) 748 // depending on the context. Legality cannot be context dependent, but 749 // RegBankSelect can split the load as necessary depending on the pointer 750 // register bank/uniformity and if the memory is invariant or not written in 751 // a kernel. 752 case AMDGPUAS::CONSTANT_ADDRESS: 753 case AMDGPUAS::GLOBAL_ADDRESS: 754 return IsLoad ? 512 : 128; 755 default: 756 return 128; 757 } 758 }; 759 760 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 761 bool IsLoad) -> bool { 762 const LLT DstTy = Query.Types[0]; 763 764 // Split vector extloads. 765 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 766 unsigned Align = Query.MMODescrs[0].AlignInBits; 767 768 if (MemSize < DstTy.getSizeInBits()) 769 MemSize = std::max(MemSize, Align); 770 771 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 772 return true; 773 774 const LLT PtrTy = Query.Types[1]; 775 unsigned AS = PtrTy.getAddressSpace(); 776 if (MemSize > maxSizeForAddrSpace(AS, IsLoad)) 777 return true; 778 779 // Catch weird sized loads that don't evenly divide into the access sizes 780 // TODO: May be able to widen depending on alignment etc. 
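    // e.g. a 96-bit access needs 3 dwords, which is only kept whole if the
    // subtarget has dwordx3 load/stores.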
781 unsigned NumRegs = (MemSize + 31) / 32; 782 if (NumRegs == 3) { 783 if (!ST.hasDwordx3LoadStores()) 784 return true; 785 } else { 786 // If the alignment allows, these should have been widened. 787 if (!isPowerOf2_32(NumRegs)) 788 return true; 789 } 790 791 if (Align < MemSize) { 792 const SITargetLowering *TLI = ST.getTargetLowering(); 793 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 794 } 795 796 return false; 797 }; 798 799 const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool { 800 unsigned Size = Query.Types[0].getSizeInBits(); 801 if (isPowerOf2_32(Size)) 802 return false; 803 804 if (Size == 96 && ST.hasDwordx3LoadStores()) 805 return false; 806 807 unsigned AddrSpace = Query.Types[1].getAddressSpace(); 808 if (Size >= maxSizeForAddrSpace(AddrSpace, true)) 809 return false; 810 811 unsigned Align = Query.MMODescrs[0].AlignInBits; 812 unsigned RoundedSize = NextPowerOf2(Size); 813 return (Align >= RoundedSize); 814 }; 815 816 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 817 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 818 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 819 820 // TODO: Refine based on subtargets which support unaligned access or 128-bit 821 // LDS 822 // TODO: Unsupported flat for SI. 823 824 for (unsigned Op : {G_LOAD, G_STORE}) { 825 const bool IsStore = Op == G_STORE; 826 827 auto &Actions = getActionDefinitionsBuilder(Op); 828 // Whitelist the common cases. 829 // TODO: Loads to s16 on gfx9 830 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 831 {V2S32, GlobalPtr, 64, GlobalAlign32}, 832 {V4S32, GlobalPtr, 128, GlobalAlign32}, 833 {S128, GlobalPtr, 128, GlobalAlign32}, 834 {S64, GlobalPtr, 64, GlobalAlign32}, 835 {V2S64, GlobalPtr, 128, GlobalAlign32}, 836 {V2S16, GlobalPtr, 32, GlobalAlign32}, 837 {S32, GlobalPtr, 8, GlobalAlign8}, 838 {S32, GlobalPtr, 16, GlobalAlign16}, 839 840 {S32, LocalPtr, 32, 32}, 841 {S64, LocalPtr, 64, 32}, 842 {V2S32, LocalPtr, 64, 32}, 843 {S32, LocalPtr, 8, 8}, 844 {S32, LocalPtr, 16, 16}, 845 {V2S16, LocalPtr, 32, 32}, 846 847 {S32, PrivatePtr, 32, 32}, 848 {S32, PrivatePtr, 8, 8}, 849 {S32, PrivatePtr, 16, 16}, 850 {V2S16, PrivatePtr, 32, 32}, 851 852 {S32, FlatPtr, 32, GlobalAlign32}, 853 {S32, FlatPtr, 16, GlobalAlign16}, 854 {S32, FlatPtr, 8, GlobalAlign8}, 855 {V2S16, FlatPtr, 32, GlobalAlign32}, 856 857 {S32, ConstantPtr, 32, GlobalAlign32}, 858 {V2S32, ConstantPtr, 64, GlobalAlign32}, 859 {V4S32, ConstantPtr, 128, GlobalAlign32}, 860 {S64, ConstantPtr, 64, GlobalAlign32}, 861 {S128, ConstantPtr, 128, GlobalAlign32}, 862 {V2S32, ConstantPtr, 32, GlobalAlign32}}); 863 Actions 864 .customIf(typeIs(1, Constant32Ptr)) 865 // Widen suitably aligned loads by loading extra elements. 
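      // (e.g. a sufficiently aligned v3s32 global load may become v4s32 on
      // subtargets without dwordx3 load/stores.)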
866 .moreElementsIf([=](const LegalityQuery &Query) { 867 const LLT Ty = Query.Types[0]; 868 return Op == G_LOAD && Ty.isVector() && 869 shouldWidenLoadResult(Query); 870 }, moreElementsToNextPow2(0)) 871 .widenScalarIf([=](const LegalityQuery &Query) { 872 const LLT Ty = Query.Types[0]; 873 return Op == G_LOAD && !Ty.isVector() && 874 shouldWidenLoadResult(Query); 875 }, widenScalarOrEltToNextPow2(0)) 876 .narrowScalarIf( 877 [=](const LegalityQuery &Query) -> bool { 878 return !Query.Types[0].isVector() && 879 needToSplitMemOp(Query, Op == G_LOAD); 880 }, 881 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 882 const LLT DstTy = Query.Types[0]; 883 const LLT PtrTy = Query.Types[1]; 884 885 const unsigned DstSize = DstTy.getSizeInBits(); 886 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 887 888 // Split extloads. 889 if (DstSize > MemSize) 890 return std::make_pair(0, LLT::scalar(MemSize)); 891 892 if (!isPowerOf2_32(DstSize)) { 893 // We're probably decomposing an odd sized store. Try to split 894 // to the widest type. TODO: Account for alignment. As-is it 895 // should be OK, since the new parts will be further legalized. 896 unsigned FloorSize = PowerOf2Floor(DstSize); 897 return std::make_pair(0, LLT::scalar(FloorSize)); 898 } 899 900 if (DstSize > 32 && (DstSize % 32 != 0)) { 901 // FIXME: Need a way to specify non-extload of larger size if 902 // suitably aligned. 903 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 904 } 905 906 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 907 Op == G_LOAD); 908 if (MemSize > MaxSize) 909 return std::make_pair(0, LLT::scalar(MaxSize)); 910 911 unsigned Align = Query.MMODescrs[0].AlignInBits; 912 return std::make_pair(0, LLT::scalar(Align)); 913 }) 914 .fewerElementsIf( 915 [=](const LegalityQuery &Query) -> bool { 916 return Query.Types[0].isVector() && 917 needToSplitMemOp(Query, Op == G_LOAD); 918 }, 919 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 920 const LLT DstTy = Query.Types[0]; 921 const LLT PtrTy = Query.Types[1]; 922 923 LLT EltTy = DstTy.getElementType(); 924 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(), 925 Op == G_LOAD); 926 927 // FIXME: Handle widened to power of 2 results better. This ends 928 // up scalarizing. 929 // FIXME: 3 element stores scalarized on SI 930 931 // Split if it's too large for the address space. 932 if (Query.MMODescrs[0].SizeInBits > MaxSize) { 933 unsigned NumElts = DstTy.getNumElements(); 934 unsigned EltSize = EltTy.getSizeInBits(); 935 936 if (MaxSize % EltSize == 0) { 937 return std::make_pair( 938 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 939 } 940 941 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 942 943 // FIXME: Refine when odd breakdowns handled 944 // The scalars will need to be re-legalized. 945 if (NumPieces == 1 || NumPieces >= NumElts || 946 NumElts % NumPieces != 0) 947 return std::make_pair(0, EltTy); 948 949 return std::make_pair(0, 950 LLT::vector(NumElts / NumPieces, EltTy)); 951 } 952 953 // FIXME: We could probably handle weird extending loads better. 954 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 955 if (DstTy.getSizeInBits() > MemSize) 956 return std::make_pair(0, EltTy); 957 958 unsigned EltSize = EltTy.getSizeInBits(); 959 unsigned DstSize = DstTy.getSizeInBits(); 960 if (!isPowerOf2_32(DstSize)) { 961 // We're probably decomposing an odd sized store. Try to split 962 // to the widest type. TODO: Account for alignment. 
As-is it 963 // should be OK, since the new parts will be further legalized. 964 unsigned FloorSize = PowerOf2Floor(DstSize); 965 return std::make_pair( 966 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 967 } 968 969 // Need to split because of alignment. 970 unsigned Align = Query.MMODescrs[0].AlignInBits; 971 if (EltSize > Align && 972 (EltSize / Align < DstTy.getNumElements())) { 973 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 974 } 975 976 // May need relegalization for the scalars. 977 return std::make_pair(0, EltTy); 978 }) 979 .minScalar(0, S32); 980 981 if (IsStore) 982 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 983 984 // TODO: Need a bitcast lower option? 985 Actions 986 .legalIf([=](const LegalityQuery &Query) { 987 const LLT Ty0 = Query.Types[0]; 988 unsigned Size = Ty0.getSizeInBits(); 989 unsigned MemSize = Query.MMODescrs[0].SizeInBits; 990 unsigned Align = Query.MMODescrs[0].AlignInBits; 991 992 // FIXME: Widening store from alignment not valid. 993 if (MemSize < Size) 994 MemSize = std::max(MemSize, Align); 995 996 // No extending vector loads. 997 if (Size > MemSize && Ty0.isVector()) 998 return false; 999 1000 switch (MemSize) { 1001 case 8: 1002 case 16: 1003 return Size == 32; 1004 case 32: 1005 case 64: 1006 case 128: 1007 return true; 1008 case 96: 1009 return ST.hasDwordx3LoadStores(); 1010 case 256: 1011 case 512: 1012 return true; 1013 default: 1014 return false; 1015 } 1016 }) 1017 .widenScalarToNextPow2(0) 1018 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 1019 } 1020 1021 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1022 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 1023 {S32, GlobalPtr, 16, 2 * 8}, 1024 {S32, LocalPtr, 8, 8}, 1025 {S32, LocalPtr, 16, 16}, 1026 {S32, PrivatePtr, 8, 8}, 1027 {S32, PrivatePtr, 16, 16}, 1028 {S32, ConstantPtr, 8, 8}, 1029 {S32, ConstantPtr, 16, 2 * 8}}); 1030 if (ST.hasFlatAddressSpace()) { 1031 ExtLoads.legalForTypesWithMemDesc( 1032 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 1033 } 1034 1035 ExtLoads.clampScalar(0, S32, S32) 1036 .widenScalarToNextPow2(0) 1037 .unsupportedIfMemSizeNotPow2() 1038 .lower(); 1039 1040 auto &Atomics = getActionDefinitionsBuilder( 1041 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1042 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1043 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1044 G_ATOMICRMW_UMIN}) 1045 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1046 {S64, GlobalPtr}, {S64, LocalPtr}}); 1047 if (ST.hasFlatAddressSpace()) { 1048 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1049 } 1050 1051 getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1052 .legalFor({{S32, LocalPtr}}); 1053 1054 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1055 // demarshalling 1056 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1057 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1058 {S32, FlatPtr}, {S64, FlatPtr}}) 1059 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1060 {S32, RegionPtr}, {S64, RegionPtr}}); 1061 // TODO: Pointer types, any 32-bit or 64-bit vector 1062 1063 // Condition should be s32 for scalar, s1 for vector. 
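  // Both condition types are accepted for every legal value type below.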
1064 getActionDefinitionsBuilder(G_SELECT) 1065 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 1066 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1067 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 1068 .clampScalar(0, S16, S64) 1069 .scalarize(1) 1070 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1071 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1072 .clampMaxNumElements(0, S32, 2) 1073 .clampMaxNumElements(0, LocalPtr, 2) 1074 .clampMaxNumElements(0, PrivatePtr, 2) 1075 .scalarize(0) 1076 .widenScalarToNextPow2(0) 1077 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1078 1079 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1080 // be more flexible with the shift amount type. 1081 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1082 .legalFor({{S32, S32}, {S64, S32}}); 1083 if (ST.has16BitInsts()) { 1084 if (ST.hasVOP3PInsts()) { 1085 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1086 .clampMaxNumElements(0, S16, 2); 1087 } else 1088 Shifts.legalFor({{S16, S16}}); 1089 1090 // TODO: Support 16-bit shift amounts for all types 1091 Shifts.widenScalarIf( 1092 [=](const LegalityQuery &Query) { 1093 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1094 // 32-bit amount. 1095 const LLT ValTy = Query.Types[0]; 1096 const LLT AmountTy = Query.Types[1]; 1097 return ValTy.getSizeInBits() <= 16 && 1098 AmountTy.getSizeInBits() < 16; 1099 }, changeTo(1, S16)); 1100 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1101 Shifts.clampScalar(1, S32, S32); 1102 Shifts.clampScalar(0, S16, S64); 1103 Shifts.widenScalarToNextPow2(0, 16); 1104 } else { 1105 // Make sure we legalize the shift amount type first, as the general 1106 // expansion for the shifted type will produce much worse code if it hasn't 1107 // been truncated already. 1108 Shifts.clampScalar(1, S32, S32); 1109 Shifts.clampScalar(0, S32, S64); 1110 Shifts.widenScalarToNextPow2(0, 32); 1111 } 1112 Shifts.scalarize(0); 1113 1114 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1115 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1116 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1117 unsigned IdxTypeIdx = 2; 1118 1119 getActionDefinitionsBuilder(Op) 1120 .customIf([=](const LegalityQuery &Query) { 1121 const LLT EltTy = Query.Types[EltTypeIdx]; 1122 const LLT VecTy = Query.Types[VecTypeIdx]; 1123 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1124 return (EltTy.getSizeInBits() == 16 || 1125 EltTy.getSizeInBits() % 32 == 0) && 1126 VecTy.getSizeInBits() % 32 == 0 && 1127 VecTy.getSizeInBits() <= 1024 && 1128 IdxTy.getSizeInBits() == 32; 1129 }) 1130 .clampScalar(EltTypeIdx, S32, S64) 1131 .clampScalar(VecTypeIdx, S32, S64) 1132 .clampScalar(IdxTypeIdx, S32, S32); 1133 } 1134 1135 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1136 .unsupportedIf([=](const LegalityQuery &Query) { 1137 const LLT &EltTy = Query.Types[1].getElementType(); 1138 return Query.Types[0] != EltTy; 1139 }); 1140 1141 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1142 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1143 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1144 1145 // FIXME: Doesn't handle extract of illegal sizes. 1146 getActionDefinitionsBuilder(Op) 1147 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1148 // FIXME: Multiples of 16 should not be legal. 
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &BigTy = Query.Types[BigTyIdx];
        const LLT &LitTy = Query.Types[LitTyIdx];

        if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
          return false;
        if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
          return false;

        return BigTy.getSizeInBits() % 16 == 0 &&
               LitTy.getSizeInBits() % 16 == 0 &&
               BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1321 SextInReg.lowerFor({{S32}, {S64}}); 1322 } 1323 1324 SextInReg 1325 .scalarize(0) 1326 .clampScalar(0, S32, S64) 1327 .lower(); 1328 1329 getActionDefinitionsBuilder(G_FSHR) 1330 .legalFor({{S32, S32}}) 1331 .scalarize(0) 1332 .lower(); 1333 1334 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1335 .legalFor({S64}); 1336 1337 getActionDefinitionsBuilder({ 1338 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1339 G_FCOPYSIGN, 1340 1341 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1342 G_READ_REGISTER, 1343 G_WRITE_REGISTER, 1344 1345 G_SADDO, G_SSUBO, 1346 1347 // TODO: Implement 1348 G_FMINIMUM, G_FMAXIMUM, 1349 G_FSHL 1350 }).lower(); 1351 1352 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1353 G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1354 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1355 .unsupported(); 1356 1357 computeTables(); 1358 verify(*ST.getInstrInfo()); 1359 } 1360 1361 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 1362 MachineRegisterInfo &MRI, 1363 MachineIRBuilder &B, 1364 GISelChangeObserver &Observer) const { 1365 switch (MI.getOpcode()) { 1366 case TargetOpcode::G_ADDRSPACE_CAST: 1367 return legalizeAddrSpaceCast(MI, MRI, B); 1368 case TargetOpcode::G_FRINT: 1369 return legalizeFrint(MI, MRI, B); 1370 case TargetOpcode::G_FCEIL: 1371 return legalizeFceil(MI, MRI, B); 1372 case TargetOpcode::G_INTRINSIC_TRUNC: 1373 return legalizeIntrinsicTrunc(MI, MRI, B); 1374 case TargetOpcode::G_SITOFP: 1375 return legalizeITOFP(MI, MRI, B, true); 1376 case TargetOpcode::G_UITOFP: 1377 return legalizeITOFP(MI, MRI, B, false); 1378 case TargetOpcode::G_FPTOSI: 1379 return legalizeFPTOI(MI, MRI, B, true); 1380 case TargetOpcode::G_FPTOUI: 1381 return legalizeFPTOI(MI, MRI, B, false); 1382 case TargetOpcode::G_FMINNUM: 1383 case TargetOpcode::G_FMAXNUM: 1384 case TargetOpcode::G_FMINNUM_IEEE: 1385 case TargetOpcode::G_FMAXNUM_IEEE: 1386 return legalizeMinNumMaxNum(MI, MRI, B); 1387 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1388 return legalizeExtractVectorElt(MI, MRI, B); 1389 case TargetOpcode::G_INSERT_VECTOR_ELT: 1390 return legalizeInsertVectorElt(MI, MRI, B); 1391 case TargetOpcode::G_SHUFFLE_VECTOR: 1392 return legalizeShuffleVector(MI, MRI, B); 1393 case TargetOpcode::G_FSIN: 1394 case TargetOpcode::G_FCOS: 1395 return legalizeSinCos(MI, MRI, B); 1396 case TargetOpcode::G_GLOBAL_VALUE: 1397 return legalizeGlobalValue(MI, MRI, B); 1398 case TargetOpcode::G_LOAD: 1399 return legalizeLoad(MI, MRI, B, Observer); 1400 case TargetOpcode::G_FMAD: 1401 return legalizeFMad(MI, MRI, B); 1402 case TargetOpcode::G_FDIV: 1403 return legalizeFDIV(MI, MRI, B); 1404 case TargetOpcode::G_UDIV: 1405 case TargetOpcode::G_UREM: 1406 return legalizeUDIV_UREM(MI, MRI, B); 1407 case TargetOpcode::G_SDIV: 1408 case TargetOpcode::G_SREM: 1409 return legalizeSDIV_SREM(MI, MRI, B); 1410 case TargetOpcode::G_ATOMIC_CMPXCHG: 1411 return legalizeAtomicCmpXChg(MI, MRI, B); 1412 case TargetOpcode::G_FLOG: 1413 return legalizeFlog(MI, B, 1.0f / numbers::log2ef); 1414 case TargetOpcode::G_FLOG10: 1415 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1416 case TargetOpcode::G_FEXP: 1417 return legalizeFExp(MI, B); 1418 case TargetOpcode::G_FPOW: 1419 return legalizeFPow(MI, B); 1420 case TargetOpcode::G_FFLOOR: 1421 return legalizeFFloor(MI, MRI, B); 1422 case TargetOpcode::G_BUILD_VECTOR: 1423 return legalizeBuildVector(MI, MRI, B); 1424 default: 1425 return false; 1426 } 1427 1428 llvm_unreachable("expected switch to return"); 1429 } 1430 1431 Register 
AMDGPULegalizerInfo::getSegmentAperture( 1432 unsigned AS, 1433 MachineRegisterInfo &MRI, 1434 MachineIRBuilder &B) const { 1435 MachineFunction &MF = B.getMF(); 1436 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1437 const LLT S32 = LLT::scalar(32); 1438 1439 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1440 1441 if (ST.hasApertureRegs()) { 1442 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1443 // getreg. 1444 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1445 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1446 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1447 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1448 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1449 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1450 unsigned Encoding = 1451 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1452 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1453 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1454 1455 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1456 1457 B.buildInstr(AMDGPU::S_GETREG_B32) 1458 .addDef(GetReg) 1459 .addImm(Encoding); 1460 MRI.setType(GetReg, S32); 1461 1462 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1463 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1464 } 1465 1466 Register QueuePtr = MRI.createGenericVirtualRegister( 1467 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1468 1469 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1470 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1471 return Register(); 1472 1473 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1474 // private_segment_aperture_base_hi. 1475 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1476 1477 // TODO: can we be smarter about machine pointer info? 1478 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1479 MachineMemOperand *MMO = MF.getMachineMemOperand( 1480 PtrInfo, 1481 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1482 MachineMemOperand::MOInvariant, 1483 4, commonAlignment(Align(64), StructOffset)); 1484 1485 Register LoadAddr; 1486 1487 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1488 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1489 } 1490 1491 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1492 MachineInstr &MI, MachineRegisterInfo &MRI, 1493 MachineIRBuilder &B) const { 1494 MachineFunction &MF = B.getMF(); 1495 1496 B.setInstr(MI); 1497 1498 const LLT S32 = LLT::scalar(32); 1499 Register Dst = MI.getOperand(0).getReg(); 1500 Register Src = MI.getOperand(1).getReg(); 1501 1502 LLT DstTy = MRI.getType(Dst); 1503 LLT SrcTy = MRI.getType(Src); 1504 unsigned DestAS = DstTy.getAddressSpace(); 1505 unsigned SrcAS = SrcTy.getAddressSpace(); 1506 1507 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1508 // vector element. 1509 assert(!DstTy.isVector()); 1510 1511 const AMDGPUTargetMachine &TM 1512 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1513 1514 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1515 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1516 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1517 return true; 1518 } 1519 1520 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1521 // Truncate. 
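    // Casting to the 32-bit constant address space keeps only the low 32 bits
    // of the source pointer.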
1522 B.buildExtract(Dst, Src, 0); 1523 MI.eraseFromParent(); 1524 return true; 1525 } 1526 1527 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1528 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1529 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1530 1531 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1532 // another. Merge operands are required to be the same type, but creating an 1533 // extra ptrtoint would be kind of pointless. 1534 auto HighAddr = B.buildConstant( 1535 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1536 B.buildMerge(Dst, {Src, HighAddr}); 1537 MI.eraseFromParent(); 1538 return true; 1539 } 1540 1541 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1542 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1543 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1544 unsigned NullVal = TM.getNullPointerValue(DestAS); 1545 1546 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1547 auto FlatNull = B.buildConstant(SrcTy, 0); 1548 1549 // Extract low 32-bits of the pointer. 1550 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1551 1552 auto CmpRes = 1553 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1554 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1555 1556 MI.eraseFromParent(); 1557 return true; 1558 } 1559 1560 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1561 return false; 1562 1563 if (!ST.hasFlatAddressSpace()) 1564 return false; 1565 1566 auto SegmentNull = 1567 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1568 auto FlatNull = 1569 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1570 1571 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1572 if (!ApertureReg.isValid()) 1573 return false; 1574 1575 auto CmpRes = 1576 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1577 1578 // Coerce the type of the low half of the result so we can use merge_values. 1579 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1580 1581 // TODO: Should we allow mismatched types but matching sizes in merges to 1582 // avoid the ptrtoint? 1583 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1584 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1585 1586 MI.eraseFromParent(); 1587 return true; 1588 } 1589 1590 bool AMDGPULegalizerInfo::legalizeFrint( 1591 MachineInstr &MI, MachineRegisterInfo &MRI, 1592 MachineIRBuilder &B) const { 1593 B.setInstr(MI); 1594 1595 Register Src = MI.getOperand(1).getReg(); 1596 LLT Ty = MRI.getType(Src); 1597 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1598 1599 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1600 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1601 1602 auto C1 = B.buildFConstant(Ty, C1Val); 1603 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1604 1605 // TODO: Should this propagate fast-math-flags? 
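  // Adding and then subtracting a copysigned 2^52 rounds the f64 value to an
  // integer; sources with |src| already >= 2^52 are returned unchanged by the
  // select below.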
1606 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1607 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1608 1609 auto C2 = B.buildFConstant(Ty, C2Val); 1610 auto Fabs = B.buildFAbs(Ty, Src); 1611 1612 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1613 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1614 return true; 1615 } 1616 1617 bool AMDGPULegalizerInfo::legalizeFceil( 1618 MachineInstr &MI, MachineRegisterInfo &MRI, 1619 MachineIRBuilder &B) const { 1620 B.setInstr(MI); 1621 1622 const LLT S1 = LLT::scalar(1); 1623 const LLT S64 = LLT::scalar(64); 1624 1625 Register Src = MI.getOperand(1).getReg(); 1626 assert(MRI.getType(Src) == S64); 1627 1628 // result = trunc(src) 1629 // if (src > 0.0 && src != result) 1630 // result += 1.0 1631 1632 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1633 1634 const auto Zero = B.buildFConstant(S64, 0.0); 1635 const auto One = B.buildFConstant(S64, 1.0); 1636 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1637 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1638 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1639 auto Add = B.buildSelect(S64, And, One, Zero); 1640 1641 // TODO: Should this propagate fast-math-flags? 1642 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1643 return true; 1644 } 1645 1646 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1647 MachineIRBuilder &B) { 1648 const unsigned FractBits = 52; 1649 const unsigned ExpBits = 11; 1650 LLT S32 = LLT::scalar(32); 1651 1652 auto Const0 = B.buildConstant(S32, FractBits - 32); 1653 auto Const1 = B.buildConstant(S32, ExpBits); 1654 1655 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1656 .addUse(Const0.getReg(0)) 1657 .addUse(Const1.getReg(0)); 1658 1659 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1660 } 1661 1662 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1663 MachineInstr &MI, MachineRegisterInfo &MRI, 1664 MachineIRBuilder &B) const { 1665 B.setInstr(MI); 1666 1667 const LLT S1 = LLT::scalar(1); 1668 const LLT S32 = LLT::scalar(32); 1669 const LLT S64 = LLT::scalar(64); 1670 1671 Register Src = MI.getOperand(1).getReg(); 1672 assert(MRI.getType(Src) == S64); 1673 1674 // TODO: Should this use extract since the low half is unused? 1675 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1676 Register Hi = Unmerge.getReg(1); 1677 1678 // Extract the upper half, since this is where we will find the sign and 1679 // exponent. 1680 auto Exp = extractF64Exponent(Hi, B); 1681 1682 const unsigned FractBits = 52; 1683 1684 // Extract the sign bit. 1685 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1686 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1687 1688 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1689 1690 const auto Zero32 = B.buildConstant(S32, 0); 1691 1692 // Extend back to 64-bits. 
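  // The sign bit belongs in the high 32 bits, so merge it above a zero low
  // half.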
1693 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1694 1695 auto Shr = B.buildAShr(S64, FractMask, Exp); 1696 auto Not = B.buildNot(S64, Shr); 1697 auto Tmp0 = B.buildAnd(S64, Src, Not); 1698 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1699 1700 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1701 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1702 1703 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1704 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1705 return true; 1706 } 1707 1708 bool AMDGPULegalizerInfo::legalizeITOFP( 1709 MachineInstr &MI, MachineRegisterInfo &MRI, 1710 MachineIRBuilder &B, bool Signed) const { 1711 B.setInstr(MI); 1712 1713 Register Dst = MI.getOperand(0).getReg(); 1714 Register Src = MI.getOperand(1).getReg(); 1715 1716 const LLT S64 = LLT::scalar(64); 1717 const LLT S32 = LLT::scalar(32); 1718 1719 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1720 1721 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1722 1723 auto CvtHi = Signed ? 1724 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1725 B.buildUITOFP(S64, Unmerge.getReg(1)); 1726 1727 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1728 1729 auto ThirtyTwo = B.buildConstant(S32, 32); 1730 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1731 .addUse(CvtHi.getReg(0)) 1732 .addUse(ThirtyTwo.getReg(0)); 1733 1734 // TODO: Should this propagate fast-math-flags? 1735 B.buildFAdd(Dst, LdExp, CvtLo); 1736 MI.eraseFromParent(); 1737 return true; 1738 } 1739 1740 // TODO: Copied from DAG implementation. Verify logic and document how this 1741 // actually works. 1742 bool AMDGPULegalizerInfo::legalizeFPTOI( 1743 MachineInstr &MI, MachineRegisterInfo &MRI, 1744 MachineIRBuilder &B, bool Signed) const { 1745 B.setInstr(MI); 1746 1747 Register Dst = MI.getOperand(0).getReg(); 1748 Register Src = MI.getOperand(1).getReg(); 1749 1750 const LLT S64 = LLT::scalar(64); 1751 const LLT S32 = LLT::scalar(32); 1752 1753 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1754 1755 unsigned Flags = MI.getFlags(); 1756 1757 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1758 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1759 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1760 1761 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1762 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1763 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1764 1765 auto Hi = Signed ? 
1766 B.buildFPTOSI(S32, FloorMul) : 1767 B.buildFPTOUI(S32, FloorMul); 1768 auto Lo = B.buildFPTOUI(S32, Fma); 1769 1770 B.buildMerge(Dst, { Lo, Hi }); 1771 MI.eraseFromParent(); 1772 1773 return true; 1774 } 1775 1776 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 1777 MachineInstr &MI, MachineRegisterInfo &MRI, 1778 MachineIRBuilder &B) const { 1779 MachineFunction &MF = B.getMF(); 1780 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1781 1782 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1783 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1784 1785 // With ieee_mode disabled, the instructions have the correct behavior 1786 // already for G_FMINNUM/G_FMAXNUM 1787 if (!MFI->getMode().IEEE) 1788 return !IsIEEEOp; 1789 1790 if (IsIEEEOp) 1791 return true; 1792 1793 MachineIRBuilder HelperBuilder(MI); 1794 GISelObserverWrapper DummyObserver; 1795 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1796 HelperBuilder.setInstr(MI); 1797 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1798 } 1799 1800 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1801 MachineInstr &MI, MachineRegisterInfo &MRI, 1802 MachineIRBuilder &B) const { 1803 // TODO: Should move some of this into LegalizerHelper. 1804 1805 // TODO: Promote dynamic indexing of s16 to s32 1806 1807 // FIXME: Artifact combiner probably should have replaced the truncated 1808 // constant before this, so we shouldn't need 1809 // getConstantVRegValWithLookThrough. 1810 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1811 MI.getOperand(2).getReg(), MRI); 1812 if (!IdxVal) // Dynamic case will be selected to register indexing. 1813 return true; 1814 1815 Register Dst = MI.getOperand(0).getReg(); 1816 Register Vec = MI.getOperand(1).getReg(); 1817 1818 LLT VecTy = MRI.getType(Vec); 1819 LLT EltTy = VecTy.getElementType(); 1820 assert(EltTy == MRI.getType(Dst)); 1821 1822 B.setInstr(MI); 1823 1824 if (IdxVal->Value < VecTy.getNumElements()) 1825 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1826 else 1827 B.buildUndef(Dst); 1828 1829 MI.eraseFromParent(); 1830 return true; 1831 } 1832 1833 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1834 MachineInstr &MI, MachineRegisterInfo &MRI, 1835 MachineIRBuilder &B) const { 1836 // TODO: Should move some of this into LegalizerHelper. 1837 1838 // TODO: Promote dynamic indexing of s16 to s32 1839 1840 // FIXME: Artifact combiner probably should have replaced the truncated 1841 // constant before this, so we shouldn't need 1842 // getConstantVRegValWithLookThrough. 1843 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1844 MI.getOperand(3).getReg(), MRI); 1845 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1846 return true; 1847 1848 Register Dst = MI.getOperand(0).getReg(); 1849 Register Vec = MI.getOperand(1).getReg(); 1850 Register Ins = MI.getOperand(2).getReg(); 1851 1852 LLT VecTy = MRI.getType(Vec); 1853 LLT EltTy = VecTy.getElementType(); 1854 assert(EltTy == MRI.getType(Ins)); 1855 1856 B.setInstr(MI); 1857 1858 if (IdxVal->Value < VecTy.getNumElements()) 1859 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1860 else 1861 B.buildUndef(Dst); 1862 1863 MI.eraseFromParent(); 1864 return true; 1865 } 1866 1867 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1868 MachineInstr &MI, MachineRegisterInfo &MRI, 1869 MachineIRBuilder &B) const { 1870 const LLT V2S16 = LLT::vector(2, 16); 1871 1872 Register Dst = MI.getOperand(0).getReg(); 1873 Register Src0 = MI.getOperand(1).getReg(); 1874 LLT DstTy = MRI.getType(Dst); 1875 LLT SrcTy = MRI.getType(Src0); 1876 1877 if (SrcTy == V2S16 && DstTy == V2S16 && 1878 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1879 return true; 1880 1881 MachineIRBuilder HelperBuilder(MI); 1882 GISelObserverWrapper DummyObserver; 1883 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1884 HelperBuilder.setInstr(MI); 1885 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1886 } 1887 1888 bool AMDGPULegalizerInfo::legalizeSinCos( 1889 MachineInstr &MI, MachineRegisterInfo &MRI, 1890 MachineIRBuilder &B) const { 1891 B.setInstr(MI); 1892 1893 Register DstReg = MI.getOperand(0).getReg(); 1894 Register SrcReg = MI.getOperand(1).getReg(); 1895 LLT Ty = MRI.getType(DstReg); 1896 unsigned Flags = MI.getFlags(); 1897 1898 Register TrigVal; 1899 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1900 if (ST.hasTrigReducedRange()) { 1901 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1902 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1903 .addUse(MulVal.getReg(0)) 1904 .setMIFlags(Flags).getReg(0); 1905 } else 1906 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1907 1908 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1909 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1910 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1911 .addUse(TrigVal) 1912 .setMIFlags(Flags); 1913 MI.eraseFromParent(); 1914 return true; 1915 } 1916 1917 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1918 Register DstReg, LLT PtrTy, 1919 MachineIRBuilder &B, const GlobalValue *GV, 1920 unsigned Offset, unsigned GAFlags) const { 1921 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1922 // to the following code sequence: 1923 // 1924 // For constant address space: 1925 // s_getpc_b64 s[0:1] 1926 // s_add_u32 s0, s0, $symbol 1927 // s_addc_u32 s1, s1, 0 1928 // 1929 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1930 // a fixup or relocation is emitted to replace $symbol with a literal 1931 // constant, which is a pc-relative offset from the encoding of the $symbol 1932 // operand to the global variable. 
1933 // 1934 // For global address space: 1935 // s_getpc_b64 s[0:1] 1936 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1937 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1938 // 1939 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1940 // fixups or relocations are emitted to replace $symbol@*@lo and 1941 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1942 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1943 // operand to the global variable. 1944 // 1945 // What we want here is an offset from the value returned by s_getpc 1946 // (which is the address of the s_add_u32 instruction) to the global 1947 // variable, but since the encoding of $symbol starts 4 bytes after the start 1948 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1949 // small. This requires us to add 4 to the global variable offset in order to 1950 // compute the correct address. 1951 1952 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1953 1954 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1955 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1956 1957 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1958 .addDef(PCReg); 1959 1960 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1961 if (GAFlags == SIInstrInfo::MO_NONE) 1962 MIB.addImm(0); 1963 else 1964 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1965 1966 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1967 1968 if (PtrTy.getSizeInBits() == 32) 1969 B.buildExtract(DstReg, PCReg, 0); 1970 return true; 1971 } 1972 1973 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1974 MachineInstr &MI, MachineRegisterInfo &MRI, 1975 MachineIRBuilder &B) const { 1976 Register DstReg = MI.getOperand(0).getReg(); 1977 LLT Ty = MRI.getType(DstReg); 1978 unsigned AS = Ty.getAddressSpace(); 1979 1980 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1981 MachineFunction &MF = B.getMF(); 1982 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1983 B.setInstr(MI); 1984 1985 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1986 if (!MFI->isEntryFunction()) { 1987 const Function &Fn = MF.getFunction(); 1988 DiagnosticInfoUnsupported BadLDSDecl( 1989 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 1990 DS_Warning); 1991 Fn.getContext().diagnose(BadLDSDecl); 1992 1993 // We currently don't have a way to correctly allocate LDS objects that 1994 // aren't directly associated with a kernel. We do force inlining of 1995 // functions that use local objects. However, if these dead functions are 1996 // not eliminated, we don't want a compile time error. Just emit a warning 1997 // and a trap, since there should be no callable path here. 1998 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 1999 B.buildUndef(DstReg); 2000 MI.eraseFromParent(); 2001 return true; 2002 } 2003 2004 // TODO: We could emit code to handle the initialization somewhere. 
2005 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2006 const SITargetLowering *TLI = ST.getTargetLowering(); 2007 if (!TLI->shouldUseLDSConstAddress(GV)) { 2008 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2009 return true; // Leave in place; 2010 } 2011 2012 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 2013 MI.eraseFromParent(); 2014 return true; 2015 } 2016 2017 const Function &Fn = MF.getFunction(); 2018 DiagnosticInfoUnsupported BadInit( 2019 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2020 Fn.getContext().diagnose(BadInit); 2021 return true; 2022 } 2023 2024 const SITargetLowering *TLI = ST.getTargetLowering(); 2025 2026 if (TLI->shouldEmitFixup(GV)) { 2027 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2028 MI.eraseFromParent(); 2029 return true; 2030 } 2031 2032 if (TLI->shouldEmitPCReloc(GV)) { 2033 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2034 MI.eraseFromParent(); 2035 return true; 2036 } 2037 2038 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2039 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2040 2041 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2042 MachinePointerInfo::getGOT(MF), 2043 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2044 MachineMemOperand::MOInvariant, 2045 8 /*Size*/, Align(8)); 2046 2047 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2048 2049 if (Ty.getSizeInBits() == 32) { 2050 // Truncate if this is a 32-bit constant adrdess. 2051 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2052 B.buildExtract(DstReg, Load, 0); 2053 } else 2054 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2055 2056 MI.eraseFromParent(); 2057 return true; 2058 } 2059 2060 bool AMDGPULegalizerInfo::legalizeLoad( 2061 MachineInstr &MI, MachineRegisterInfo &MRI, 2062 MachineIRBuilder &B, GISelChangeObserver &Observer) const { 2063 B.setInstr(MI); 2064 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2065 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 2066 Observer.changingInstr(MI); 2067 MI.getOperand(1).setReg(Cast.getReg(0)); 2068 Observer.changedInstr(MI); 2069 return true; 2070 } 2071 2072 bool AMDGPULegalizerInfo::legalizeFMad( 2073 MachineInstr &MI, MachineRegisterInfo &MRI, 2074 MachineIRBuilder &B) const { 2075 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2076 assert(Ty.isScalar()); 2077 2078 MachineFunction &MF = B.getMF(); 2079 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2080 2081 // TODO: Always legal with future ftz flag. 2082 // FIXME: Do we need just output? 
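  // V_MAD_F32/V_MAC_F32 (and the f16 forms) flush denormal results, so G_FMAD
  // is only kept as-is when the corresponding denormal mode is not fully
  // enabled; otherwise fall through and expand it with
  // LegalizerHelper::lowerFMad.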
2083 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2084 return true; 2085 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2086 return true; 2087 2088 MachineIRBuilder HelperBuilder(MI); 2089 GISelObserverWrapper DummyObserver; 2090 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2091 HelperBuilder.setInstr(MI); 2092 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2093 } 2094 2095 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2096 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2097 Register DstReg = MI.getOperand(0).getReg(); 2098 Register PtrReg = MI.getOperand(1).getReg(); 2099 Register CmpVal = MI.getOperand(2).getReg(); 2100 Register NewVal = MI.getOperand(3).getReg(); 2101 2102 assert(SITargetLowering::isFlatGlobalAddrSpace( 2103 MRI.getType(PtrReg).getAddressSpace()) && 2104 "this should not have been custom lowered"); 2105 2106 LLT ValTy = MRI.getType(CmpVal); 2107 LLT VecTy = LLT::vector(2, ValTy); 2108 2109 B.setInstr(MI); 2110 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2111 2112 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2113 .addDef(DstReg) 2114 .addUse(PtrReg) 2115 .addUse(PackedVal) 2116 .setMemRefs(MI.memoperands()); 2117 2118 MI.eraseFromParent(); 2119 return true; 2120 } 2121 2122 bool AMDGPULegalizerInfo::legalizeFlog( 2123 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2124 Register Dst = MI.getOperand(0).getReg(); 2125 Register Src = MI.getOperand(1).getReg(); 2126 LLT Ty = B.getMRI()->getType(Dst); 2127 unsigned Flags = MI.getFlags(); 2128 B.setInstr(MI); 2129 2130 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2131 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2132 2133 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2134 MI.eraseFromParent(); 2135 return true; 2136 } 2137 2138 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2139 MachineIRBuilder &B) const { 2140 Register Dst = MI.getOperand(0).getReg(); 2141 Register Src = MI.getOperand(1).getReg(); 2142 unsigned Flags = MI.getFlags(); 2143 LLT Ty = B.getMRI()->getType(Dst); 2144 B.setInstr(MI); 2145 2146 auto K = B.buildFConstant(Ty, numbers::log2e); 2147 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2148 B.buildFExp2(Dst, Mul, Flags); 2149 MI.eraseFromParent(); 2150 return true; 2151 } 2152 2153 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2154 MachineIRBuilder &B) const { 2155 Register Dst = MI.getOperand(0).getReg(); 2156 Register Src0 = MI.getOperand(1).getReg(); 2157 Register Src1 = MI.getOperand(2).getReg(); 2158 unsigned Flags = MI.getFlags(); 2159 LLT Ty = B.getMRI()->getType(Dst); 2160 B.setInstr(MI); 2161 const LLT S16 = LLT::scalar(16); 2162 const LLT S32 = LLT::scalar(32); 2163 2164 if (Ty == S32) { 2165 auto Log = B.buildFLog2(S32, Src0, Flags); 2166 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2167 .addUse(Log.getReg(0)) 2168 .addUse(Src1) 2169 .setMIFlags(Flags); 2170 B.buildFExp2(Dst, Mul, Flags); 2171 } else if (Ty == S16) { 2172 // There's no f16 fmul_legacy, so we need to convert for it. 
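    // The lowering below still follows pow(x, y) == exp2(y * log2(x)): take
    // log2 in f16, extend both multiplicands to f32 for the legacy multiply,
    // then truncate the product back to f16 before feeding it to exp2.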
2173 auto Log = B.buildFLog2(S16, Src0, Flags); 2174 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2175 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2176 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2177 .addUse(Ext0.getReg(0)) 2178 .addUse(Ext1.getReg(0)) 2179 .setMIFlags(Flags); 2180 2181 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2182 } else 2183 return false; 2184 2185 MI.eraseFromParent(); 2186 return true; 2187 } 2188 2189 // Find a source register, ignoring any possible source modifiers. 2190 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2191 Register ModSrc = OrigSrc; 2192 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2193 ModSrc = SrcFNeg->getOperand(1).getReg(); 2194 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2195 ModSrc = SrcFAbs->getOperand(1).getReg(); 2196 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2197 ModSrc = SrcFAbs->getOperand(1).getReg(); 2198 return ModSrc; 2199 } 2200 2201 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2202 MachineRegisterInfo &MRI, 2203 MachineIRBuilder &B) const { 2204 B.setInstr(MI); 2205 2206 const LLT S1 = LLT::scalar(1); 2207 const LLT S64 = LLT::scalar(64); 2208 Register Dst = MI.getOperand(0).getReg(); 2209 Register OrigSrc = MI.getOperand(1).getReg(); 2210 unsigned Flags = MI.getFlags(); 2211 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2212 "this should not have been custom lowered"); 2213 2214 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2215 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2216 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2217 // V_FRACT bug is: 2218 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2219 // 2220 // Convert floor(x) to (x - fract(x)) 2221 2222 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2223 .addUse(OrigSrc) 2224 .setMIFlags(Flags); 2225 2226 // Give source modifier matching some assistance before obscuring a foldable 2227 // pattern. 2228 2229 // TODO: We can avoid the neg on the fract? The input sign to fract 2230 // shouldn't matter? 2231 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2232 2233 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2234 2235 Register Min = MRI.createGenericVirtualRegister(S64); 2236 2237 // We don't need to concern ourselves with the snan handling difference, so 2238 // use the one which will directly select. 2239 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2240 if (MFI->getMode().IEEE) 2241 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2242 else 2243 B.buildFMinNum(Min, Fract, Const, Flags); 2244 2245 Register CorrectedFract = Min; 2246 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2247 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2248 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2249 } 2250 2251 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2252 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2253 2254 MI.eraseFromParent(); 2255 return true; 2256 } 2257 2258 // Turn an illegal packed v2s16 build vector into bit operations. 2259 // TODO: This should probably be a bitcast action in LegalizerHelper. 
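// Illustrative MIR sketch (register names are made up):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %lo:_(s16), %hi:_(s16)
// is rewritten as
//   %m:_(s32) = G_MERGE_VALUES %lo:_(s16), %hi:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)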
2260 bool AMDGPULegalizerInfo::legalizeBuildVector( 2261 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2262 Register Dst = MI.getOperand(0).getReg(); 2263 const LLT S32 = LLT::scalar(32); 2264 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2265 2266 Register Src0 = MI.getOperand(1).getReg(); 2267 Register Src1 = MI.getOperand(2).getReg(); 2268 assert(MRI.getType(Src0) == LLT::scalar(16)); 2269 2270 B.setInstr(MI); 2271 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2272 B.buildBitcast(Dst, Merge); 2273 2274 MI.eraseFromParent(); 2275 return true; 2276 } 2277 2278 // Return the use branch instruction, otherwise null if the usage is invalid. 2279 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2280 MachineRegisterInfo &MRI, 2281 MachineInstr *&Br) { 2282 Register CondDef = MI.getOperand(0).getReg(); 2283 if (!MRI.hasOneNonDBGUse(CondDef)) 2284 return nullptr; 2285 2286 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2287 if (UseMI.getParent() != MI.getParent() || 2288 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2289 return nullptr; 2290 2291 // Make sure the cond br is followed by a G_BR 2292 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2293 if (Next != MI.getParent()->end()) { 2294 if (Next->getOpcode() != AMDGPU::G_BR) 2295 return nullptr; 2296 Br = &*Next; 2297 } 2298 2299 return &UseMI; 2300 } 2301 2302 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2303 MachineRegisterInfo &MRI, 2304 Register LiveIn, 2305 Register PhyReg) const { 2306 assert(PhyReg.isPhysical() && "Physical register expected"); 2307 2308 // Insert the live-in copy, if required, by defining destination virtual 2309 // register. 2310 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 2311 if (!MRI.getVRegDef(LiveIn)) { 2312 // FIXME: Should have scoped insert pt 2313 MachineBasicBlock &OrigInsBB = B.getMBB(); 2314 auto OrigInsPt = B.getInsertPt(); 2315 2316 MachineBasicBlock &EntryMBB = B.getMF().front(); 2317 EntryMBB.addLiveIn(PhyReg); 2318 B.setInsertPt(EntryMBB, EntryMBB.begin()); 2319 B.buildCopy(LiveIn, PhyReg); 2320 2321 B.setInsertPt(OrigInsBB, OrigInsPt); 2322 } 2323 2324 return LiveIn; 2325 } 2326 2327 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, 2328 MachineRegisterInfo &MRI, 2329 Register PhyReg, LLT Ty, 2330 bool InsertLiveInCopy) const { 2331 assert(PhyReg.isPhysical() && "Physical register expected"); 2332 2333 // Get or create virtual live-in regester 2334 Register LiveIn = MRI.getLiveInVirtReg(PhyReg); 2335 if (!LiveIn) { 2336 LiveIn = MRI.createGenericVirtualRegister(Ty); 2337 MRI.addLiveIn(PhyReg, LiveIn); 2338 } 2339 2340 // When the actual true copy required is from virtual register to physical 2341 // register (to be inserted later), live-in copy insertion from physical 2342 // to register virtual register is not required 2343 if (!InsertLiveInCopy) 2344 return LiveIn; 2345 2346 return insertLiveInCopy(B, MRI, LiveIn, PhyReg); 2347 } 2348 2349 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor( 2350 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2351 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2352 const ArgDescriptor *Arg; 2353 const TargetRegisterClass *RC; 2354 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 2355 if (!Arg) { 2356 LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2357 return nullptr; 2358 } 2359 return Arg; 2360 } 2361 2362 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, 
MachineIRBuilder &B, 2363 const ArgDescriptor *Arg) const { 2364 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2365 return false; // TODO: Handle these 2366 2367 Register SrcReg = Arg->getRegister(); 2368 assert(SrcReg.isPhysical() && "Physical register expected"); 2369 assert(DstReg.isVirtual() && "Virtual register expected"); 2370 2371 MachineRegisterInfo &MRI = *B.getMRI(); 2372 2373 LLT Ty = MRI.getType(DstReg); 2374 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty); 2375 2376 if (Arg->isMasked()) { 2377 // TODO: Should we try to emit this once in the entry block? 2378 const LLT S32 = LLT::scalar(32); 2379 const unsigned Mask = Arg->getMask(); 2380 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2381 2382 Register AndMaskSrc = LiveIn; 2383 2384 if (Shift != 0) { 2385 auto ShiftAmt = B.buildConstant(S32, Shift); 2386 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2387 } 2388 2389 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2390 } else { 2391 B.buildCopy(DstReg, LiveIn); 2392 } 2393 2394 return true; 2395 } 2396 2397 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2398 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2399 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2400 B.setInstr(MI); 2401 2402 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2403 if (!Arg) 2404 return false; 2405 2406 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2407 return false; 2408 2409 MI.eraseFromParent(); 2410 return true; 2411 } 2412 2413 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2414 MachineRegisterInfo &MRI, 2415 MachineIRBuilder &B) const { 2416 B.setInstr(MI); 2417 Register Dst = MI.getOperand(0).getReg(); 2418 LLT DstTy = MRI.getType(Dst); 2419 LLT S16 = LLT::scalar(16); 2420 LLT S32 = LLT::scalar(32); 2421 LLT S64 = LLT::scalar(64); 2422 2423 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2424 return true; 2425 2426 if (DstTy == S16) 2427 return legalizeFDIV16(MI, MRI, B); 2428 if (DstTy == S32) 2429 return legalizeFDIV32(MI, MRI, B); 2430 if (DstTy == S64) 2431 return legalizeFDIV64(MI, MRI, B); 2432 2433 return false; 2434 } 2435 2436 static Register buildDivRCP(MachineIRBuilder &B, Register Src) { 2437 const LLT S32 = LLT::scalar(32); 2438 2439 auto Cvt0 = B.buildUITOFP(S32, Src); 2440 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0}); 2441 auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000)); 2442 auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1); 2443 return B.buildFPTOUI(S32, Mul).getReg(0); 2444 } 2445 2446 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2447 Register DstReg, 2448 Register Num, 2449 Register Den, 2450 bool IsRem) const { 2451 const LLT S1 = LLT::scalar(1); 2452 const LLT S32 = LLT::scalar(32); 2453 2454 // RCP = URECIP(Den) = 2^32 / Den + e 2455 // e is rounding error. 2456 auto RCP = buildDivRCP(B, Den); 2457 2458 // RCP_LO = mul(RCP, Den) 2459 auto RCP_LO = B.buildMul(S32, RCP, Den); 2460 2461 // RCP_HI = mulhu (RCP, Den) */ 2462 auto RCP_HI = B.buildUMulH(S32, RCP, Den); 2463 2464 // NEG_RCP_LO = -RCP_LO 2465 auto Zero = B.buildConstant(S32, 0); 2466 auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO); 2467 2468 // ABS_RCP_LO = (RCP_HI == 0 ? 
NEG_RCP_LO : RCP_LO) 2469 auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero); 2470 auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO); 2471 2472 // Calculate the rounding error from the URECIP instruction 2473 // E = mulhu(ABS_RCP_LO, RCP) 2474 auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP); 2475 2476 // RCP_A_E = RCP + E 2477 auto RCP_A_E = B.buildAdd(S32, RCP, E); 2478 2479 // RCP_S_E = RCP - E 2480 auto RCP_S_E = B.buildSub(S32, RCP, E); 2481 2482 // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) 2483 auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E); 2484 2485 // Quotient = mulhu(Tmp0, Num)stmp 2486 auto Quotient = B.buildUMulH(S32, Tmp0, Num); 2487 2488 // Num_S_Remainder = Quotient * Den 2489 auto Num_S_Remainder = B.buildMul(S32, Quotient, Den); 2490 2491 // Remainder = Num - Num_S_Remainder 2492 auto Remainder = B.buildSub(S32, Num, Num_S_Remainder); 2493 2494 // Remainder_GE_Den = Remainder >= Den 2495 auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den); 2496 2497 // Remainder_GE_Zero = Num >= Num_S_Remainder; 2498 auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1, 2499 Num, Num_S_Remainder); 2500 2501 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 2502 auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero); 2503 2504 // Calculate Division result: 2505 2506 // Quotient_A_One = Quotient + 1 2507 auto One = B.buildConstant(S32, 1); 2508 auto Quotient_A_One = B.buildAdd(S32, Quotient, One); 2509 2510 // Quotient_S_One = Quotient - 1 2511 auto Quotient_S_One = B.buildSub(S32, Quotient, One); 2512 2513 // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient) 2514 auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One); 2515 2516 // Div = (Remainder_GE_Zero ? Div : Quotient_S_One) 2517 if (IsRem) { 2518 Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One); 2519 2520 // Calculate Rem result: 2521 auto Remainder_S_Den = B.buildSub(S32, Remainder, Den); 2522 2523 // Remainder_A_Den = Remainder + Den 2524 auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den); 2525 2526 // Rem = (Tmp1 ? Remainder_S_Den : Remainder) 2527 auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder); 2528 2529 // Rem = (Remainder_GE_Zero ? 
Rem : Remainder_A_Den) 2530 B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den); 2531 } else { 2532 B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One); 2533 } 2534 } 2535 2536 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2537 MachineRegisterInfo &MRI, 2538 MachineIRBuilder &B) const { 2539 B.setInstr(MI); 2540 const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM; 2541 Register DstReg = MI.getOperand(0).getReg(); 2542 Register Num = MI.getOperand(1).getReg(); 2543 Register Den = MI.getOperand(2).getReg(); 2544 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem); 2545 MI.eraseFromParent(); 2546 return true; 2547 } 2548 2549 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2550 // 2551 // Return lo, hi of result 2552 // 2553 // %cvt.lo = G_UITOFP Val.lo 2554 // %cvt.hi = G_UITOFP Val.hi 2555 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2556 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2557 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2558 // %mul2 = G_FMUL %mul1, 2**(-32) 2559 // %trunc = G_INTRINSIC_TRUNC %mul2 2560 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2561 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2562 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2563 Register Val) { 2564 const LLT S32 = LLT::scalar(32); 2565 auto Unmerge = B.buildUnmerge(S32, Val); 2566 2567 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2568 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2569 2570 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2571 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2572 2573 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2574 auto Mul1 = 2575 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2576 2577 // 2**(-32) 2578 auto Mul2 = 2579 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2580 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2581 2582 // -(2**32) 2583 auto Mad2 = B.buildFMAD(S32, Trunc, 2584 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2585 2586 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2587 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2588 2589 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2590 } 2591 2592 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI, 2593 MachineRegisterInfo &MRI, 2594 MachineIRBuilder &B) const { 2595 B.setInstr(MI); 2596 2597 const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV; 2598 const LLT S32 = LLT::scalar(32); 2599 const LLT S64 = LLT::scalar(64); 2600 const LLT S1 = LLT::scalar(1); 2601 Register Numer = MI.getOperand(1).getReg(); 2602 Register Denom = MI.getOperand(2).getReg(); 2603 Register RcpLo, RcpHi; 2604 2605 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2606 2607 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2608 2609 auto Zero64 = B.buildConstant(S64, 0); 2610 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2611 2612 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2613 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2614 2615 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2616 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2617 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2618 2619 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2620 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2621 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2622 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2623 2624 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2625 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2626 auto UnmergeMulHi2 = 
B.buildUnmerge(S32, MulHi2); 2627 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2628 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2629 2630 auto Zero32 = B.buildConstant(S32, 0); 2631 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2632 auto Add2_HiC = 2633 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2634 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2635 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2636 2637 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2638 Register NumerLo = UnmergeNumer.getReg(0); 2639 Register NumerHi = UnmergeNumer.getReg(1); 2640 2641 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2642 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2643 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2644 Register Mul3_Lo = UnmergeMul3.getReg(0); 2645 Register Mul3_Hi = UnmergeMul3.getReg(1); 2646 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2647 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2648 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2649 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2650 2651 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2652 Register DenomLo = UnmergeDenom.getReg(0); 2653 Register DenomHi = UnmergeDenom.getReg(1); 2654 2655 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2656 auto C1 = B.buildSExt(S32, CmpHi); 2657 2658 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2659 auto C2 = B.buildSExt(S32, CmpLo); 2660 2661 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2662 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2663 2664 // TODO: Here and below portions of the code can be enclosed into if/endif. 2665 // Currently control flow is unconditional and we have 4 selects after 2666 // potential endif to substitute PHIs. 2667 2668 // if C3 != 0 ... 
2669   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2670   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2671   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2672   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2673 
2674   auto One64 = B.buildConstant(S64, 1);
2675   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2676 
2677   auto C4 =
2678       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2679   auto C5 =
2680       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2681   auto C6 = B.buildSelect(
2682       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2683 
2684   // if (C6 != 0)
2685   auto Add4 = B.buildAdd(S64, Add3, One64);
2686   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2687 
2688   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2689   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2690   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2691 
2692   // endif C6
2693   // endif C3
2694 
2695   if (IsDiv) {
2696     auto Sel1 = B.buildSelect(
2697         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2698     B.buildSelect(MI.getOperand(0),
2699                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2700   } else {
2701     auto Sel2 = B.buildSelect(
2702         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2703     B.buildSelect(MI.getOperand(0),
2704                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2705   }
2706 
2707   MI.eraseFromParent();
2708   return true;
2709 }
2710 
2711 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2712                                             MachineRegisterInfo &MRI,
2713                                             MachineIRBuilder &B) const {
2714   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2715   if (Ty == LLT::scalar(32))
2716     return legalizeUDIV_UREM32(MI, MRI, B);
2717   if (Ty == LLT::scalar(64))
2718     return legalizeUDIV_UREM64(MI, MRI, B);
2719   return false;
2720 }
2721 
2722 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2723                                               MachineRegisterInfo &MRI,
2724                                               MachineIRBuilder &B) const {
2725   B.setInstr(MI);
2726   const LLT S32 = LLT::scalar(32);
2727 
2728   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2729   Register DstReg = MI.getOperand(0).getReg();
2730   Register LHS = MI.getOperand(1).getReg();
2731   Register RHS = MI.getOperand(2).getReg();
2732 
2733   auto ThirtyOne = B.buildConstant(S32, 31);
2734   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2735   auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2736 
2737   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2738   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2739 
2740   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2741   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2742 
2743   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2744   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2745 
2746   if (IsRem) {
2747     auto RSign = LHSign; // Remainder sign is the same as LHS
2748     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2749     B.buildSub(DstReg, UDivRem, RSign);
2750   } else {
2751     auto DSign = B.buildXor(S32, LHSign, RHSign);
2752     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2753     B.buildSub(DstReg, UDivRem, DSign);
2754   }
2755 
2756   MI.eraseFromParent();
2757   return true;
2758 }
2759 
2760 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2761                                             MachineRegisterInfo &MRI,
2762                                             MachineIRBuilder &B) const {
2763   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2764     return legalizeSDIV_SREM32(MI, MRI, B);
2765   return false;
2766 }
2767 
2768 bool
AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2769 MachineRegisterInfo &MRI, 2770 MachineIRBuilder &B) const { 2771 Register Res = MI.getOperand(0).getReg(); 2772 Register LHS = MI.getOperand(1).getReg(); 2773 Register RHS = MI.getOperand(2).getReg(); 2774 2775 uint16_t Flags = MI.getFlags(); 2776 2777 LLT ResTy = MRI.getType(Res); 2778 LLT S32 = LLT::scalar(32); 2779 LLT S64 = LLT::scalar(64); 2780 2781 const MachineFunction &MF = B.getMF(); 2782 bool Unsafe = 2783 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2784 2785 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2786 return false; 2787 2788 if (!Unsafe && ResTy == S32 && 2789 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2790 return false; 2791 2792 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2793 // 1 / x -> RCP(x) 2794 if (CLHS->isExactlyValue(1.0)) { 2795 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2796 .addUse(RHS) 2797 .setMIFlags(Flags); 2798 2799 MI.eraseFromParent(); 2800 return true; 2801 } 2802 2803 // -1 / x -> RCP( FNEG(x) ) 2804 if (CLHS->isExactlyValue(-1.0)) { 2805 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2806 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2807 .addUse(FNeg.getReg(0)) 2808 .setMIFlags(Flags); 2809 2810 MI.eraseFromParent(); 2811 return true; 2812 } 2813 } 2814 2815 // x / y -> x * (1.0 / y) 2816 if (Unsafe) { 2817 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2818 .addUse(RHS) 2819 .setMIFlags(Flags); 2820 B.buildFMul(Res, LHS, RCP, Flags); 2821 2822 MI.eraseFromParent(); 2823 return true; 2824 } 2825 2826 return false; 2827 } 2828 2829 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2830 MachineRegisterInfo &MRI, 2831 MachineIRBuilder &B) const { 2832 B.setInstr(MI); 2833 Register Res = MI.getOperand(0).getReg(); 2834 Register LHS = MI.getOperand(1).getReg(); 2835 Register RHS = MI.getOperand(2).getReg(); 2836 2837 uint16_t Flags = MI.getFlags(); 2838 2839 LLT S16 = LLT::scalar(16); 2840 LLT S32 = LLT::scalar(32); 2841 2842 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2843 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2844 2845 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2846 .addUse(RHSExt.getReg(0)) 2847 .setMIFlags(Flags); 2848 2849 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2850 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2851 2852 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2853 .addUse(RDst.getReg(0)) 2854 .addUse(RHS) 2855 .addUse(LHS) 2856 .setMIFlags(Flags); 2857 2858 MI.eraseFromParent(); 2859 return true; 2860 } 2861 2862 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2863 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2864 static void toggleSPDenormMode(bool Enable, 2865 MachineIRBuilder &B, 2866 const GCNSubtarget &ST, 2867 AMDGPU::SIModeRegisterDefaults Mode) { 2868 // Set SP denorm mode to this value. 2869 unsigned SPDenormMode = 2870 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2871 2872 if (ST.hasDenormModeInst()) { 2873 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2874 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2875 2876 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2877 B.buildInstr(AMDGPU::S_DENORM_MODE) 2878 .addImm(NewDenormModeValue); 2879 2880 } else { 2881 // Select FP32 bit field in mode register. 
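    // The encoded value corresponds to hwreg(HW_REG_MODE, 4, 2): offset 4,
    // width 2 (WIDTH_M1 = 1), i.e. the two FP32 denorm bits MODE[5:4],
    // leaving the FP64/FP16 denorm field untouched.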
2882 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2883 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2884 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2885 2886 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2887 .addImm(SPDenormMode) 2888 .addImm(SPDenormModeBitField); 2889 } 2890 } 2891 2892 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2893 MachineRegisterInfo &MRI, 2894 MachineIRBuilder &B) const { 2895 B.setInstr(MI); 2896 Register Res = MI.getOperand(0).getReg(); 2897 Register LHS = MI.getOperand(1).getReg(); 2898 Register RHS = MI.getOperand(2).getReg(); 2899 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2900 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2901 2902 uint16_t Flags = MI.getFlags(); 2903 2904 LLT S32 = LLT::scalar(32); 2905 LLT S1 = LLT::scalar(1); 2906 2907 auto One = B.buildFConstant(S32, 1.0f); 2908 2909 auto DenominatorScaled = 2910 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2911 .addUse(LHS) 2912 .addUse(RHS) 2913 .addImm(0) 2914 .setMIFlags(Flags); 2915 auto NumeratorScaled = 2916 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2917 .addUse(LHS) 2918 .addUse(RHS) 2919 .addImm(1) 2920 .setMIFlags(Flags); 2921 2922 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2923 .addUse(DenominatorScaled.getReg(0)) 2924 .setMIFlags(Flags); 2925 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2926 2927 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2928 // aren't modeled as reading it. 2929 if (!Mode.allFP32Denormals()) 2930 toggleSPDenormMode(true, B, ST, Mode); 2931 2932 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2933 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2934 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2935 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2936 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2937 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2938 2939 if (!Mode.allFP32Denormals()) 2940 toggleSPDenormMode(false, B, ST, Mode); 2941 2942 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2943 .addUse(Fma4.getReg(0)) 2944 .addUse(Fma1.getReg(0)) 2945 .addUse(Fma3.getReg(0)) 2946 .addUse(NumeratorScaled.getReg(1)) 2947 .setMIFlags(Flags); 2948 2949 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2950 .addUse(Fmas.getReg(0)) 2951 .addUse(RHS) 2952 .addUse(LHS) 2953 .setMIFlags(Flags); 2954 2955 MI.eraseFromParent(); 2956 return true; 2957 } 2958 2959 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2960 MachineRegisterInfo &MRI, 2961 MachineIRBuilder &B) const { 2962 B.setInstr(MI); 2963 Register Res = MI.getOperand(0).getReg(); 2964 Register LHS = MI.getOperand(1).getReg(); 2965 Register RHS = MI.getOperand(2).getReg(); 2966 2967 uint16_t Flags = MI.getFlags(); 2968 2969 LLT S64 = LLT::scalar(64); 2970 LLT S1 = LLT::scalar(1); 2971 2972 auto One = B.buildFConstant(S64, 1.0); 2973 2974 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2975 .addUse(LHS) 2976 .addUse(RHS) 2977 .addImm(0) 2978 .setMIFlags(Flags); 2979 2980 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2981 2982 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2983 .addUse(DivScale0.getReg(0)) 2984 .setMIFlags(Flags); 2985 2986 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2987 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, 
Rcp, Flags); 2988 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 2989 2990 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2991 .addUse(LHS) 2992 .addUse(RHS) 2993 .addImm(1) 2994 .setMIFlags(Flags); 2995 2996 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 2997 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 2998 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 2999 3000 Register Scale; 3001 if (!ST.hasUsableDivScaleConditionOutput()) { 3002 // Workaround a hardware bug on SI where the condition output from div_scale 3003 // is not usable. 3004 3005 LLT S32 = LLT::scalar(32); 3006 3007 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3008 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3009 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3010 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3011 3012 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3013 Scale1Unmerge.getReg(1)); 3014 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3015 Scale0Unmerge.getReg(1)); 3016 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3017 } else { 3018 Scale = DivScale1.getReg(1); 3019 } 3020 3021 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3022 .addUse(Fma4.getReg(0)) 3023 .addUse(Fma3.getReg(0)) 3024 .addUse(Mul.getReg(0)) 3025 .addUse(Scale) 3026 .setMIFlags(Flags); 3027 3028 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3029 .addUse(Fmas.getReg(0)) 3030 .addUse(RHS) 3031 .addUse(LHS) 3032 .setMIFlags(Flags); 3033 3034 MI.eraseFromParent(); 3035 return true; 3036 } 3037 3038 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3039 MachineRegisterInfo &MRI, 3040 MachineIRBuilder &B) const { 3041 B.setInstr(MI); 3042 Register Res = MI.getOperand(0).getReg(); 3043 Register LHS = MI.getOperand(2).getReg(); 3044 Register RHS = MI.getOperand(3).getReg(); 3045 uint16_t Flags = MI.getFlags(); 3046 3047 LLT S32 = LLT::scalar(32); 3048 LLT S1 = LLT::scalar(1); 3049 3050 auto Abs = B.buildFAbs(S32, RHS, Flags); 3051 const APFloat C0Val(1.0f); 3052 3053 auto C0 = B.buildConstant(S32, 0x6f800000); 3054 auto C1 = B.buildConstant(S32, 0x2f800000); 3055 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3056 3057 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3058 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3059 3060 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3061 3062 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3063 .addUse(Mul0.getReg(0)) 3064 .setMIFlags(Flags); 3065 3066 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3067 3068 B.buildFMul(Res, Sel, Mul1, Flags); 3069 3070 MI.eraseFromParent(); 3071 return true; 3072 } 3073 3074 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3075 MachineRegisterInfo &MRI, 3076 MachineIRBuilder &B) const { 3077 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3078 if (!MFI->isEntryFunction()) { 3079 return legalizePreloadedArgIntrin(MI, MRI, B, 3080 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3081 } 3082 3083 B.setInstr(MI); 3084 3085 uint64_t Offset = 3086 ST.getTargetLowering()->getImplicitParameterOffset( 3087 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3088 Register DstReg = MI.getOperand(0).getReg(); 3089 LLT DstTy = MRI.getType(DstReg); 3090 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3091 3092 const ArgDescriptor *Arg; 3093 const TargetRegisterClass *RC; 3094 
std::tie(Arg, RC) 3095 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3096 if (!Arg) 3097 return false; 3098 3099 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3100 if (!loadInputValue(KernargPtrReg, B, Arg)) 3101 return false; 3102 3103 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3104 MI.eraseFromParent(); 3105 return true; 3106 } 3107 3108 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3109 MachineRegisterInfo &MRI, 3110 MachineIRBuilder &B, 3111 unsigned AddrSpace) const { 3112 B.setInstr(MI); 3113 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3114 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3115 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3116 MI.eraseFromParent(); 3117 return true; 3118 } 3119 3120 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3121 // offset (the offset that is included in bounds checking and swizzling, to be 3122 // split between the instruction's voffset and immoffset fields) and soffset 3123 // (the offset that is excluded from bounds checking and swizzling, to go in 3124 // the instruction's soffset field). This function takes the first kind of 3125 // offset and figures out how to split it between voffset and immoffset. 3126 std::tuple<Register, unsigned, unsigned> 3127 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3128 Register OrigOffset) const { 3129 const unsigned MaxImm = 4095; 3130 Register BaseReg; 3131 unsigned TotalConstOffset; 3132 MachineInstr *OffsetDef; 3133 const LLT S32 = LLT::scalar(32); 3134 3135 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3136 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3137 3138 unsigned ImmOffset = TotalConstOffset; 3139 3140 // If the immediate value is too big for the immoffset field, put the value 3141 // and -4096 into the immoffset field so that the value that is copied/added 3142 // for the voffset field is a multiple of 4096, and it stands more chance 3143 // of being CSEd with the copy/add for another similar load/store. 3144 // However, do not do that rounding down to a multiple of 4096 if that is a 3145 // negative number, as it appears to be illegal to have a negative offset 3146 // in the vgpr, even if adding the immediate offset makes it positive. 3147 unsigned Overflow = ImmOffset & ~MaxImm; 3148 ImmOffset -= Overflow; 3149 if ((int32_t)Overflow < 0) { 3150 Overflow += ImmOffset; 3151 ImmOffset = 0; 3152 } 3153 3154 if (Overflow != 0) { 3155 if (!BaseReg) { 3156 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3157 } else { 3158 auto OverflowVal = B.buildConstant(S32, Overflow); 3159 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3160 } 3161 } 3162 3163 if (!BaseReg) 3164 BaseReg = B.buildConstant(S32, 0).getReg(0); 3165 3166 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3167 } 3168 3169 /// Handle register layout difference for f16 images for some subtargets. 
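/// On subtargets with unpacked D16 VMEM, each 16-bit element is expected in
/// the low half of its own 32-bit register, so a <N x s16> source is unmerged
/// and any-extended element-wise into a <N x s32> vector before the store.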
3170 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3171 MachineRegisterInfo &MRI, 3172 Register Reg) const { 3173 if (!ST.hasUnpackedD16VMem()) 3174 return Reg; 3175 3176 const LLT S16 = LLT::scalar(16); 3177 const LLT S32 = LLT::scalar(32); 3178 LLT StoreVT = MRI.getType(Reg); 3179 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3180 3181 auto Unmerge = B.buildUnmerge(S16, Reg); 3182 3183 SmallVector<Register, 4> WideRegs; 3184 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3185 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3186 3187 int NumElts = StoreVT.getNumElements(); 3188 3189 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3190 } 3191 3192 Register AMDGPULegalizerInfo::fixStoreSourceType( 3193 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3194 MachineRegisterInfo *MRI = B.getMRI(); 3195 LLT Ty = MRI->getType(VData); 3196 3197 const LLT S16 = LLT::scalar(16); 3198 3199 // Fixup illegal register types for i8 stores. 3200 if (Ty == LLT::scalar(8) || Ty == S16) { 3201 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3202 return AnyExt; 3203 } 3204 3205 if (Ty.isVector()) { 3206 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3207 if (IsFormat) 3208 return handleD16VData(B, *MRI, VData); 3209 } 3210 } 3211 3212 return VData; 3213 } 3214 3215 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3216 MachineRegisterInfo &MRI, 3217 MachineIRBuilder &B, 3218 bool IsTyped, 3219 bool IsFormat) const { 3220 B.setInstr(MI); 3221 3222 Register VData = MI.getOperand(1).getReg(); 3223 LLT Ty = MRI.getType(VData); 3224 LLT EltTy = Ty.getScalarType(); 3225 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3226 const LLT S32 = LLT::scalar(32); 3227 3228 VData = fixStoreSourceType(B, VData, IsFormat); 3229 Register RSrc = MI.getOperand(2).getReg(); 3230 3231 MachineMemOperand *MMO = *MI.memoperands_begin(); 3232 const int MemSize = MMO->getSize(); 3233 3234 unsigned ImmOffset; 3235 unsigned TotalOffset; 3236 3237 // The typed intrinsics add an immediate after the registers. 3238 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3239 3240 // The struct intrinsic variants add one additional operand over raw. 3241 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3242 Register VIndex; 3243 int OpOffset = 0; 3244 if (HasVIndex) { 3245 VIndex = MI.getOperand(3).getReg(); 3246 OpOffset = 1; 3247 } 3248 3249 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3250 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3251 3252 unsigned Format = 0; 3253 if (IsTyped) { 3254 Format = MI.getOperand(5 + OpOffset).getImm(); 3255 ++OpOffset; 3256 } 3257 3258 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3259 3260 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3261 if (TotalOffset != 0) 3262 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3263 3264 unsigned Opc; 3265 if (IsTyped) { 3266 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3267 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3268 } else if (IsFormat) { 3269 Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3270 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3271 } else { 3272 switch (MemSize) { 3273 case 1: 3274 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3275 break; 3276 case 2: 3277 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3278 break; 3279 default: 3280 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3281 break; 3282 } 3283 } 3284 3285 if (!VIndex) 3286 VIndex = B.buildConstant(S32, 0).getReg(0); 3287 3288 auto MIB = B.buildInstr(Opc) 3289 .addUse(VData) // vdata 3290 .addUse(RSrc) // rsrc 3291 .addUse(VIndex) // vindex 3292 .addUse(VOffset) // voffset 3293 .addUse(SOffset) // soffset 3294 .addImm(ImmOffset); // offset(imm) 3295 3296 if (IsTyped) 3297 MIB.addImm(Format); 3298 3299 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3300 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3301 .addMemOperand(MMO); 3302 3303 MI.eraseFromParent(); 3304 return true; 3305 } 3306 3307 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3308 MachineRegisterInfo &MRI, 3309 MachineIRBuilder &B, 3310 bool IsFormat, 3311 bool IsTyped) const { 3312 B.setInstr(MI); 3313 3314 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3315 MachineMemOperand *MMO = *MI.memoperands_begin(); 3316 const int MemSize = MMO->getSize(); 3317 const LLT S32 = LLT::scalar(32); 3318 3319 Register Dst = MI.getOperand(0).getReg(); 3320 Register RSrc = MI.getOperand(2).getReg(); 3321 3322 // The typed intrinsics add an immediate after the registers. 3323 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3324 3325 // The struct intrinsic variants add one additional operand over raw. 3326 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3327 Register VIndex; 3328 int OpOffset = 0; 3329 if (HasVIndex) { 3330 VIndex = MI.getOperand(3).getReg(); 3331 OpOffset = 1; 3332 } 3333 3334 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3335 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3336 3337 unsigned Format = 0; 3338 if (IsTyped) { 3339 Format = MI.getOperand(5 + OpOffset).getImm(); 3340 ++OpOffset; 3341 } 3342 3343 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3344 unsigned ImmOffset; 3345 unsigned TotalOffset; 3346 3347 LLT Ty = MRI.getType(Dst); 3348 LLT EltTy = Ty.getScalarType(); 3349 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3350 const bool Unpacked = ST.hasUnpackedD16VMem(); 3351 3352 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3353 if (TotalOffset != 0) 3354 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3355 3356 unsigned Opc; 3357 3358 if (IsTyped) { 3359 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3360 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3361 } else if (IsFormat) { 3362 Opc = IsD16 ? 

bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  B.setInstr(MI);

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened for an extending load; truncate it back down to
    // the original type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to the original 16-bit vector result.
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  B.setInstr(MI);
  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
  B.buildInstr(Opc)
    .addDef(MI.getOperand(0).getReg())
    .addUse(MI.getOperand(2).getReg())
    .addUse(MI.getOperand(3).getReg())
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return true;
}

static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
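
// All of the buffer atomic intrinsics funnel into legalizeBufferAtomic below.
// The generated pseudo keeps the intrinsic's result as its single def and
// takes, in order: vdata (plus the comparison value for cmpswap), rsrc,
// vindex, voffset, soffset, the split immediate offset, the cache policy and
// an idxen flag. This note just mirrors the MIB construction in the function.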

bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  B.setInstr(MI);

  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Pack the image address operands of \p MI starting at \p AddrIdx into dword
/// sized registers with s16 typed elements, appending the packed <2 x s16>
/// values to \p PackedAddrs.
static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                        SmallVectorImpl<Register> &PackedAddrs,
                                        int AddrIdx, int DimIdx, int NumVAddrs,
                                        int NumGradients) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if (I < DimIdx) {
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in
      // 1D, derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
          ((NumGradients / 2) % 2 == 1 &&
           (I == DimIdx + (NumGradients / 2) - 1 ||
            I == DimIdx + NumGradients - 1)) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
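
// For example (illustrative), a 3D a16 sample with s16 coordinates (u, v, r)
// is packed by the function above into:
//   %uv:_(<2 x s16>)   = G_BUILD_VECTOR %u:_(s16), %v:_(s16)
//   %rpad:_(<2 x s16>) = G_BUILD_VECTOR %r:_(s16), %undef:_(s16)
// so every vaddr dword holds two 16-bit components, with odd leftovers and
// half-gradient boundaries padded with undef.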

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Round up to 8 elements for v5-v7.
    // FIXME: Missing intermediate sized register classes and instructions.
    if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}
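
// Illustrative result of the conversion above for three s32 address
// components when NSA is not used:
//   %vaddr:_(<3 x s32>) = G_BUILD_VECTOR %s, %t, %r
// with the now redundant trailing address operands replaced by $noreg.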

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads and stores with 16-bit element data need
/// to be rewritten to use the low half of 32-bit registers, or to directly use
/// a packed layout. 16-bit addresses should also sometimes be packed into
/// 32-bit registers.
///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
  B.setInstr(MI);

  const int NumDefs = MI.getNumExplicitDefs();
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of the first address argument.
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  // Check for 16-bit addresses and pack them if so.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  const bool IsA16 = AddrTy == S16;

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  const int DMaskIdx = BaseOpcode->Atomic ? -1 :
    getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this.
  MI.setDesc(B.getTII().get(NewOpcode));

  // We expect to get an error flag since TFE is on and dmask is 0. Force
  // dmask to be at least 1; otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  int CorrectedNumVAddrs = NumVAddrs;

  // Optimize _L to _LZ when _L is zero.
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
        AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    const ConstantFP *ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to the _lz variant of _l, and change the intrinsic ID.
        ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
          LZMappingInfo->LZ, ImageDimIntr->Dim);

        // The starting indexes should remain in the same place.
        --NumVAddrs;
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
          static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
        MI.RemoveOperand(LodIdx);
      }
    }
  }

  // Optimize _mip away when 'lod' is zero.
  if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    int64_t ConstantLod;
    const int LodIdx = AddrIdx + NumVAddrs - 1;

    if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(LodIdx).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator
  // that MIMG addresses should be placed contiguously when it is possible to
  // do so, so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16) {
    // FIXME: this feature is missing from gfx10. When that is fixed, this check
    // should be introduced.
    if (!ST.hasR128A16() && !ST.hasGFX10A16())
      return false;

    if (NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
                                  NumGradients);

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const int NumPacked = PackedRegs.size();
      for (int I = 0; I != NumVAddrs; ++I) {
        MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I < NumPacked)
          SrcOp.setReg(PackedRegs[I]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else if (!UseNSA && NumVAddrs > 1) {
    convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
  }

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Register RepackedReg = handleD16VData(B, *MRI, VData);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified.
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus the TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }
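
  // Worked example (packed d16 path): for a <3 x s16> result with three dmask
  // lanes enabled, AdjustedTy is <3 x s16>, which rounds up to RoundedTy =
  // <4 x s16> and is loaded as two <2 x s16> registers (RegTy = V2S16). With
  // TFE the same load instead uses TFETy = <3 x s32> (two data dwords plus
  // the error dword) and RegTy = S32.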

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to use s32 elements for the load. We would
    // only need one cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
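  // A <3 x s16> destination cannot be assembled directly from <2 x s16>
  // pieces, so the data is padded out with undef to <6 x s16> and split back
  // (register names illustrative):
  //   %cat:_(<6 x s16>) = G_CONCAT_VECTORS %lo, %hi, %undef
  //   %dst:_(<3 x s16>), %dead:_(<3 x s16>) = G_UNMERGE_VALUES %cat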
  const LLT V3S16 = LLT::vector(3, 16);
  if (Ty == V3S16) {
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  MachineInstr &MI, MachineIRBuilder &B,
  GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);
    B.setInstr(MI);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}
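
// Trap lowering sketch, mirroring the two paths implemented below: without an
// enabled HSA trap handler, llvm.trap simply ends the wave with S_ENDPGM 0;
// with the handler enabled, the queue pointer is copied into SGPR0_SGPR1 and
// S_TRAP is emitted with the LLVM trap ID, per the trap handler ABI documented
// at https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi.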

bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass the queue pointer to the trap handler as an input, and insert a
    // trap instruction.
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    const ArgDescriptor *Arg =
        getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
    if (!Arg)
      return false;
    MachineRegisterInfo &MRI = *B.getMRI();
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, Arg))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.setInstr(MI);

  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert a debug-trap instruction.
    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
  }

  MI.eraseFromParent();
  return true;
}
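
// Schematic of the control-flow intrinsic handling at the top of
// legalizeIntrinsic: amdgcn.if/else/loop are matched together with the
// G_BRCOND that consumes their result, and the branch site is replaced with
// the SI_IF/SI_ELSE/SI_LOOP pseudos operating on wave mask registers, e.g.
// (roughly) a G_BRCOND fed by llvm.amdgcn.if becomes
//   SI_IF %maskdef, %cond, %bb.target
// with the mask operands constrained to the wave mask register class.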

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND uses with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrTarget);

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      B.setInstr(MI);
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}